From d82ea3af02021f5292c5a49acfb92942b82a25e3 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 17 Jan 2020 23:18:48 +0100
Subject: [PATCH 1/8] reduce some include dependencies

---
 rtengine/camconst.cc       |  4 ++--
 rtengine/camconst.h        | 12 +++++++++---
 rtengine/color.h           |  8 +++++++-
 rtengine/curves.cc         |  5 ++---
 rtengine/curves.h          | 12 ++++++++----
 rtengine/diagonalcurves.cc |  2 --
 rtengine/iimage.h          |  8 +++++++-
 rtengine/imagedata.h       |  8 +++++++-
 rtengine/improcfun.h       |  9 ++++++++-
 rtengine/previewimage.h    |  8 +++++++-
 rtengine/rtthumbnail.h     |  9 +++++++--
 rtgui/previewloader.h      |  9 +++++++--
 12 files changed, 71 insertions(+), 23 deletions(-)
diff --git a/rtengine/camconst.cc b/rtengine/camconst.cc
index d136d6a21..c5cfc26fa 100644
--- a/rtengine/camconst.cc
+++ b/rtengine/camconst.cc
@@ -638,7 +638,7 @@ CameraConst::update_globalGreenEquilibration(bool other)
 }
 
 bool
-CameraConstantsStore::parse_camera_constants_file(Glib::ustring filename_)
+CameraConstantsStore::parse_camera_constants_file(const Glib::ustring& filename_)
 {
     // read the file into a single long string
     const char *filename = filename_.c_str();
@@ -809,7 +809,7 @@ CameraConstantsStore::~CameraConstantsStore()
     }
 }
 
-void CameraConstantsStore::init(Glib::ustring baseDir, Glib::ustring userSettingsDir)
+void CameraConstantsStore::init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir)
 {
     parse_camera_constants_file(Glib::build_filename(baseDir, "camconst.json"));
 
diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index 1096e1767..fbc16cb13 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -3,10 +3,16 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 #include <map>
 #include <vector>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
@@ -62,11 +68,11 @@ private:
     std::map<std::string, CameraConst *> mCameraConstants;
 
     CameraConstantsStore();
-    bool parse_camera_constants_file(Glib::ustring filename);
+    bool parse_camera_constants_file(const Glib::ustring& filename);
 
 public:
     ~CameraConstantsStore();
-    void init(Glib::ustring baseDir, Glib::ustring userSettingsDir);
+    void init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir);
     static CameraConstantsStore *getInstance(void);
     CameraConst *get(const char make[], const char model[]);
 };
diff --git a/rtengine/color.h b/rtengine/color.h
index 211615de1..8e0015c42 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <array>
-#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 #include "LUT.h"
@@ -30,6 +29,13 @@
 
 #define SAT(a,b,c) ((float)max(a,b,c)-(float)min(a,b,c))/(float)max(a,b,c)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/curves.cc b/rtengine/curves.cc
index b2105a091..35a38d3b8 100644
--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@@ -21,8 +21,7 @@
 #include <memory>
 #include <cmath>
 #include <cstring>
-#include <glib.h>
-#include <glib/gstdio.h>
+#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 
@@ -2186,7 +2185,7 @@ void PerceptualToneCurve::init()
     }
 }
 
-void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const
+void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, const Glib::ustring &workingSpace) const
 {
 
     // Get the curve's contrast value, and convert to a chroma scaling
diff --git a/rtengine/curves.h b/rtengine/curves.h
index bc8193b76..27f5a8adc 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -22,12 +22,9 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
-
 #include "rt_math.h"
 #include "flatcurvetypes.h"
 #include "diagonalcurvetypes.h"
-#include "pipettebuffer.h"
 #include "noncopyable.h"
 #include "LUT.h"
 #include "sleef.h"
@@ -37,6 +34,13 @@
 
 #define CLIPI(a) ((a)>0?((a)<65534?(a):65534):0)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 using namespace std;
 
 namespace rtengine
@@ -940,7 +944,7 @@ private:
     float calculateToneCurveContrastValue() const;
 public:
     static void init();
-    void initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const;
+    void initApplyState(PerceptualToneCurveState & state, const Glib::ustring& workingSpace) const;
     void BatchApply(const size_t start, const size_t end, float *r, float *g, float *b, const PerceptualToneCurveState &state) const;
 };
 
diff --git a/rtengine/diagonalcurves.cc b/rtengine/diagonalcurves.cc
index bb20b7cc1..e81f2fe92 100644
--- a/rtengine/diagonalcurves.cc
+++ b/rtengine/diagonalcurves.cc
@@ -16,8 +16,6 @@
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
  */
-#include <glib.h>
-#include <glib/gstdio.h>
 #include "curves.h"
 #include <cmath>
 #include <vector>
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index 7309dd91f..c2e9dfd3b 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -20,7 +20,6 @@
 
 #include <vector>
 
-#include <glibmm/ustring.h>
 #include <lcms2.h>
 
 #include "alignedbuffer.h"
@@ -41,6 +40,13 @@
 
 #define CHECK_BOUNDS 0
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/imagedata.h b/rtengine/imagedata.h
index ff8ed4b86..5765d9aec 100644
--- a/rtengine/imagedata.h
+++ b/rtengine/imagedata.h
@@ -23,12 +23,18 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
 
 #include <libiptcdata/iptc-data.h>
 
 #include "imageio.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtexif
 {
 
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 8f4ae7771..6538bd6b1 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -19,11 +19,17 @@
 #pragma once
 
 #include <memory>
+#include <vector>
 
 #include "coord2d.h"
 #include "gamutwarning.h"
-#include "pipettebuffer.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
 template<typename T>
 class LUT;
 
@@ -44,6 +50,7 @@ class FramesMetaData;
 class LensCorrection;
 class NoiseCurve;
 class OpacityCurve;
+class PipetteBuffer;
 class ToneCurve;
 class WavCurve;
 class WavOpacityCurveBY;
diff --git a/rtengine/previewimage.h b/rtengine/previewimage.h
index e6c3ea070..2143509a3 100644
--- a/rtengine/previewimage.h
+++ b/rtengine/previewimage.h
@@ -18,10 +18,16 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 
 #include <cairomm/cairomm.h>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/rtthumbnail.h b/rtengine/rtthumbnail.h
index dcc9596f6..c8d657a62 100644
--- a/rtengine/rtthumbnail.h
+++ b/rtengine/rtthumbnail.h
@@ -18,8 +18,6 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
-
 #include <lcms2.h>
 
 #include "image16.h"
@@ -30,6 +28,13 @@
 
 #include "../rtgui/threadutils.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtgui/previewloader.h b/rtgui/previewloader.h
index 9a74ee2eb..a8032fcaf 100644
--- a/rtgui/previewloader.h
+++ b/rtgui/previewloader.h
@@ -20,10 +20,15 @@
 
 #include <set>
 
-#include <glibmm/ustring.h>
-
 #include "../rtengine/noncopyable.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 class FileBrowserEntry;
 
 class PreviewLoaderListener

From 54d8efc5f62724a1d07b543090f98401c04b5499 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 17 Jan 2020 23:56:17 +0100
Subject: [PATCH 2/8] reduce include dependencies

---
 rtengine/guidedfilter.cc | 1 +
 rtengine/imagesource.h   | 1 -
 rtengine/ipdehaze.cc     | 1 +
 rtengine/rescale.h       | 5 ++++-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index ad3beec51..f83560cfc 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -29,6 +29,7 @@
  * available at https://arxiv.org/abs/1505.00996
 */
 
+#include "array2D.h"
 #include "boxblur.h"
 #include "guidedfilter.h"
 #include "imagefloat.h"
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index e0c26aa9f..2ef2554bc 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -25,7 +25,6 @@
 
 #include "coord2d.h"
 #include "imagedata.h"
-#include "LUT.h"
 #include "rtengine.h"
 
 template<typename T>
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 28a0f2d57..38e35c612 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -32,6 +32,7 @@
 #include <iostream>
 #include <vector>
 
+#include "array2D.h"
 #include "color.h"
 #include "guidedfilter.h"
 #include "iccstore.h"
diff --git a/rtengine/rescale.h b/rtengine/rescale.h
index 70974aa48..2138cd8e8 100644
--- a/rtengine/rescale.h
+++ b/rtengine/rescale.h
@@ -20,9 +20,12 @@
 
 #pragma once
 
-#include "array2D.h"
 #include "rt_math.h"
 
+template<typename T>
+class array2D;
+
+
 namespace rtengine
 {
 

From c3a86befaaa8eab8aebbeddb46f3b92492f6f3d6 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 18 Jan 2020 11:57:31 +0100
Subject: [PATCH 3/8] Fix broken build

---
 rtengine/camconst.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index fbc16cb13..0c0618671 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <map>
+#include <string>
 #include <vector>
 
 namespace Glib

From bf301b7e4099337d1bbd37c8a02dc72876ecaa1f Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 18 Jan 2020 23:46:48 +0100
Subject: [PATCH 4/8] reduce include dependencies

---
 rtengine/curves.h            |    2 -
 rtengine/dcrop.cc            |    1 +
 rtengine/dcrop.h             |    5 +-
 rtengine/improccoordinator.h |    1 -
 rtengine/improcfun.cc        |    3 +-
 rtengine/opthelper.h         |   39 +-
 rtengine/rtengine.h          |    1 -
 rtengine/sleefsseavx.c       | 1483 --------------------------------
 rtengine/sleefsseavx.h       | 1558 +++++++++++++++++++++++++++++++---
 rtengine/utils.cc            |    1 -
 rtgui/cropwindow.cc          |    1 +
 rtgui/curveeditor.h          |    3 +-
 rtgui/guiutils.cc            |    1 -
 13 files changed, 1473 insertions(+), 1626 deletions(-)
 delete mode 100644 rtengine/sleefsseavx.c

diff --git a/rtengine/curves.h b/rtengine/curves.h
index 27f5a8adc..d7c35b619 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -30,8 +30,6 @@
 #include "sleef.h"
 #define CURVES_MIN_POLY_POINTS  1000
 
-#include "rt_math.h"
-
 #define CLIPI(a) ((a)>0?((a)<65534?(a):65534):0)
 
 namespace Glib
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 890003ee0..77392a552 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -24,6 +24,7 @@
 #include "dcrop.h"
 #include "image8.h"
 #include "imagefloat.h"
+#include "improccoordinator.h"
 #include "labimage.h"
 #include "mytime.h"
 #include "procparams.h"
diff --git a/rtengine/dcrop.h b/rtengine/dcrop.h
index c65c1e72f..6667800f9 100644
--- a/rtengine/dcrop.h
+++ b/rtengine/dcrop.h
@@ -18,11 +18,7 @@
  */
 #pragma once
 
-#include "improccoordinator.h"
 #include "rtengine.h"
-#include "improcfun.h"
-#include "imagesource.h"
-#include "procevents.h"
 #include "pipettebuffer.h"
 #include "../rtgui/threadutils.h"
 
@@ -30,6 +26,7 @@ namespace rtengine
 {
 
 class Image8;
+class CieImage;
 
 using namespace procparams;
 
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 96d1f80ce..8b5b37625 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -27,7 +27,6 @@
 #include "imagesource.h"
 #include "improcfun.h"
 #include "LUT.h"
-#include "procevents.h"
 #include "rtengine.h"
 
 #include "../rtgui/threadutils.h"
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index d5c69773e..6749ff305 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -31,15 +31,16 @@
 #include "cieimage.h"
 #include "clutstore.h"
 #include "color.h"
+#include "colortemp.h"
 #include "curves.h"
 #include "dcp.h"
 #include "EdgePreservingDecomposition.h"
 #include "iccmatrices.h"
 #include "iccstore.h"
 #include "imagesource.h"
-#include "improccoordinator.h"
 #include "improcfun.h"
 #include "labimage.h"
+#include "pipettebuffer.h"
 #include "procparams.h"
 #include "rt_math.h"
 #include "rtengine.h"
diff --git a/rtengine/opthelper.h b/rtengine/opthelper.h
index b65ede227..f431c0ec9 100644
--- a/rtengine/opthelper.h
+++ b/rtengine/opthelper.h
@@ -18,27 +18,24 @@
 //  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 //
 ////////////////////////////////////////////////////////////////
+#pragma once
 
-#ifndef OPTHELPER_H
-    #define OPTHELPER_H
+#define pow_F(a,b) (xexpf(b*xlogf(a)))
 
-    #define pow_F(a,b) (xexpf(b*xlogf(a)))
-
-    #ifdef __SSE2__
-        #include "sleefsseavx.c"
-    #endif
-
-    #ifdef __GNUC__
-        #define RESTRICT    __restrict__
-        #define LIKELY(x)   __builtin_expect (!!(x), 1)
-        #define UNLIKELY(x) __builtin_expect (!!(x), 0)
-        #define ALIGNED64 __attribute__ ((aligned (64)))
-        #define ALIGNED16 __attribute__ ((aligned (16)))
-    #else
-        #define RESTRICT
-        #define LIKELY(x)    (x)
-        #define UNLIKELY(x)  (x)
-        #define ALIGNED64
-        #define ALIGNED16
-    #endif
+#ifdef __SSE2__
+    #include "sleefsseavx.h"
+#endif
+
+#ifdef __GNUC__
+    #define RESTRICT    __restrict__
+    #define LIKELY(x)   __builtin_expect (!!(x), 1)
+    #define UNLIKELY(x) __builtin_expect (!!(x), 0)
+    #define ALIGNED64 __attribute__ ((aligned (64)))
+    #define ALIGNED16 __attribute__ ((aligned (16)))
+#else
+    #define RESTRICT
+    #define LIKELY(x)    (x)
+    #define UNLIKELY(x)  (x)
+    #define ALIGNED64
+    #define ALIGNED16
 #endif
diff --git a/rtengine/rtengine.h b/rtengine/rtengine.h
index 0473622c4..ca663c5bc 100644
--- a/rtengine/rtengine.h
+++ b/rtengine/rtengine.h
@@ -31,7 +31,6 @@
 #include "imageformat.h"
 #include "procevents.h"
 #include "rawmetadatalocation.h"
-#include "rt_math.h"
 #include "settings.h"
 
 #include "../rtgui/threadutils.h"
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
deleted file mode 100644
index 0af516f9b..000000000
--- a/rtengine/sleefsseavx.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-////////////////////////////////////////////////////////////////
-//
-//  this code was taken from http://shibatch.sourceforge.net/
-//  Many thanks to the author of original version: Naoki Shibata
-//
-//  This version contains modifications made by Ingo Weyrich
-//
-////////////////////////////////////////////////////////////////
-
-
-#ifndef SLEEFSSEAVX
-#define SLEEFSSEAVX
-
-#include <assert.h>
-#include "rt_math.h"
-#ifdef __SSE2__
-#include "helpersse2.h"
-
-#ifdef ENABLE_AVX
-#include "helperavx.h"
-#endif
-
-#ifdef __GNUC__
-#define INLINE __inline
-#else
-#define INLINE inline
-#endif
-
-#define PI4_A .7853981554508209228515625
-#define PI4_B .794662735614792836713604629039764404296875e-8
-#define PI4_C .306161699786838294306516483068750264552437361480769e-16
-#define M_4_PI 1.273239544735162542821171882678754627704620361328125
-
-#define L2U .69314718055966295651160180568695068359375
-#define L2L .28235290563031577122588448175013436025525412068e-12
-#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
-
-#define PI4_Af 0.78515625f
-#define PI4_Bf 0.00024127960205078125f
-#define PI4_Cf 6.3329935073852539062e-07f
-#define PI4_Df 4.9604681473525147339e-10f
-
-#define L2Uf 0.693145751953125f
-#define L2Lf 1.428606765330187045e-06f
-#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
-
-#define INFINITYf ((float)rtengine::RT_INFINITY)
-#define NANf ((float)rtengine::RT_NAN)
-
-static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
-    return vadd(vadd(v0, v1), v2);
-}
-
-static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vadd3(vadd(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vadd4(vadd(v0, v1), v2, v3, v4);
-}
-
-static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
-    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
-}
-
-static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
-    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
-}
-
-static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
-    return vsub(vsub(v0, v1), v2);
-}
-
-static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vsub3(vsub(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vsub4(vsub(v0, v1), v2, v3, v4);
-}
-
-//
-
-static INLINE vdouble2 normalize_d(vdouble2 t) {
-    vdouble2 s;
-
-    s.x = vadd(t.x, t.y);
-    s.y = vadd(vsub(t.x, s.x), t.y);
-
-    return s;
-}
-
-static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
-    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
-    return r;
-}
-
-static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    r.y = vadd(vsub(x, r.x), y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    vdouble v = vsub(r.x, x);
-    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
-
-    return r;
-}
-
-static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    r.y = vadd3(vsub(x.x, r.x), y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
-    r.y = vadd(r.y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y.x);
-    r.y = vadd3(vsub(x, r.x), y.x, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
-    // |x| >= |y|
-
-    vdouble2 r;
-
-    r.x = vadd(x.x, y.x);
-    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x  = vadd(x.x, y.x);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
-    r.y = vadd(r.y, vadd(x.y, y.y));
-
-    return r;
-}
-
-static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
-    vdouble t = vrec(d.x);
-    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
-    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
-    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
-
-    vdouble2 q;
-
-    q.x = vmul(n.x, t);
-
-    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
-            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
-
-    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
-
-    return q;
-}
-
-static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
-    vdouble xh = vupper(x), xl = vsub(x, xh);
-    vdouble yh = vupper(y), yl = vsub(y, yh);
-    vdouble2 r;
-
-    r.x = vmul(x, y);
-    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y);
-    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y.x);
-    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
-
-    return r;
-}
-
-static INLINE vdouble2 squ_d(vdouble2 x) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, x.x);
-    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
-
-    return r;
-}
-
-static INLINE vdouble2 rec_s(vdouble d) {
-    vdouble t = vrec(d);
-    vdouble dh = vupper(d), dl = vsub(d, dh);
-    vdouble th = vupper(t), tl = vsub(t, th);
-    vdouble2 q;
-
-    q.x = t;
-    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
-
-    return q;
-}
-
-static INLINE vdouble2 sqrt_d(vdouble2 d) {
-    vdouble t = vsqrt(vadd(d.x, d.y));
-    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
-}
-
-//
-
-static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
-
-static INLINE vint xilogb(vdouble d) {
-    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
-    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
-    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
-    return vrint_vi_vd(e);
-}
-
-static INLINE vdouble xsin(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble xcos(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
-    q = vaddi(vaddi(q, q), vcast_vi_i(1));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble2 xsincos(vdouble d) {
-    vint q;
-    vmask m;
-    vdouble u, s, t, rx, ry;
-    vdouble2 r;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vd_vi(q);
-    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
-
-    t = s;
-
-    s = vmul(s, s);
-
-    u = vcast_vd_d(1.58938307283228937328511e-10);
-    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
-    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
-    u = vmul(vmul(u, s), t);
-
-    rx = vadd(t, u);
-
-    u = vcast_vd_d(-1.13615350239097429531523e-11);
-    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
-    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
-    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
-    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
-    u = vmla(u, s, vcast_vd_d(-0.5));
-
-    ry = vadd(vcast_vd_d(1), vmul(s, u));
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
-    r.x = vsel(m, rx, ry);
-    r.y = vsel(m, ry, rx);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
-    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
-
-    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
-    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
-
-    m = vmask_isinf(d);
-    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
-    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vdouble xtan(vdouble d) {
-    vint q;
-    vdouble u, s, x;
-    vmask m;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    u = vcast_vd_vi(q);
-    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(x, x);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
-    x = vsel(m, vneg(x), x);
-
-    u = vcast_vd_d(1.01419718511083373224408e-05);
-    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
-    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
-    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
-    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
-    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
-    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
-    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
-    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
-    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
-    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
-    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
-    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
-    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
-    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
-
-    u = vmla(s, vmul(u, x), x);
-
-    u = vsel(m, vrec(u), u);
-
-    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
-
-    return u;
-}
-
-static INLINE vdouble atan2k(vdouble y, vdouble x) {
-    vdouble s, t, u;
-    vint q;
-    vmask p;
-
-    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
-    x = vabs(x);
-
-    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
-    p = vmask_lt(x, y);
-    s = vsel (p, vneg(x), y);
-    t = vmax (x, y);
-
-    s = vdiv(s, t);
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
-
-    return t;
-}
-
-static INLINE vdouble xatan2(vdouble y, vdouble x) {
-    vdouble r = atan2k(vabs(y), x);
-
-    r = vmulsign(r, x);
-    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
-    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
-    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
-
-    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
-}
-
-static INLINE vdouble xasin(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
-    return vmulsign(x, d);
-}
-
-static INLINE vdouble xacos(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vmulsign(atan2k(x, vabs(d)), d);
-    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
-    x = vadd(x, y);
-    return x;
-}
-
-static INLINE vdouble xatan(vdouble s) {
-    vdouble t, u;
-    vint q;
-
-    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
-    s = vabs(s);
-
-    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
-    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
-
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
-
-    return t;
-}
-
-static INLINE vdouble xlog(vdouble d) {
-    vdouble x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
-    x2 = vmul(x, x);
-
-    t = vcast_vd_d(0.148197055177935105296783);
-    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
-    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
-    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
-    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
-    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
-    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
-    t = vmla(t, x2, vcast_vd_d(2));
-
-    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
-
-    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xexp(vdouble d) {
-    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
-    vdouble s, u;
-
-    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    u = vcast_vd_d(2.08860621107283687536341e-09);
-    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
-    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
-    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
-    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
-    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
-    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
-    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
-    u = vmla(u, s, vcast_vd_d(0.5));
-
-    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
-
-    u = vldexp(u, q);
-
-    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
-
-    return u;
-}
-
-static INLINE vdouble2 logk(vdouble d) {
-    vdouble2 x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble expk(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-    u = vadd(t.x, t.y);
-    u = vldexp(u, q);
-
-    return u;
-}
-
-static INLINE vdouble xpow(vdouble x, vdouble y) {
-#if 1
-    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
-    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
-
-    vdouble result = expk(mul_ds(logk(vabs(x)), y));
-
-    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
-
-    result = vmul(result,
-            vsel(vmask_gt(x, vcast_vd_d(0)),
-                vcast_vd_d(1),
-                vsel(yisint,
-                    vsel(yisodd,
-                        vcast_vd_d(-1),
-                        vcast_vd_d(1)),
-                    vcast_vd_d(rtengine::RT_NAN))));
-
-    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
-
-    result = vsel(vmask_isinf(y),
-            vsel(vmask_lt(efx, vcast_vd_d(0)),
-                vcast_vd_d(0),
-                vsel(vmask_eq(efx, vcast_vd_d(0)),
-                    vcast_vd_d(1.0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
-            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
-                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
-                    vcast_vd_d(0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
-
-    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
-
-    return result;
-#else
-    return expk(mul_ds(logk(x), y));
-#endif
-}
-
-static INLINE vdouble2 expk2(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-
-    return dd(vldexp(t.x, q), vldexp(t.y, q));
-}
-
-static INLINE vdouble xsinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
-    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcosh(vdouble x) {
-    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
-    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xtanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
-    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
-    y = d.x + d.y;
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble2 logk2(vdouble2 d) {
-    vdouble2 x, x2, m;
-    vdouble t;
-    vint e;
-
-    d = normalize_d(d);
-    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
-    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
-
-    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble xasinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
-    y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xacosh(vdouble x) {
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
-    vdouble y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
-    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xatanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
-    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcbrt(vdouble d) {
-    vdouble x, y, q = vcast_vd_d(1.0);
-    vint e, qu, re;
-    vdouble t;
-
-    e = vilogbp1(vabs(d));
-    d = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
-    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
-    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
-
-    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
-    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
-    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
-
-    q = vmulsign(q, d);
-
-    d = vabs(d);
-
-    x = vcast_vd_d(-0.640245898480692909870982);
-    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
-    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
-    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
-    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
-    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
-
-    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
-    y = vmul(vmul(d, x), x);
-    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
-
-    return y;
-}
-
-static INLINE vdouble xexp2(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexp10(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexpm1(vdouble a) {
-    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
-    vdouble x = d.x + d.y;
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
-    return x;
-}
-
-static INLINE vdouble xlog10(vdouble a) {
-    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xlog1p(vdouble a) {
-    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-//
-
-typedef struct {
-    vfloat x, y;
-} vfloat2;
-
-static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
-static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
-
-#ifdef __SSE4_1__
-// only one instruction when using SSE4.1
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return _mm_blendv_ps(y,x,(vfloat)mask);
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return _mm_blendv_epi8(y,x,mask);
-}
-
-#else
-// three instructions when using SSE2
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-#endif
-
-static INLINE vfloat vselfzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vself(mask, x, ZEROV)
-    return _mm_and_ps((vfloat)mask, x);
-}
-static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vself(mask, ZEROV, x)
-    return _mm_andnot_ps((vfloat)mask, x);
-}
-
-static INLINE vint vselizero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vselc(mask, x, ZEROV)
-    return _mm_and_si128(mask, x);
-}
-static INLINE vint vselinotzero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vselc(mask, ZEROV, x)
-    return _mm_andnot_si128(mask, x);
-}
-
-static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
-    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
-    return vori2(vandi2(m2, x), vandnoti2(m2, y));
-}
-
-static INLINE vmask vsignbitf(vfloat f) {
-    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
-}
-
-static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
-    return (vfloat)vxorm((vmask)x, vsignbitf(y));
-}
-
-static INLINE vfloat vsignf(vfloat f) {
-    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
-}
-
-static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
-static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
-// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
-static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
-static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
-static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
-
-static INLINE vint2 vilogbp1f(vfloat d) {
-    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
-    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
-    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
-    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
-    return q;
-}
-
-static INLINE vfloat vldexpf(vfloat x, vint2 q) {
-    vfloat u;
-    vint2 m = vsrai2(q, 31);
-    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
-    q = vsubi2(q, vslli2(m, 2));
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
-    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
-    return vmulf(x, u);
-}
-
-static INLINE vfloat xsinf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat xcosf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
-    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat2 xsincosf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, t, rx, ry;
-    vfloat2 r;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vf_vi2(q);
-    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
-
-    t = s;
-
-    s = vmulf(s, s);
-
-    u = vcast_vf_f(-0.000195169282960705459117889f);
-    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
-    u = vmulf(vmulf(u, s), t);
-
-    rx = vaddf(t, u);
-
-    u = vcast_vf_f(-2.71811842367242206819355e-07f);
-    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
-    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
-    u = vmlaf(u, s, vcast_vf_f(-0.5));
-
-    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
-    r.x = vself(m, rx, ry);
-    r.y = vself(m, ry, rx);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
-
-    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
-
-    m = vmaskf_isinf(d);
-    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
-    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vfloat xtanf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, x;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
-
-    x = d;
-
-    u = vcast_vf_vi2(q);
-    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
-
-    s = vmulf(x, x);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
-    x = vself(m, vnegf(x), x);
-
-    u = vcast_vf_f(0.00927245803177356719970703f);
-    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
-    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
-    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
-    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
-    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
-
-    u = vmlaf(s, vmulf(u, x), x);
-
-    u = vself(m, vrecf(u), u);
-
-    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
-
-    return u;
-}
-
-static INLINE vfloat xatanf(vfloat s) {
-    vfloat t, u;
-    vint2 q;
-
-    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
-    s = vabsf(s);
-
-    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
-    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
-
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
-
-    return t;
-}
-
-static INLINE vfloat atan2kf(vfloat y, vfloat x) {
-    vfloat s, t, u;
-    vint2 q;
-    vmask p;
-
-    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
-    x = vabsf(x);
-
-    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
-    p = vmaskf_lt(x, y);
-    s = vself(p, vnegf(x), y);
-    t = vmaxf(x, y);
-
-    s = vdivf(s, t);
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
-
-    return t;
-}
-
-static INLINE vfloat xatan2f(vfloat y, vfloat x) {
-    vfloat r = atan2kf(vabsf(y), x);
-
-    r = vmulsignf(r, x);
-    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
-    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
-    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
-
-    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
-}
-
-static INLINE vfloat xasinf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
-    return vmulsignf(x, d);
-}
-
-static INLINE vfloat xacosf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vmulsignf(atan2kf(x, vabsf(d)), d);
-    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
-    x = vaddf(x, y);
-    return x;
-}
-
-static INLINE vfloat xlogf(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf0(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-}
-
-static INLINE vfloat xexpf(vfloat d) {
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    u = vldexpf(u, q);
-
-    // -104.0
-    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
-}
-
-static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    return vldexpf(u, q);
-}
-
-static INLINE vfloat xcbrtf(vfloat d) {
-    vfloat x, y, q = vcast_vf_f(1.0), t;
-    vint2 e, qu, re;
-
-    e = vilogbp1f(vabsf(d));
-    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
-    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
-    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
-
-    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
-    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
-    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
-
-    q = vmulsignf(q, d);
-    d = vabsf(d);
-
-    x = vcast_vf_f(-0.601564466953277587890625f);
-    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
-    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
-    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
-    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
-    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
-
-    y = vmulf(vmulf(d, x), x);
-    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
-
-    return y;
-}
-
-static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
-    // clamps value in [low;high], returns low if value is NaN
-    return vmaxf(vminf(high, value), low);
-}
-
-static INLINE vfloat SQRV(vfloat a){
-    return a * a;
-}
-
-static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
-    // conditional swap the elements of two vfloats
-    vfloat temp = vself(condition, a, b); // the values which fit to condition
-    a = vself(condition, b, a); // the values which fit to inverted condition
-    b = temp;
-}
-
-static inline float vhadd( vfloat a ) {
-    // returns a[0] + a[1] + a[2] + a[3]
-    a += _mm_movehl_ps(a, a);
-    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmin(vfloat a) {
-    // returns min(a[0], a[1], a[2], a[3])
-    a = vminf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmax(vfloat a) {
-    // returns max(a[0], a[1], a[2], a[3])
-    a = vmaxf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static INLINE vfloat vmul2f(vfloat a){
-    // fastest way to multiply by 2
-    return a + a;
-}
-
-static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
-    // calculate a * b + (1 - a) * c (interpolate two values)
-    // following is valid:
-    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
-    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
-    return a * (b-c) + c;
-}
-
-static INLINE vfloat vdup(vfloat a){
-    // returns { a[0],a[0],a[1],a[1] }
-    return _mm_unpacklo_ps( a, a );
-}
-
-static INLINE vfloat vaddc2vfu(float &a)
-{
-    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
-    vfloat a1 = _mm_loadu_ps( &a );
-    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
-    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
-}
-
-static INLINE vfloat vadivapb (vfloat a, vfloat b) {
-    return a / (a+b);
-}
-
-static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
-    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
-    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
-    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
-    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
-}
-
-#if defined( __SSE4_1__ ) && defined( __x86_64__ )
-static INLINE vfloat vceilf(vfloat x) {
-    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
-}
-
-#else
-
-static INLINE vfloat vceilf(vfloat x) {
-    __m128i zerov = _mm_setzero_si128();
-    zerov = _mm_cmpeq_epi32(zerov, zerov);
-    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
-    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
-    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
-}
-#endif
-
-#endif // __SSE2__
-#endif // SLEEFSSEAVX
diff --git a/rtengine/sleefsseavx.h b/rtengine/sleefsseavx.h
index 8fe20c54b..c10f4a0c2 100644
--- a/rtengine/sleefsseavx.h
+++ b/rtengine/sleefsseavx.h
@@ -1,124 +1,897 @@
-#include <immintrin.h>
-#include <stdint.h>
+////////////////////////////////////////////////////////////////
+//
+//  This code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of the original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+#pragma once
 
+#include "rt_math.h"
 #ifdef __SSE2__
-#define VECTLENDP 2
-#define VECTLENSP 4
-
-typedef __m128d vdouble;
-typedef __m128i vint;
-
-typedef __m128 vfloat;
-typedef __m128i vint2;
-typedef __m128i vmask;
-
-static vdouble vloadu(double *p)
-{
-    return _mm_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    _mm_storeu_pd(p, v);
-}
-
-static vfloat vloaduf(float *p)
-{
-    return _mm_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    _mm_storeu_ps(p, v);
-}
-
-static vint2 vloadui2(int32_t *p)
-{
-    return (vint2)_mm_loadu_si128((__m128i *)p);
-}
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *)p, (__m128i)v);
-}
-#endif
+#include "helpersse2.h"
 
 #ifdef ENABLE_AVX
-#define VECTLENDP 4
-#define VECTLENSP 8
+#include "helperavx.h"
+#endif
 
-typedef __m256d vdouble;
-typedef __m128i vint;
+#ifdef __GNUC__ // pick the inline keyword spelling per compiler family
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
 
+#define PI4_A .7853981554508209228515625 // PI4_A + PI4_B + PI4_C together give pi/4 split into three parts
+#define PI4_B .794662735614792836713604629039764404296875e-8
+#define PI4_C .306161699786838294306516483068750264552437361480769e-16
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125 // 4/pi
 
-typedef __m256 vfloat;
-typedef struct {
-    vint x, y;
-} vint2;
+#define L2U .69314718055966295651160180568695068359375 // L2U + L2L together give ln(2) split into an upper and a lower part
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 // 1/ln(2)
 
-static vdouble vloadu(double *p)
-{
-    return _mm256_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    return _mm256_storeu_pd(p, v);
+#define PI4_Af 0.78515625f // single-precision constants: PI4_Af..PI4_Df sum to pi/4
+#define PI4_Bf 0.00024127960205078125f
+#define PI4_Cf 6.3329935073852539062e-07f
+#define PI4_Df 4.9604681473525147339e-10f
+
+#define L2Uf 0.693145751953125f // L2Uf + L2Lf sum to ln(2)
+#define L2Lf 1.428606765330187045e-06f
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f // 1/ln(2)
+
+#define INFINITYf ((float)rtengine::RT_INFINITY) // float versions of the rtengine infinity / NaN constants
+#define NANf ((float)rtengine::RT_NAN)
+
+static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) { // (v0 + v1) + v2, summed left to right
+    return vadd(vadd(v0, v1), v2);
 }
 
-static vfloat vloaduf(float *p)
-{
-    return _mm256_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    return _mm256_storeu_ps(p, v);
+static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { // ((v0 + v1) + v2) + v3
+    return vadd3(vadd(v0, v1), v2, v3);
 }
 
-static vint2 vloadui2(int32_t *p)
-{
-    vint2 r;
-    r.x = _mm_loadu_si128((__m128i *) p     );
-    r.y = _mm_loadu_si128((__m128i *)(p + 4));
+static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { // sum of five terms, accumulated left to right
+    return vadd4(vadd(v0, v1), v2, v3, v4);
+}
+
+static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { // sum of six terms, accumulated left to right
+    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
+}
+
+static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) { // sum of seven terms, accumulated left to right
+    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
+}
+
+static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) { // (v0 - v1) - v2
+    return vsub(vsub(v0, v1), v2);
+}
+
+static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { // ((v0 - v1) - v2) - v3
+    return vsub3(vsub(v0, v1), v2, v3);
+}
+
+static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { // v0 minus the four following terms, subtracted left to right
+    return vsub4(vsub(v0, v1), v2, v3, v4);
+}
+
+// Double-double helpers: a vdouble2 represents the value x + y
+
+static INLINE vdouble2 normalize_d(vdouble2 t) { // re-splits t so that s.x holds the rounded sum t.x + t.y
+    vdouble2 s;
+
+    s.x = vadd(t.x, t.y);
+    s.y = vadd(vsub(t.x, s.x), t.y); // what the rounded sum in s.x left over
+
+    return s;
+}
+
+static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) { // multiplies both parts of d by s
+    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
     return r;
 }
 
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *) p     , v.x);
-    _mm_storeu_si128((__m128i *)(p + 4), v.y);
+static INLINE vdouble2 add_ss(vdouble x, vdouble y) { // x + y as a pair: rounded sum plus an error term
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    r.y = vadd(vsub(x, r.x), y); // NOTE(review): this short form presumably needs |x| >= |y| (cf. the comment in add_dd) - confirm callers
+
+    return r;
 }
-#endif
 
-typedef struct {
+static INLINE vdouble2 add2_ss(vdouble x, vdouble y) { // x + y as a pair, using the longer error formula
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    vdouble v = vsub(r.x, x); // the part of the rounded sum attributed to y
+    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v)); // error contributed by x plus error contributed by y
+
+    return r;
+}
+
+static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) { // pair x plus scalar y, short error formula
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    r.y = vadd3(vsub(x.x, r.x), y, x.y); // leftover of the head sum, plus the low part of x
+
+    return r;
+}
+
+static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) { // pair x plus scalar y, longer error formula (same shape as add2_ss)
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
+    r.y = vadd(r.y, x.y); // fold in the low part of x
+
+    return r;
+}
+
+static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) { // scalar x plus pair y, short error formula
+    vdouble2 r;
+
+    r.x = vadd(x, y.x);
+    r.y = vadd3(vsub(x, r.x), y.x, y.y); // leftover of the head sum, plus the low part of y
+
+    return r;
+}
+
+static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) { // pair x plus pair y, short error formula
+    // |x| >= |y|
+
+    vdouble2 r;
+
+    r.x = vadd(x.x, y.x);
+    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y); // leftover of the head sum, plus both low parts
+
+    return r;
+}
+
+static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) { // pair x plus pair y, longer error formula (same shape as add2_ss)
+    vdouble2 r;
+
+    r.x  = vadd(x.x, y.x);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
+    r.y = vadd(r.y, vadd(x.y, y.y)); // fold in both low parts
+
+    return r;
+}
+
+static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) { // pair quotient n / d
+    vdouble t = vrec(d.x); // reciprocal of the head of d
+    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh); // each operand is split into vupper() part and remainder
+    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
+    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
+
+    vdouble2 q;
+
+    q.x = vmul(n.x, t); // head of the quotient
+
+    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
+            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
+
+    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u); // low part: contribution of n.y and d.y plus the correction u
+
+    return q;
+}
+
+static INLINE vdouble2 mul_ss(vdouble x, vdouble y) { // product of two scalars as a pair
+    vdouble xh = vupper(x), xl = vsub(x, xh); // split each factor into vupper() part and remainder
+    vdouble yh = vupper(y), yl = vsub(y, yh);
+    vdouble2 r;
+
+    r.x = vmul(x, y);
+    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl)); // partial products minus the rounded product
+
+    return r;
+}
+
+static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) { // pair x times scalar y
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y);
+    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y)); // as mul_ss, plus the x.y * y term
+
+    return r;
+}
+
+static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) { // pair x times pair y (the x.y * y.y term is not included)
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y.x);
+    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x)); // as mul_ss, plus both cross terms
+
+    return r;
+}
+
+static INLINE vdouble2 squ_d(vdouble2 x) { // square of a pair (the x.y * x.y term is not included)
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, x.x);
+    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y))); // xh^2 - r.x + 2*xh*xl + xl^2 + 2*x.x*x.y
+
+    return r;
+}
+
+static INLINE vdouble2 rec_s(vdouble d) { // reciprocal of a scalar as a pair
+    vdouble t = vrec(d);
+    vdouble dh = vupper(d), dl = vsub(d, dh);
+    vdouble th = vupper(t), tl = vsub(t, th);
+    vdouble2 q;
+
+    q.x = t;
+    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))); // t * (1 - d*t), with d*t expanded from the split parts
+
+    return q;
+}
+
+static INLINE vdouble2 sqrt_d(vdouble2 d) { // square root of a pair
+    vdouble t = vsqrt(vadd(d.x, d.y)); // plain-precision estimate
+    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5)); // (d + t*t) * (1/t) * 0.5
+}
+
+// Elementary functions on vdouble vectors
+
+static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); } // thin forwarder to vldexp
+
+static INLINE vint xilogb(vdouble d) { // vilogbp1(|d|) - 1, with fixed results for zero and infinity
+    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
+    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e); // d == 0 -> -2147483648
+    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e); // |d| == inf -> 2147483647
+    return vrint_vi_vd(e);
+}
+
+static INLINE vdouble xsin(vdouble d) { // sine: reduce by multiples of pi, then evaluate a polynomial in d*d
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI))); // q = round(d / pi)
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4))); // subtract q * pi in three steps (PI4_A..C are the parts of pi/4)
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d); // odd q: flip the sign of the reduced argument
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble xcos(vdouble d) { // cosine: reduce by an odd multiple of pi/2, then use the same polynomial as xsin
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
+    q = vaddi(vaddi(q, q), vcast_vi_i(1)); // q = 2*q + 1, always odd
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2))); // subtract q * pi/2 in three steps
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d); // flip the sign when bit 1 of q is clear
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble2 xsincos(vdouble d) { // returns the sine in .x and the cosine in .y; both are NaN for infinite d
+    vint q;
+    vmask m;
+    vdouble u, s, t, rx, ry;
+    vdouble2 r;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI))); // q = round(d * RT_2_PI)
+
+    s = d;
+
+    u = vcast_vd_vi(q);
+    s = vmla(u, vcast_vd_d(-PI4_A*2), s); // subtract q * pi/2 in three steps
+    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
+    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
+
+    t = s;
+
+    s = vmul(s, s);
+
+    u = vcast_vd_d(1.58938307283228937328511e-10); // sine polynomial in s
+    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
+    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
+    u = vmul(vmul(u, s), t);
+
+    rx = vadd(t, u);
+
+    u = vcast_vd_d(-1.13615350239097429531523e-11); // cosine polynomial in s
+    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
+    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
+    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
+    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
+    u = vmla(u, s, vcast_vd_d(-0.5));
+
+    ry = vadd(vcast_vd_d(1), vmul(s, u));
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0)); // even q keeps (rx, ry); odd q swaps them
+    r.x = vsel(m, rx, ry);
+    r.y = vsel(m, ry, rx);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
+    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); // flip the sign bit of r.x where m is set
+
+    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
+    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); // same for r.y, keyed on q + 1
+
+    m = vmask_isinf(d);
+    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
+    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vdouble xtan(vdouble d) { // tangent; returns NaN for infinite d
+    vint q;
+    vdouble u, s, x;
+    vmask m;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI))); // q = round(d * RT_2_PI)
+
+    u = vcast_vd_vi(q);
+    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2))); // subtract q * pi/2 in three steps
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(x, x);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)); // odd q: negate here and take the reciprocal below
+    x = vsel(m, vneg(x), x);
+
+    u = vcast_vd_d(1.01419718511083373224408e-05);
+    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
+    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
+    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
+    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
+    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
+    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
+    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
+    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
+    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
+    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
+    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
+    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
+    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
+    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
+
+    u = vmla(s, vmul(u, x), x);
+
+    u = vsel(m, vrec(u), u);
+
+    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
+
+    return u;
+}
+
+static INLINE vdouble atan2k(vdouble y, vdouble x) { // atan2 kernel without special-case handling; xatan2 calls it with |y|
+    vdouble s, t, u;
+    vint q;
+    vmask p;
+
+    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0)); // q counts the multiples of pi/2 added at the end
+    x = vabs(x);
+
+    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
+    p = vmask_lt(x, y);
+    s = vsel (p, vneg(x), y); // where x < y the ratio becomes -x / y, otherwise y / x
+    t = vmax (x, y);
+
+    s = vdiv(s, t);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2))); // add q * pi/2
+
+    return t;
+}
+
+static INLINE vdouble xatan2(vdouble y, vdouble x) { // atan2(y, x): kernel on |y|, then overrides for special inputs
+    vdouble r = atan2k(vabs(y), x);
+
+    r = vmulsign(r, x);
+    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r); // x infinite or zero
+    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r); // y infinite
+    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r); // y == 0: pi when vsign(x) is -1, else 0
+
+    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y)); // NaN in, NaN out; otherwise apply the sign of y
+}
+
+static INLINE vdouble xasin(vdouble d) { // arcsine via atan2k(|d|, sqrt((1 + d) * (1 - d)))
     vdouble x, y;
-} vdouble2;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x)); // NaN square root gives a NaN result
+    return vmulsign(x, d); // apply the sign of d
+}
 
-vdouble xldexp(vdouble x, vint q);
-vint xilogb(vdouble d);
+static INLINE vdouble xacos(vdouble d) { // arccosine via atan2k(sqrt((1 + d) * (1 - d)), |d|), plus pi for negative d
+    vdouble x, y;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vmulsign(atan2k(x, vabs(d)), d);
+    y = vreinterpret_vd_vm(vandm(vmask_lt(d, vcast_vd_d(0)), vreinterpret_vm_vd(vcast_vd_d(rtengine::RT_PI)))); // pi where d < 0, else 0; uses the vreinterpret helpers (as xsincos does) instead of C-style vector casts
+    x = vadd(x, y);
+    return x;
+}
 
-vdouble xsin(vdouble d);
-vdouble xcos(vdouble d);
-vdouble2 xsincos(vdouble d);
-vdouble xtan(vdouble d);
-vdouble xasin(vdouble s);
-vdouble xacos(vdouble s);
-vdouble xatan(vdouble s);
-vdouble xatan2(vdouble y, vdouble x);
-vdouble xlog(vdouble d);
-vdouble xexp(vdouble d);
-vdouble xpow(vdouble x, vdouble y);
+static INLINE vdouble xatan(vdouble s) { // arctangent: fold to [0, 1], evaluate the polynomial, then undo the folding
+    vdouble t, u;
+    vint q;
 
-vdouble xsinh(vdouble d);
-vdouble xcosh(vdouble d);
-vdouble xtanh(vdouble d);
-vdouble xasinh(vdouble s);
-vdouble xacosh(vdouble s);
-vdouble xatanh(vdouble s);
+    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0)); // bit 1 of q marks a negative input
+    s = vabs(s);
 
-vdouble xcbrt(vdouble d);
+    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q); // bit 0 of q marks |s| > 1
+    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s); // use 1/s when |s| > 1
 
-vdouble xexp2(vdouble a);
-vdouble xexp10(vdouble a);
-vdouble xexpm1(vdouble a);
-vdouble xlog10(vdouble a);
-vdouble xlog1p(vdouble a);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t); // reciprocal was used: pi/2 - t
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t); // input was negative: negate
+
+    return t;
+}
+
+static INLINE vdouble xlog(vdouble d) { // natural log; +inf -> +inf, d < 0 -> NaN, d == 0 -> -inf
+    vdouble x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071))); // exponent taken from d * 0.7071
+    m = vldexp(d, vsubi(vcast_vi_i(0), e)); // m = d scaled by 2^-e
+
+    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m)); // x = (m - 1) / (m + 1)
+    x2 = vmul(x, x);
+
+    t = vcast_vd_d(0.148197055177935105296783);
+    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
+    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
+    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
+    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
+    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
+    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
+    t = vmla(t, x2, vcast_vd_d(2));
+
+    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e))); // x * t + ln(2) * e
+
+    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xexp(vdouble d) { // e^d; returns 0 for d == -inf
+    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2))); // q = round(d / ln(2))
+    vdouble s, u;
+
+    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U))); // s = d - q * ln(2), subtracted in two parts
+    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    u = vcast_vd_d(2.08860621107283687536341e-09);
+    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
+    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
+    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
+    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
+    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
+    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
+    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
+    u = vmla(u, s, vcast_vd_d(0.5));
+
+    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u))); // 1 + s + s*s*u
+
+    u = vldexp(u, q); // scale by 2^q
+
+    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
+
+    return u;
+}
+
+static INLINE vdouble2 logk(vdouble d) { // log kernel returning a pair; no special-case handling here
+    vdouble2 x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+    m = vldexp(d, vsubi(vcast_vi_i(0), e)); // m = d scaled by 2^-e
+
+    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m)); // x = (m - 1) / (m + 1) as a pair
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), // ln(2) as a pair, times e
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t))); // plus 2*x + x^3 * t
+}
+
+static INLINE vdouble expk(vdouble2 d) { // exp kernel: pair argument, plain vdouble result
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U))); // s = d - q * ln(2), subtracted in two parts
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49))); // q is recomputed from u clamped to +-2047.49 for the final vldexp
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u)); // s + s^2 * u
+
+    t = add_sd(vcast_vd_d(1), t); // 1 + s + s^2 * u
+    u = vadd(t.x, t.y); // collapse the pair to one vdouble
+    u = vldexp(u, q);
+
+    return u;
+}
+
+static INLINE vdouble xpow(vdouble x, vdouble y) { // x^y as expk(y * logk(|x|)), followed by sign and special-case fix-ups
+#if 1
+    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y); // y survives a round trip through vint
+    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
+
+    vdouble result = expk(mul_ds(logk(vabs(x)), y));
+
+    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
+
+    result = vmul(result, // factor is 1 for x > 0; otherwise -1 (odd integer y), 1 (even integer y) or NaN (non-integer y)
+            vsel(vmask_gt(x, vcast_vd_d(0)),
+                vcast_vd_d(1),
+                vsel(yisint,
+                    vsel(yisodd,
+                        vcast_vd_d(-1),
+                        vcast_vd_d(1)),
+                    vcast_vd_d(rtengine::RT_NAN))));
+
+    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y))); // |x| - 1 with the sign bit of y xor-ed in
+
+    result = vsel(vmask_isinf(y), // infinite y: 0, 1 or +inf depending on efx
+            vsel(vmask_lt(efx, vcast_vd_d(0)),
+                vcast_vd_d(0),
+                vsel(vmask_eq(efx, vcast_vd_d(0)),
+                    vcast_vd_d(1.0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), // x infinite or zero: 0 or +inf, signed by x when y is an odd integer
+            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
+                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
+                    vcast_vd_d(0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
+
+    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result); // y == 0 or x == 1 gives 1; applied last, so it also overrides the NaN rule
+
+    return result;
+#else
+    return expk(mul_ds(logk(x), y));
+#endif
+}
+
+static INLINE vdouble2 expk2(vdouble2 d) { // same computation as expk, but the result is returned as a pair
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U))); // s = d - q * ln(2), subtracted in two parts
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49))); // q is recomputed from u clamped to +-2047.49 for the final vldexp
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u));
+
+    t = add_sd(vcast_vd_d(1), t);
+
+    return dd(vldexp(t.x, q), vldexp(t.y, q)); // both parts are scaled by 2^q
+}
+
+static INLINE vdouble xsinh(vdouble x) { // sinh(x) = (e^|x| - 1/e^|x|) / 2, with the sign of x applied afterwards
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
+    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y); // infinite x or NaN intermediate -> +inf
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcosh(vdouble x) { // cosh(x) = (e^x + 1/e^x) / 2
+    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
+    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y); // infinite x or NaN intermediate -> +inf
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xtanh(vdouble x) { // tanh(x) = (d - e) / (d + e) with d = e^|x| and e = 1/d; sign of x applied afterwards
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
+    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
+    y = vadd(d.x, d.y); // vadd rather than operator+, matching xsinh/xcosh and avoiding compiler vector extensions
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y); // infinite x or NaN intermediate -> 1
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble2 logk2(vdouble2 d) { // log kernel for a pair argument; same polynomial as logk
+    vdouble2 x, x2, m;
+    vdouble t;
+    vint e;
+
+    d = normalize_d(d);
+    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
+    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e))); // m = d scaled by 2^-e
+
+    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1))); // x = (m - 1) / (m + 1) as a pair
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), // ln(2) as a pair, times e
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t))); // plus 2*x + x^3 * t
+}
+
+static INLINE vdouble xasinh(vdouble x) { // asinh(x) = log(sqrt(y*y + 1) + y) with y = |x|; sign of x applied afterwards
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
+    y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y); // infinite x or NaN intermediate -> +inf
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xacosh(vdouble x) { // acosh(x) = log(sqrt(x*x - 1) + x); x == 1 -> 0, x < 1 -> NaN
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
+    vdouble y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y); // later lines override this for x < 1 and NaN x
+    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
+    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xatanh(vdouble x) { // atanh(x) = 0.5 * log((1 + y) / (1 - y)) with y = |x|; sign of x applied afterwards
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), vneg(y)))); // vneg rather than unary minus, matching the rest of the file and avoiding compiler vector extensions
+    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5)))); // |x| > 1 -> NaN, |x| == 1 -> inf
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcbrt(vdouble d) { // cube root: strip the exponent, approximate on the scaled value, then rescale by q
+    vdouble x, y, q = vcast_vd_d(1.0);
+    vint e, qu, re;
+    vdouble t;
+
+    e = vilogbp1(vabs(d));
+    d = vldexp(d, vsubi(vcast_vi_i(0), e)); // d scaled by 2^-e
+
+    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144)); // bias e (6144 = 3 * 2048) before dividing by 3
+    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3))); // quotient of the biased exponent by 3
+    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3)))); // remainder: 0, 1 or 2
+
+    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q); // remainder 1: factor 2^(1/3)
+    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q); // remainder 2: factor 2^(2/3)
+    q = vldexp(q, vsubi(qu, vcast_vi_i(2048))); // remove the bias again
+
+    q = vmulsign(q, d); // the sign of d is carried by q
+
+    d = vabs(d);
+
+    x = vcast_vd_d(-0.640245898480692909870982);
+    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
+    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
+    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
+    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
+    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
+
+    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0))); // refinement step on x using y = x^4
+    y = vmul(vmul(d, x), x);
+    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
+
+    return y;
+}
+
+static INLINE vdouble xexp2(vdouble a) { // 2^a as expk(ln(2) * a); +inf -> +inf, -inf -> 0
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a)); // ln(2) given as a pair
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexp10(vdouble a) { // 10^a as expk(ln(10) * a); +inf -> +inf, -inf -> 0
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a)); // ln(10) given as a pair
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexpm1(vdouble a) { // e^a - 1, computed in pair precision; +inf -> +inf, -inf -> -1
+    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
+    vdouble x = vadd(d.x, d.y); // vadd rather than operator+, matching the rest of the file and avoiding compiler vector extensions
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
+    return x;
+}
+
+static INLINE vdouble xlog10(vdouble a) { // log10(a) = logk(a) * (1 / ln(10)); +inf -> +inf, a < 0 -> NaN, a == 0 -> -inf
+    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17))); // 1/ln(10) given as a pair
+    vdouble x = vadd(d.x, d.y); // vadd rather than operator+, matching the rest of the file and avoiding compiler vector extensions
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xlog1p(vdouble a) { // log(1 + a) via logk2 on the pair a + 1; +inf -> +inf, a < -1 -> NaN, a == -1 -> -inf
+    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
+    vdouble x = vadd(d.x, d.y); // vadd rather than operator+, matching the rest of the file and avoiding compiler vector extensions
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
 
 //
 
@@ -126,15 +899,580 @@ typedef struct {
     vfloat x, y;
 } vfloat2;
 
-vfloat xsinf(vfloat d);
-vfloat xcosf(vfloat d);
-vfloat2 xsincosf(vfloat d);
-vfloat xtanf(vfloat d);
-vfloat xasinf(vfloat s);
-vfloat xacosf(vfloat s);
-vfloat xatanf(vfloat s);
-vfloat xatan2f(vfloat y, vfloat x);
-vfloat xlogf(vfloat d);
-vfloat xlogf0(vfloat d);
-vfloat xexpf(vfloat d);
-vfloat xcbrtf(vfloat s);
+static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); } // |f|: and-not with the bit pattern of -0.0f (sign bit only)
+static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); } // -f: XOR with the bit pattern of -0.0f flips the sign bit
+
+#ifdef __SSE4_1__
+// SSE4.1: each select is a single blend instruction
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { // per lane: x where mask is set, else y
+    return _mm_blendv_ps(y,x,(vfloat)mask); // blendv_ps tests only the top bit of each 32-bit mask lane
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) { // integer counterpart of vself
+    return _mm_blendv_epi8(y,x,mask); // blendv_epi8 tests the top bit of each mask byte
+}
+
+#else
+// SSE2 fallback: three bitwise instructions, combining x and y bit by bit under mask
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { // per lane: x where mask is set, else y
+    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); // matches the blend variant when every mask lane is all-ones or all-zeros
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) { // integer counterpart of vself
+    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+#endif
+
+static INLINE vfloat vselfzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vself(mask, x, ZEROV)
+    return _mm_and_ps((vfloat)mask, x); // plain bitwise AND of mask and x
+}
+static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vself(mask, ZEROV, x)
+    return _mm_andnot_ps((vfloat)mask, x); // (~mask) & x
+}
+
+static INLINE vint vselizero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vselc(mask, x, ZEROV)
+    return _mm_and_si128(mask, x); // plain bitwise AND of mask and x
+}
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vselc(mask, ZEROV, x)
+    return _mm_andnot_si128(mask, x); // (~mask) & x
+}
+
+static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) { // per lane: x where f0 < f1, else y
+    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1)); // comparison mask reinterpreted as an integer vector
+    return vori2(vandi2(m2, x), vandnoti2(m2, y)); // bitwise select driven by m2
+}
+
+static INLINE vmask vsignbitf(vfloat f) { // isolates the sign bit of every lane
+    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f)); // -0.0f has only the sign bit set
+}
+
+static INLINE vfloat vmulsignf(vfloat x, vfloat y) { // x with its sign flipped in lanes where y has the sign bit set
+    return (vfloat)vxorm((vmask)x, vsignbitf(y));
+}
+
+static INLINE vfloat vsignf(vfloat f) { // 1.0f carrying the sign bit of f (so -0.0f yields -1.0f)
+    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
+}
+
+static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); } // lanes where d is +inf or -inf
+static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); } // lanes where d is +inf
+static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); } // lanes where d is -inf
+static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); } // NaN is the only value that compares unequal to itself
+// two-argument overload: equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but done with one unordered compare
+static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
+static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); } // where d is +-inf: bits of m OR'ed with the sign bit of d; elsewhere all-zero bits
+static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); } // +1.0f for +inf, -1.0f for -inf, 0 otherwise
+
+static INLINE vint2 vilogbp1f(vfloat d) { // binary exponent of d plus one, read from the float's exponent field
+    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f)); // lanes below 2^-64
+    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d); // scale those lanes up by 2^64
+    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff)); // 8-bit biased exponent (bits 23..30)
+    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e))); // remove bias - 1 (0x7e), plus the extra 64 where the lane was scaled
+    return q;
+}
+
+static INLINE vfloat vldexpf(vfloat x, vint2 q) { // x * 2^q, applied as five multiplications by powers of two
+    vfloat u;
+    vint2 m = vsrai2(q, 31); // -1 where q is negative, 0 otherwise
+    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4); // coarse part of the exponent, a multiple of 16
+    q = vsubi2(q, vslli2(m, 2)); // remainder after removing 4 * m
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23))); // 2^m built by writing m + 127 into the exponent field
+    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u); // x * 2^(4 * m)
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23))); // 2^(remaining q)
+    return vmulf(x, u);
+}
+
+static INLINE vfloat xsinf(vfloat d) { // single-precision sine: range reduction by multiples of pi, then an odd polynomial
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI))); // q = round(d * RT_1_PI)
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d); // d -= q * (4 * PI4_*f), in four steps; presumably PI4_Af..PI4_Df are the parts of pi/4
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
+
+    s = vmulf(d, d); // s = d^2, the polynomial variable
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d); // odd q: negate the reduced argument
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f); // Horner evaluation in s
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d); // d + s * (u * d)
+
+    return u;
+}
+
+static INLINE vfloat xcosf(vfloat d) { // single-precision cosine: reduction by an odd multiple of half the xsinf step, same polynomial as xsinf
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f))); // round(d * RT_1_PI - 0.5)
+    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1)); // q = 2 * q + 1, always odd
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d); // d -= q * (2 * PI4_*f), in four steps; presumably PI4_Af..PI4_Df are the parts of pi/4
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
+
+    s = vmulf(d, d); // s = d^2, the polynomial variable
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d)); // keep d where bit 1 of q is set, negate it elsewhere
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f); // Horner evaluation in s
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d); // d + s * (u * d)
+
+    return u;
+}
+
+static INLINE vfloat2 xsincosf(vfloat d) { // evaluates two polynomials on one reduced argument; presumably r.x is the sine and r.y the cosine
+    vint2 q;
+    vmask m;
+    vfloat u, s, t, rx, ry;
+    vfloat2 r;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI))); // q = round(d * RT_2_PI)
+
+    s = d;
+
+    u = vcast_vf_vi2(q);
+    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s); // s -= q * (2 * PI4_*f), in four steps
+    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
+
+    t = s; // keep the reduced argument
+
+    s = vmulf(s, s); // s now holds its square
+
+    u = vcast_vf_f(-0.000195169282960705459117889f); // odd polynomial: rx = t + t * s * P(s)
+    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
+    u = vmulf(vmulf(u, s), t);
+
+    rx = vaddf(t, u);
+
+    u = vcast_vf_f(-2.71811842367242206819355e-07f); // even polynomial: ry = 1 + s * Q(s)
+    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
+    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
+    u = vmlaf(u, s, vcast_vf_f(-0.5));
+
+    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)); // even q: keep (rx, ry); odd q: swap them
+    r.x = vself(m, rx, ry);
+    r.y = vself(m, ry, rx);
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)); // bit 1 of q set: flip the sign of r.x
+    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); // bit 1 of (q + 1) set: flip the sign of r.y
+    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+    m = vmaskf_isinf(d); // infinite input: both results become NaN
+    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
+    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vfloat xtanf(vfloat d) { // single-precision tangent: reduce, evaluate an odd polynomial, take the reciprocal for odd q
+    vint2 q;
+    vmask m;
+    vfloat u, s, x;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI)))); // q = round(d * 2 * RT_1_PI)
+
+    x = d;
+
+    u = vcast_vf_vi2(q);
+    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x); // x -= q * (2 * PI4_*f), in four steps
+    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
+
+    s = vmulf(x, x); // s = x^2, the polynomial variable
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); // lanes with odd q
+    x = vself(m, vnegf(x), x); // odd q: negate the reduced argument
+
+    u = vcast_vf_f(0.00927245803177356719970703f); // Horner evaluation in s
+    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
+    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
+    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
+    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
+    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
+
+    u = vmlaf(s, vmulf(u, x), x); // x + s * (u * x)
+
+    u = vself(m, vrecf(u), u); // odd q: result is the reciprocal
+
+    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u); // infinite input -> NaN
+
+    return u;
+}
+
+static INLINE vfloat xatanf(vfloat s) { // single-precision arctangent; q records which argument transformations must be undone
+    vfloat t, u;
+    vint2 q;
+
+    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0)); // bit 1 of q: input was negative
+    s = vabsf(s);
+
+    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q); // bit 0 of q: |s| > 1
+    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s); // |s| > 1: work on 1 / s instead
+
+    t = vmulf(s, s); // t = s^2, the polynomial variable
+
+    u = vcast_vf_f(0.00282363896258175373077393f); // Horner evaluation in t
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u))); // s + s * (t * u)
+
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t); // reciprocal was taken: pi/2 - t
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t); // input was negative: negate
+
+    return t;
+}
+
+static INLINE vfloat atan2kf(vfloat y, vfloat x) { // atan2 kernel without special-case handling; the callers in this file pass a non-negative y
+    vfloat s, t, u;
+    vint2 q;
+    vmask p;
+
+    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0)); // x < 0: start from q = -2
+    x = vabsf(x);
+
+    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q); // |x| < y: q += 1
+    p = vmaskf_lt(x, y);
+    s = vself(p, vnegf(x), y); // numerator: -|x| where |x| < y, else y
+    t = vmaxf(x, y); // denominator: the larger of |x| and y
+
+    s = vdivf(s, t);
+    t = vmulf(s, s); // t = s^2, the polynomial variable
+
+    u = vcast_vf_f(0.00282363896258175373077393f); // same coefficients as xatanf
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u))); // s + s * (t * u)
+    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2)))); // add q * pi/2
+
+    return t;
+}
+
+static INLINE vfloat xatan2f(vfloat y, vfloat x) { // atan2(y, x): kernel on |y|, then explicit handling of zero, infinite and NaN inputs
+    vfloat r = atan2kf(vabsf(y), x);
+
+    r = vmulsignf(r, x); // flip the kernel result where x has the sign bit set
+    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r); // x == 0 -> pi/2, x == +inf -> 0, x == -inf -> pi
+    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r); // y infinite: pi/2 for finite x, pi/4 for x == +inf, 3*pi/4 for x == -inf
+    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r); // y == 0: pi where x has the sign bit set, else 0
+
+    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y)); // NaN if either input is NaN; otherwise r with its sign flipped where y is negative
+}
+
+static INLINE vfloat xasinf(vfloat d) { // arcsine via atan2kf(|d|, sqrt((1 + d) * (1 - d))), sign restored at the end
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y); // (1 + d) * (1 - d)
+    x = vsqrtf(x);
+    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x)); // a NaN square root is passed through as NaN
+    return vmulsignf(x, d); // flip the sign where d is negative
+}
+
+static INLINE vfloat xacosf(vfloat d) { // arccosine via atan2kf(sqrt((1 + d) * (1 - d)), |d|), shifted by pi for negative d
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y); // (1 + d) * (1 - d)
+    x = vsqrtf(x);
+    x = vmulsignf(atan2kf(x, vabsf(d)), d); // kernel result, sign flipped where d is negative
+    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI)); // pi where d < 0, else 0
+    x = vaddf(x, y);
+    return x;
+}
+
+static INLINE vfloat xlogf(vfloat d) { // natural logarithm: split d into m * 2^e, series in (m - 1) / (m + 1), then add e * ln 2
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); // exponent taken from d * 0.7071
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); // m = d * 2^(-e)
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); // x = (m - 1) / (m + 1)
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f); // Horner evaluation in x^2
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); // x * t + ln 2 * e
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x); // d == +inf -> +inf
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x); // d < 0 -> NaN
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x); // d == 0 -> -inf
+
+    return x;
+}
+
+static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster: lanes with d <= 1 simply return 0
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); // exponent taken from d * 0.7071
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); // m = d * 2^(-e)
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); // x = (m - 1) / (m + 1)
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f); // Horner evaluation in x^2
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); // x * t + ln 2 * e
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x); // d == +inf -> +inf
+    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x); // d <= 1 -> 0
+
+    return x;
+}
+
+static INLINE vfloat xlogf0(vfloat d) { // xlogf variant that returns 0 instead of +inf / NaN / -inf for the special inputs
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); // exponent taken from d * 0.7071
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); // m = d * 2^(-e)
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); // x = (m - 1) / (m + 1)
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f); // Horner evaluation in x^2
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); // x * t + ln 2 * e
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x); // d == +inf -> 0
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x); // d < 0 -> 0
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x); // d == 0 -> 0
+
+    return x;
+}
+
+static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); // exponent taken from d * 0.7071
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); // m = d * 2^(-e)
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); // x = (m - 1) / (m + 1)
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f); // Horner evaluation in x^2
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); // x * t + ln 2 * e, with no special-case fix-ups
+
+}
+
+static INLINE vfloat xexpf(vfloat d) { // single-precision exp: polynomial on a reduced argument, rescaled with vldexpf
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); // q = round(d * R_LN2f); presumably R_LN2f is 1 / ln 2
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d); // s = d - q * L2Uf - q * L2Lf; presumably the upper and lower parts of ln 2
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f); // Horner evaluation in s
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s)); // 1 + s + s^2 * u
+
+    u = vldexpf(u, q); // scale by 2^q
+
+    // lanes with d < -104 are forced to 0
+    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
+}
+
+static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); // q = round(d * R_LN2f); presumably R_LN2f is 1 / ln 2
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d); // s = d - q * L2Uf - q * L2Lf; presumably the upper and lower parts of ln 2
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f); // Horner evaluation in s
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s)); // 1 + s + s^2 * u
+
+    return vldexpf(u, q); // scale by 2^q, without the d < -104 clamp that xexpf applies
+}
+
+static INLINE vfloat xcbrtf(vfloat d) { // cube root: exponent handled as integer thirds, mantissa by polynomial plus a correction step
+    vfloat x, y, q = vcast_vf_f(1.0), t;
+    vint2 e, qu, re;
+
+    e = vilogbp1f(vabsf(d)); // binary exponent of |d| (plus one)
+    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); // d * 2^(-e)
+
+    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144)); // bias by 6144 = 3 * 2048 before the truncating division
+    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3))); // quotient of (e + 6144) / 3
+    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3)))); // remainder: 0, 1 or 2
+
+    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q); // remainder 1: 2^(1/3)
+    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q); // remainder 2: 2^(2/3)
+    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048))); // times 2^(qu - 2048), which removes the bias
+
+    q = vmulsignf(q, d); // the scale factor takes the sign of d
+    d = vabsf(d);
+
+    x = vcast_vf_f(-0.601564466953277587890625f); // Horner evaluation in d
+    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
+    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
+    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
+    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
+    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
+
+    y = vmulf(vmulf(d, x), x); // y = d * x^2
+    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q); // (y - 2/3 * y * (y * x - 1)) * q
+
+    return y;
+}
+
+static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
+    // clamps value in [low;high], returns low if value is NaN
+    return vmaxf(vminf(high, value), low); // operand order matters for the NaN case: SSE min/max return their second operand when one input is NaN
+}
+
+static INLINE vfloat SQRV(vfloat a){ // per-lane square
+    return a * a;
+}
+
+static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
+    // swaps a and b in the lanes where condition is NOT set; lanes where it is set are left unchanged
+    vfloat temp = vself(condition, a, b); // new b: a where condition is set, else b
+    a = vself(condition, b, a); // new a: b where condition is set, else a
+    b = temp;
+}
+
+static inline float vhadd( vfloat a ) {
+    // returns a[0] + a[1] + a[2] + a[3]
+    a += _mm_movehl_ps(a, a); // lanes 0 and 1 now hold a[0] + a[2] and a[1] + a[3]
+    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1))); // add lane 1 to lane 0 and extract lane 0
+}
+
+static inline float vhmin(vfloat a) {
+    // returns min(a[0], a[1], a[2], a[3])
+    a = vminf(a, _mm_movehl_ps(a, a)); // lanes 0 and 1 now hold min(a[0], a[2]) and min(a[1], a[3])
+    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1))); // min of lanes 0 and 1, extracted from lane 0
+}
+
+static inline float vhmax(vfloat a) {
+    // returns max(a[0], a[1], a[2], a[3])
+    a = vmaxf(a, _mm_movehl_ps(a, a)); // lanes 0 and 1 now hold max(a[0], a[2]) and max(a[1], a[3])
+    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1))); // max of lanes 0 and 1, extracted from lane 0
+}
+
+static INLINE vfloat vmul2f(vfloat a){
+    // doubles every lane; uses an addition instead of a multiplication by 2
+    return a + a;
+}
+
+static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
+    // calculate a * b + (1 - a) * c (interpolate two values)
+    // the following identities hold:
+    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
+    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c; // same value as a * b + (1 - a) * c, with one multiplication fewer
+}
+
+static INLINE vfloat vdup(vfloat a){
+    // returns { a[0],a[0],a[1],a[1] }
+    return _mm_unpacklo_ps( a, a ); // interleaves the two low lanes of a with themselves
+}
+
+static INLINE vfloat vaddc2vfu(float &a)
+{
+    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
+    vfloat a1 = _mm_loadu_ps( &a ); // unaligned load of a[0]..a[3]
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 ); // unaligned load of a[4]..a[7]; the caller must provide 8 readable floats
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 )); // even-indexed elements plus odd-indexed elements
+}
+
+static INLINE vfloat vadivapb (vfloat a, vfloat b) { // per lane: a / (a + b); no guard against a + b == 0
+    return a / (a+b);
+}
+
+static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
+    // converts a sequence of 4 interleaved float RGB triplets (reads src[0]..src[11]) into 3 vectors of 4 reds, 4 greens and 4 blues
+    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]); // every third value starting at offset 0
+    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]); // every third value starting at offset 1
+    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]); // every third value starting at offset 2
+}
+
+#if defined( __SSE4_1__ ) && defined( __x86_64__ )
+static INLINE vfloat vceilf(vfloat x) { // per-lane ceil using the SSE4.1 rounding instruction
+    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC); // round toward +inf, floating-point exceptions suppressed
+}
+
+#else
+
+static INLINE vfloat vceilf(vfloat x) { // fallback: truncate via int32, then add 1 where truncation went below x
+    __m128i zerov = _mm_setzero_si128();
+    zerov = _mm_cmpeq_epi32(zerov, zerov); // all bits set in every lane (despite the name)
+    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f (0x7f << 23 = 0x3f800000)
+    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x)); // truncate toward zero; NOTE(review): the int32 round trip is only meaningful while x fits in int32
+    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev); // add 1.0f in lanes where xi < x
+}
+#endif
+
+#endif // __SSE2__
diff --git a/rtengine/utils.cc b/rtengine/utils.cc
index a07a1235f..0674c9806 100644
--- a/rtengine/utils.cc
+++ b/rtengine/utils.cc
@@ -22,7 +22,6 @@
 #include "rt_math.h"
 
 #include "utils.h"
-#include "rt_math.h"
 
 using namespace std;
 
diff --git a/rtgui/cropwindow.cc b/rtgui/cropwindow.cc
index 0a9b81112..edc378700 100644
--- a/rtgui/cropwindow.cc
+++ b/rtgui/cropwindow.cc
@@ -37,6 +37,7 @@
 #include "rtsurface.h"
 
 #include "../rtengine/dcrop.h"
+#include "../rtengine/imagesource.h"
 #include "../rtengine/procparams.h"
 #include "../rtengine/rt_math.h"
 
diff --git a/rtgui/curveeditor.h b/rtgui/curveeditor.h
index baae8f492..abc0cc10a 100644
--- a/rtgui/curveeditor.h
+++ b/rtgui/curveeditor.h
@@ -18,8 +18,8 @@
  */
 #pragma once
 
-#include "coloredbar.h"
 #include "editcallbacks.h"
+#include "guiutils.h"
 
 #include "../rtengine/diagonalcurvetypes.h"
 #include "../rtengine/flatcurvetypes.h"
@@ -28,6 +28,7 @@
 
 class CurveEditorGroup;
 class CurveEditorSubGroup;
+class ColorProvider;
 class PopUpToggleButton;
 
 /*
diff --git a/rtgui/guiutils.cc b/rtgui/guiutils.cc
index 02a28607f..9a4e71ab4 100644
--- a/rtgui/guiutils.cc
+++ b/rtgui/guiutils.cc
@@ -20,7 +20,6 @@
 
 #include "guiutils.h"
 #include "options.h"
-#include "../rtengine/rt_math.h"
 #include "../rtengine/utils.h"
 #include "../rtengine/procparams.h"
 #include "rtimage.h"

From 9a4ade6f203bbb459582b8c7706a509e937724c7 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Thu, 23 Jan 2020 22:18:46 +0100
Subject: [PATCH 5/8] devirtualize method calls by declaring classes and
 methods final

---
 rtengine/curves.h                   |  4 ++--
 rtengine/iimage.h                   | 20 +++++++++----------
 rtengine/image16.h                  |  2 +-
 rtengine/image8.h                   |  2 +-
 rtengine/imagedata.h                |  4 ++--
 rtengine/imagefloat.h               |  2 +-
 rtengine/imagesource.h              | 10 +++++-----
 rtengine/improccoordinator.h        |  2 +-
 rtengine/processingjob.h            |  2 +-
 rtengine/rawimagesource.h           |  2 +-
 rtexif/pentaxattribs.cc             |  2 +-
 rtexif/rtexif.h                     | 10 +++++-----
 rtexif/sonyminoltaattribs.cc        |  4 ++--
 rtgui/adjuster.h                    |  2 +-
 rtgui/batchqueueentry.h             |  2 +-
 rtgui/batchtoolpanelcoord.h         |  2 +-
 rtgui/coloredbar.h                  |  2 +-
 rtgui/coordinateadjuster.h          |  2 +-
 rtgui/cropwindow.h                  |  2 +-
 rtgui/diagonalcurveeditorsubgroup.h |  2 +-
 rtgui/editcallbacks.h               |  6 +++---
 rtgui/exifpanel.h                   |  2 +-
 rtgui/filebrowser.h                 |  2 +-
 rtgui/filebrowserentry.h            |  2 +-
 rtgui/filecatalog.h                 |  2 +-
 rtgui/flatcurveeditorsubgroup.h     |  2 +-
 rtgui/guiutils.h                    | 30 ++++++++++++++---------------
 rtgui/histogrampanel.h              |  6 +++---
 rtgui/iccprofilecreator.h           |  2 +-
 rtgui/imagearea.h                   |  2 +-
 rtgui/imageareapanel.h              |  2 +-
 rtgui/inspector.h                   |  2 +-
 rtgui/iptcpanel.h                   |  2 +-
 rtgui/lockablecolorpicker.h         |  2 +-
 rtgui/mydiagonalcurve.h             |  2 +-
 rtgui/myflatcurve.h                 |  2 +-
 rtgui/navigator.h                   |  2 +-
 rtgui/partialpastedlg.h             |  2 +-
 rtgui/popupbutton.h                 |  2 +-
 rtgui/preferences.h                 |  2 +-
 rtgui/previewhandler.h              |  2 +-
 rtgui/profilepanel.h                |  2 +-
 rtgui/progressconnector.h           |  2 +-
 rtgui/renamedlg.h                   |  2 +-
 rtgui/rtimage.h                     |  2 +-
 rtgui/rtwindow.h                    |  2 +-
 rtgui/saveasdlg.h                   |  2 +-
 rtgui/shcselector.h                 |  2 +-
 rtgui/splash.h                      |  4 ++--
 rtgui/thresholdselector.h           |  8 ++++----
 rtgui/thumbbrowserbase.h            |  8 ++++----
 rtgui/toolpanel.h                   | 12 ++++++------
 rtgui/toolpanelcoord.h              |  4 ++--
 53 files changed, 104 insertions(+), 104 deletions(-)

diff --git a/rtengine/curves.h b/rtengine/curves.h
index bc8193b76..1f07e6dbe 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -454,7 +454,7 @@ public:
     virtual bool   isIdentity () const = 0;
 };
 
-class DiagonalCurve : public Curve
+class DiagonalCurve final : public Curve
 {
 
 protected:
@@ -476,7 +476,7 @@ public:
     };
 };
 
-class FlatCurve : public Curve, public rtengine::NonCopyable
+class FlatCurve final : public Curve, public rtengine::NonCopyable
 {
 
 private:
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index 7309dd91f..f44c01320 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -658,7 +658,7 @@ public:
 
     /* If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image */
-    void allocate (int W, int H) override
+    void allocate (int W, int H) override final
     {
 
         if (W == width && H == height) {
@@ -746,7 +746,7 @@ public:
         }
     }
 
-    void rotate (int deg) override
+    void rotate (int deg) override final
     {
 
         if (deg == 90) {
@@ -873,7 +873,7 @@ public:
         }
     }
 
-    void hflip () override
+    void hflip () override final
     {
         int width2 = width / 2;
 
@@ -905,7 +905,7 @@ public:
 #endif
     }
 
-    void vflip () override
+    void vflip () override final
     {
 
         int height2 = height / 2;
@@ -989,7 +989,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
@@ -1328,7 +1328,7 @@ public:
      * If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image or to free up it's memory with "allocate (0,0);"
      */
-    void allocate (int W, int H) override
+    void allocate (int W, int H) override final
     {
 
         if (W == width && H == height) {
@@ -1382,7 +1382,7 @@ public:
         memcpy (dest->data, data, 3 * width * height * sizeof(T));
     }
 
-    void rotate (int deg) override
+    void rotate (int deg) override final
     {
 
         if (deg == 90) {
@@ -1516,7 +1516,7 @@ public:
         }
     }
 
-    void hflip () override
+    void hflip () override final
     {
         int width2 = width / 2;
 
@@ -1552,7 +1552,7 @@ public:
         }
     }
 
-    void vflip () override
+    void vflip () override final
     {
 
         AlignedBuffer<T> lBuffer(3 * width);
@@ -1619,7 +1619,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
diff --git a/rtengine/image16.h b/rtengine/image16.h
index d0053cbfc..de9288f2d 100644
--- a/rtengine/image16.h
+++ b/rtengine/image16.h
@@ -29,7 +29,7 @@ namespace rtengine
 class Image8;
 class Imagefloat;
 
-class Image16 : public IImage16, public ImageIO
+class Image16 final : public IImage16, public ImageIO
 {
 
 public:
diff --git a/rtengine/image8.h b/rtengine/image8.h
index f125dccf8..c11d9d7fc 100644
--- a/rtengine/image8.h
+++ b/rtengine/image8.h
@@ -27,7 +27,7 @@ namespace rtengine
 {
 class Imagefloat;
 
-class Image8 : public IImage8, public ImageIO
+class Image8 final : public IImage8, public ImageIO
 {
 
 public:
diff --git a/rtengine/imagedata.h b/rtengine/imagedata.h
index ff8ed4b86..28d32c5f0 100644
--- a/rtengine/imagedata.h
+++ b/rtengine/imagedata.h
@@ -38,7 +38,7 @@ class TagDirectory;
 namespace rtengine
 {
 
-class FrameData
+class FrameData final
 {
 
 protected:
@@ -95,7 +95,7 @@ public:
     int getRating () const;
 };
 
-class FramesData : public FramesMetaData {
+class FramesData final : public FramesMetaData {
 private:
     // frame's root IFD, can be a file root IFD or a SUB-IFD
     std::vector<std::unique_ptr<FrameData>> frames;
diff --git a/rtengine/imagefloat.h b/rtengine/imagefloat.h
index 4a2b2f7e1..cd08daf70 100644
--- a/rtengine/imagefloat.h
+++ b/rtengine/imagefloat.h
@@ -34,7 +34,7 @@ class LabImage;
 /*
  * Image type used by most tools; expected range: [0.0 ; 65535.0]
  */
-class Imagefloat : public IImagefloat, public ImageIO
+class Imagefloat final : public IImagefloat, public ImageIO
 {
 
 public:
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index e0c26aa9f..d1fa5896e 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -143,11 +143,11 @@ public:
 
     virtual void        setProgressListener (ProgressListener* pl) {}
 
-    void        increaseRef () override
+    void        increaseRef () override final
     {
         references++;
     }
-    void        decreaseRef () override
+    void        decreaseRef () override final
     {
         references--;
 
@@ -175,15 +175,15 @@ public:
         return dirpyrdenoiseExpComp;
     }
     // functions inherited from the InitialImage interface
-    Glib::ustring getFileName () override
+    Glib::ustring getFileName () override final
     {
         return fileName;
     }
-    cmsHPROFILE getEmbeddedProfile () override
+    cmsHPROFILE getEmbeddedProfile () override final
     {
         return embProfile;
     }
-    const FramesMetaData* getMetaData () override
+    const FramesMetaData* getMetaData () override final
     {
         return idata;
     }
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 96d1f80ce..724443ecd 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -55,7 +55,7 @@ class Crop;
   * but using this class' LUT and other precomputed parameters. The main preview area is displaying a non framed Crop object,
   * while detail windows are framed Crop objects.
   */
-class ImProcCoordinator : public StagedImageProcessor
+class ImProcCoordinator final : public StagedImageProcessor
 {
 
     friend class Crop;
diff --git a/rtengine/processingjob.h b/rtengine/processingjob.h
index 6cdc6bd7c..9c3846958 100644
--- a/rtengine/processingjob.h
+++ b/rtengine/processingjob.h
@@ -24,7 +24,7 @@
 namespace rtengine
 {
 
-class ProcessingJobImpl : public ProcessingJob
+class ProcessingJobImpl final : public ProcessingJob
 {
 
 public:
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index bd460c61b..6be02f6d6 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -37,7 +37,7 @@ class DiagonalCurve;
 class RetinextransmissionCurve;
 class RetinexgaintransmissionCurve;
 
-class RawImageSource : public ImageSource
+class RawImageSource final : public ImageSource
 {
 private:
     static DiagonalCurve *phaseOneIccCurve;
diff --git a/rtexif/pentaxattribs.cc b/rtexif/pentaxattribs.cc
index 898150696..14f204b08 100644
--- a/rtexif/pentaxattribs.cc
+++ b/rtexif/pentaxattribs.cc
@@ -696,7 +696,7 @@ public:
 };
 PAColorSpaceInterpreter paColorSpaceInterpreter;
 
-class PALensTypeInterpreter : public IntLensInterpreter< int >
+class PALensTypeInterpreter final: public IntLensInterpreter< int >
 {
 public:
     PALensTypeInterpreter ()
diff --git a/rtexif/rtexif.h b/rtexif/rtexif.h
index c37533352..1a956a4a5 100644
--- a/rtexif/rtexif.h
+++ b/rtexif/rtexif.h
@@ -163,14 +163,14 @@ public:
     // Try to get the Tag in the current directory and in subdirectories
     // if lookUpward = true, it will scan the parents TagDirectory up to the root one,
     // but w/o looking into their subdirs
-    virtual Tag*     findTag       (const char* name, bool lookUpward = false) const;
+    Tag*     findTag       (const char* name, bool lookUpward = false) const;
     // Find a all Tags with the given name by scanning the whole tag tree
     std::vector<const Tag*> findTags (const char* name);
     // Find a all Tags with the given ID by scanning the whole tag tree
     std::vector<const Tag*> findTags (int ID);
     // Try to get the Tag in the current directory and in parent directories
     // (won't look into subdirs)
-    virtual Tag*     findTagUpward (const char* name) const;
+    Tag*     findTagUpward (const char* name) const;
     bool             getXMPTagValue (const char* name, char* value) const;
 
     void        keepTag       (int ID);
@@ -191,10 +191,10 @@ public:
     virtual TagDirectory* clone    (TagDirectory* parent) const;
     void     applyChange   (const std::string &field, const Glib::ustring &value);
 
-    virtual void     printAll      (unsigned  int level = 0) const; // reentrant debug function, keep level=0 on first call !
-    virtual bool     CPBDump       (const Glib::ustring &commFName, const Glib::ustring &imageFName, const Glib::ustring &profileFName, const Glib::ustring &defaultPParams,
+    void     printAll      (unsigned  int level = 0) const; // reentrant debug function, keep level=0 on first call !
+    bool     CPBDump       (const Glib::ustring &commFName, const Glib::ustring &imageFName, const Glib::ustring &profileFName, const Glib::ustring &defaultPParams,
                                     const CacheImageData* cfs, const bool flagMode, Glib::KeyFile *keyFile = nullptr, Glib::ustring tagDirName = "") const;
-    virtual void     sort     ();
+    void     sort     ();
 };
 
 // a table of tags: id are offset from beginning and not identifiers
diff --git a/rtexif/sonyminoltaattribs.cc b/rtexif/sonyminoltaattribs.cc
index 4410a4051..82323cede 100644
--- a/rtexif/sonyminoltaattribs.cc
+++ b/rtexif/sonyminoltaattribs.cc
@@ -541,7 +541,7 @@ public:
 };
 SAAntiBlurInterpreter saAntiBlurInterpreter;
 
-class SALensIDInterpreter : public IntLensInterpreter<int>
+class SALensIDInterpreter final : public IntLensInterpreter<int>
 {
 public:
     SALensIDInterpreter ()
@@ -1192,7 +1192,7 @@ public:
 };
 SALensIDInterpreter saLensIDInterpreter;
 
-class SALensID2Interpreter : public IntLensInterpreter< int >
+class SALensID2Interpreter final : public IntLensInterpreter< int >
 {
 public:
     SALensID2Interpreter ()
diff --git a/rtgui/adjuster.h b/rtgui/adjuster.h
index 59250bc81..143268786 100644
--- a/rtgui/adjuster.h
+++ b/rtgui/adjuster.h
@@ -33,7 +33,7 @@ public:
 
 typedef double(*double2double_fun)(double val);
 
-class Adjuster : public Gtk::Grid
+class Adjuster final : public Gtk::Grid
 {
 
 protected:
diff --git a/rtgui/batchqueueentry.h b/rtgui/batchqueueentry.h
index c4cd48615..f06b65046 100644
--- a/rtgui/batchqueueentry.h
+++ b/rtgui/batchqueueentry.h
@@ -50,7 +50,7 @@ struct BatchQueueEntryIdleHelper {
     int pending;
 };
 
-class BatchQueueEntry : public ThumbBrowserEntryBase, public BQEntryUpdateListener, public rtengine::NonCopyable
+class BatchQueueEntry final : public ThumbBrowserEntryBase, public BQEntryUpdateListener, public rtengine::NonCopyable
 {
 
     guint8* opreview;
diff --git a/rtgui/batchtoolpanelcoord.h b/rtgui/batchtoolpanelcoord.h
index 7a5fe77ed..ea11f97c8 100644
--- a/rtgui/batchtoolpanelcoord.h
+++ b/rtgui/batchtoolpanelcoord.h
@@ -28,7 +28,7 @@
 
 class FilePanel;
 class Thumbnail;
-class BatchToolPanelCoordinator :
+class BatchToolPanelCoordinator final :
     public ToolPanelCoordinator,
     public FileSelectionChangeListener,
     public BatchPParamsChangeListener,
diff --git a/rtgui/coloredbar.h b/rtgui/coloredbar.h
index 6cc121cd5..69cfa47d1 100644
--- a/rtgui/coloredbar.h
+++ b/rtgui/coloredbar.h
@@ -30,7 +30,7 @@
  *          the bar itself, i.e. use render_background (depending on its Gtk::Style)
  *
  */
-class ColoredBar : private BackBuffer, public ColorCaller
+class ColoredBar final : private BackBuffer, public ColorCaller
 {
 
 private:
diff --git a/rtgui/coordinateadjuster.h b/rtgui/coordinateadjuster.h
index 70fe42233..24edc707a 100644
--- a/rtgui/coordinateadjuster.h
+++ b/rtgui/coordinateadjuster.h
@@ -69,7 +69,7 @@ public:
  *
  * The position of the Axis in the vector will be used in the communication between the Adjuster and the Provider to identify the Axis
  */
-class CoordinateAdjuster : public Gtk::FlowBox
+class CoordinateAdjuster final : public Gtk::FlowBox
 {
 
 public:
diff --git a/rtgui/cropwindow.h b/rtgui/cropwindow.h
index 491124ad5..623653d2d 100644
--- a/rtgui/cropwindow.h
+++ b/rtgui/cropwindow.h
@@ -54,7 +54,7 @@ public:
 };
 
 class ImageArea;
-class CropWindow : public LWButtonListener, public CropDisplayHandler, public EditCoordSystem, public ObjectMOBuffer, public rtengine::NonCopyable
+class CropWindow final : public LWButtonListener, public CropDisplayHandler, public EditCoordSystem, public ObjectMOBuffer, public rtengine::NonCopyable
 {
     static bool initialized;
 
diff --git a/rtgui/diagonalcurveeditorsubgroup.h b/rtgui/diagonalcurveeditorsubgroup.h
index a077da807..9b1f67462 100644
--- a/rtgui/diagonalcurveeditorsubgroup.h
+++ b/rtgui/diagonalcurveeditorsubgroup.h
@@ -27,7 +27,7 @@
 class DiagonalCurveEditor;
 class MyDiagonalCurve;
 
-class DiagonalCurveEditorSubGroup :
+class DiagonalCurveEditorSubGroup final :
     public CurveEditorSubGroup,
     public SHCListener,
     public AdjusterListener,
diff --git a/rtgui/editcallbacks.h b/rtgui/editcallbacks.h
index c2efcf53e..ee357c2f7 100644
--- a/rtgui/editcallbacks.h
+++ b/rtgui/editcallbacks.h
@@ -129,19 +129,19 @@ public:
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick1 (bool picked);
+    bool pick1 (bool picked);
 
     /** @brief Triggered when the user is releasing mouse button 2 while in action==ES_ACTION_PICKING mode
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick2 (bool picked);
+    bool pick2 (bool picked);
 
     /** @brief Triggered when the user is releasing mouse button 3 while in action==ES_ACTION_PICKING mode
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick3 (bool picked);
+    bool pick3 (bool picked);
 
     /** @brief Get the geometry to be shown to the user */
     const std::vector<Geometry*>& getVisibleGeometry ();
diff --git a/rtgui/exifpanel.h b/rtgui/exifpanel.h
index 0894c21ad..d336d83f9 100644
--- a/rtgui/exifpanel.h
+++ b/rtgui/exifpanel.h
@@ -25,7 +25,7 @@
 #include "toolpanel.h"
 #include "../rtexif/rtexif.h"
 
-class ExifPanel :
+class ExifPanel final :
     public Gtk::VBox,
     public ToolPanel
 {
diff --git a/rtgui/filebrowser.h b/rtgui/filebrowser.h
index 86ab59395..53f3f1f2b 100644
--- a/rtgui/filebrowser.h
+++ b/rtgui/filebrowser.h
@@ -56,7 +56,7 @@ public:
 /*
  * Class handling actions common to all thumbnails of the file browser
  */
-class FileBrowser  : public ThumbBrowserBase,
+class FileBrowser final : public ThumbBrowserBase,
     public LWButtonListener,
     public ExportPanelListener,
     public ProfileStoreListener,
diff --git a/rtgui/filebrowserentry.h b/rtgui/filebrowserentry.h
index ea5140ed6..67b953514 100644
--- a/rtgui/filebrowserentry.h
+++ b/rtgui/filebrowserentry.h
@@ -43,7 +43,7 @@ struct FileBrowserEntryIdleHelper {
 };
 
 class FileThumbnailButtonSet;
-class FileBrowserEntry : public ThumbBrowserEntryBase,
+class FileBrowserEntry final : public ThumbBrowserEntryBase,
     public ThumbnailListener,
     public ThumbImageUpdateListener,
     public rtengine::NonCopyable
diff --git a/rtgui/filecatalog.h b/rtgui/filecatalog.h
index 8f7e5618f..b94d049d0 100644
--- a/rtgui/filecatalog.h
+++ b/rtgui/filecatalog.h
@@ -43,7 +43,7 @@ class ToolBar;
  *   - handling the thumbnail toolbar,
  *   - monitoring the directory (for any change)
  */
-class FileCatalog : public Gtk::VBox,
+class FileCatalog final : public Gtk::VBox,
     public PreviewLoaderListener,
     public FilterPanelListener,
     public FileBrowserListener,
diff --git a/rtgui/flatcurveeditorsubgroup.h b/rtgui/flatcurveeditorsubgroup.h
index c358dde3b..865a0ef83 100644
--- a/rtgui/flatcurveeditorsubgroup.h
+++ b/rtgui/flatcurveeditorsubgroup.h
@@ -27,7 +27,7 @@
 class FlatCurveEditor;
 class MyFlatCurve;
 
-class FlatCurveEditorSubGroup:
+class FlatCurveEditorSubGroup final :
     public CurveEditorSubGroup,
     public rtengine::NonCopyable
 {
diff --git a/rtgui/guiutils.h b/rtgui/guiutils.h
index 97c72513b..d4c91734a 100644
--- a/rtgui/guiutils.h
+++ b/rtgui/guiutils.h
@@ -92,7 +92,7 @@ private:
  *     }
  *   </code>
  */
-class GThreadLock
+class GThreadLock final
 {
 public:
     GThreadLock()
@@ -110,7 +110,7 @@ public:
  *
  * Will relock on destruction.
  */
-class GThreadUnLock
+class GThreadUnLock final
 {
 public:
     GThreadUnLock()
@@ -125,7 +125,7 @@ public:
 
 #pragma GCC diagnostic pop
 
-class ConnectionBlocker
+class ConnectionBlocker final
 {
 public:
     explicit ConnectionBlocker (Gtk::Widget *associatedWidget, sigc::connection& connection) : connection (associatedWidget ? &connection : nullptr), wasBlocked(false)
@@ -152,7 +152,7 @@ private:
 /**
  * @brief Glue box to control visibility of the MyExpender's content ; also handle the frame around it
  */
-class ExpanderBox: public Gtk::EventBox
+class ExpanderBox final : public Gtk::EventBox
 {
 private:
     Gtk::Container *pC;
@@ -185,7 +185,7 @@ public:
  *
  * Warning: once you've instantiated this class with a text label or a widget label, you won't be able to revert to the other solution.
  */
-class MyExpander : public Gtk::VBox
+class MyExpander final : public Gtk::VBox
 {
 public:
     typedef sigc::signal<void> type_signal_enabled_toggled;
@@ -295,7 +295,7 @@ public:
 /**
  * @brief subclass of Gtk::ScrolledWindow in order to handle the scrollwheel
  */
-class MyScrolledWindow : public Gtk::ScrolledWindow
+class MyScrolledWindow final : public Gtk::ScrolledWindow
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -310,7 +310,7 @@ public:
 /**
  * @brief subclass of Gtk::ScrolledWindow in order to handle the large toolbars (wider than available space)
  */
-class MyScrolledToolbar : public Gtk::ScrolledWindow
+class MyScrolledToolbar final : public Gtk::ScrolledWindow
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -340,7 +340,7 @@ public:
 /**
  * @brief subclass of Gtk::ComboBoxText in order to handle the scrollwheel
  */
-class MyComboBoxText : public Gtk::ComboBoxText
+class MyComboBoxText final : public Gtk::ComboBoxText
 {
     int naturalWidth, minimumWidth;
     sigc::connection myConnection;
@@ -360,7 +360,7 @@ public:
 /**
  * @brief subclass of Gtk::SpinButton in order to handle the scrollwheel
  */
-class MySpinButton : public Gtk::SpinButton
+class MySpinButton final : public Gtk::SpinButton
 {
 
 protected:
@@ -375,7 +375,7 @@ public:
 /**
  * @brief subclass of Gtk::HScale in order to handle the scrollwheel
  */
-class MyHScale : public Gtk::HScale
+class MyHScale final : public Gtk::HScale
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -385,7 +385,7 @@ class MyHScale : public Gtk::HScale
 /**
  * @brief subclass of Gtk::FileChooserButton in order to handle the scrollwheel
  */
-class MyFileChooserButton: public Gtk::Button {
+class MyFileChooserButton final : public Gtk::Button {
 private:
     void show_chooser();
 
@@ -473,14 +473,14 @@ typedef enum RTNav {
 /**
  * @brief Handle the switch between text and image to be displayed in the HBox (to be used in a button/toolpanel)
  */
-class TextOrIcon : public Gtk::HBox
+class TextOrIcon final : public Gtk::HBox
 {
 
 public:
     TextOrIcon (const Glib::ustring &filename, const Glib::ustring &labelTx, const Glib::ustring &tooltipTx);
 };
 
-class MyImageMenuItem : public Gtk::MenuItem
+class MyImageMenuItem final : public Gtk::MenuItem
 {
 private:
     Gtk::Grid *box;
@@ -493,7 +493,7 @@ public:
     const Gtk::Label* getLabel () const;
 };
 
-class MyProgressBar : public Gtk::ProgressBar
+class MyProgressBar final : public Gtk::ProgressBar
 {
 private:
     int w;
@@ -512,7 +512,7 @@ public:
 /**
  * @brief Define a gradient milestone
  */
-class GradientMilestone
+class GradientMilestone final
 {
 public:
     double position;
diff --git a/rtgui/histogrampanel.h b/rtgui/histogrampanel.h
index 4fd21bcc2..cec44ed03 100644
--- a/rtgui/histogrampanel.h
+++ b/rtgui/histogrampanel.h
@@ -53,7 +53,7 @@ public:
     double log (double vsize, double val);
 };
 
-class HistogramRGBArea : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
+class HistogramRGBArea final : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
 {
 private:
     typedef const double (*TMatrix)[3];
@@ -116,7 +116,7 @@ public:
     virtual void toggleButtonMode() = 0;
 };
 
-class HistogramArea : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
+class HistogramArea final : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
 {
 public:
     typedef sigc::signal<void, double> type_signal_factor_changed;
@@ -174,7 +174,7 @@ private:
     void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
 };
 
-class HistogramPanel : public Gtk::Grid, public PointerMotionListener, public DrawModeListener, public rtengine::NonCopyable
+class HistogramPanel final : public Gtk::Grid, public PointerMotionListener, public DrawModeListener, public rtengine::NonCopyable
 {
 
 protected:
diff --git a/rtgui/iccprofilecreator.h b/rtgui/iccprofilecreator.h
index 2cd19e14f..8ee46a71d 100644
--- a/rtgui/iccprofilecreator.h
+++ b/rtgui/iccprofilecreator.h
@@ -27,7 +27,7 @@
 
 class RTWindow;
 
-class ICCProfileCreator : public Gtk::Dialog, public AdjusterListener
+class ICCProfileCreator final : public Gtk::Dialog, public AdjusterListener
 {
 
 private:
diff --git a/rtgui/imagearea.h b/rtgui/imagearea.h
index 28b52dcd2..586bba7a7 100644
--- a/rtgui/imagearea.h
+++ b/rtgui/imagearea.h
@@ -34,7 +34,7 @@
 
 class ImageAreaPanel;
 
-class ImageArea :
+class ImageArea final :
     public Gtk::DrawingArea,
     public CropWindowListener,
     public EditDataProvider,
diff --git a/rtgui/imageareapanel.h b/rtgui/imageareapanel.h
index 831371dbb..f52ca9060 100644
--- a/rtgui/imageareapanel.h
+++ b/rtgui/imageareapanel.h
@@ -22,7 +22,7 @@
 
 class ImageArea;
 
-class ImageAreaPanel :
+class ImageAreaPanel final :
     public Gtk::VBox
 {
 
diff --git a/rtgui/inspector.h b/rtgui/inspector.h
index 8338259bf..1526f90be 100644
--- a/rtgui/inspector.h
+++ b/rtgui/inspector.h
@@ -40,7 +40,7 @@ public:
     //~InspectorBuffer();
 };
 
-class Inspector : public Gtk::DrawingArea
+class Inspector final : public Gtk::DrawingArea
 {
 
 private:
diff --git a/rtgui/iptcpanel.h b/rtgui/iptcpanel.h
index 15d117f87..100904d1a 100644
--- a/rtgui/iptcpanel.h
+++ b/rtgui/iptcpanel.h
@@ -25,7 +25,7 @@
 #include "guiutils.h"
 #include "toolpanel.h"
 
-class IPTCPanel :
+class IPTCPanel final :
     public Gtk::VBox,
     public ToolPanel
 {
diff --git a/rtgui/lockablecolorpicker.h b/rtgui/lockablecolorpicker.h
index 77d2e8e9f..baeea41ef 100644
--- a/rtgui/lockablecolorpicker.h
+++ b/rtgui/lockablecolorpicker.h
@@ -33,7 +33,7 @@ public:
     virtual void switchPickerVisibility(bool isVisible) = 0;
 };
 
-class LockableColorPicker : BackBuffer
+class LockableColorPicker final : BackBuffer
 {
 public:
     enum class Size {
diff --git a/rtgui/mydiagonalcurve.h b/rtgui/mydiagonalcurve.h
index b38373006..b2b83c02e 100644
--- a/rtgui/mydiagonalcurve.h
+++ b/rtgui/mydiagonalcurve.h
@@ -41,7 +41,7 @@ public:
     std::vector<double> x, y;   // in case of parametric curves the curve parameters are stored in vector x. In other cases these vectors store the coordinates of the bullets.
 };
 
-class MyDiagonalCurve : public MyCurve
+class MyDiagonalCurve final : public MyCurve
 {
 private:
     IdleRegister idle_register;
diff --git a/rtgui/myflatcurve.h b/rtgui/myflatcurve.h
index 88d651a05..5da1d09ad 100644
--- a/rtgui/myflatcurve.h
+++ b/rtgui/myflatcurve.h
@@ -66,7 +66,7 @@ public:
     double centerY;
 };
 
-class MyFlatCurve : public MyCurve
+class MyFlatCurve final : public MyCurve
 {
 
 protected:
diff --git a/rtgui/navigator.h b/rtgui/navigator.h
index c1c23c6dc..e9d40e309 100644
--- a/rtgui/navigator.h
+++ b/rtgui/navigator.h
@@ -25,7 +25,7 @@
 
 class PreviewWindow;
 
-class Navigator :
+class Navigator final :
     public Gtk::Frame,
     public PointerMotionListener
 {
diff --git a/rtgui/partialpastedlg.h b/rtgui/partialpastedlg.h
index 1403e7c1b..75e18e83c 100644
--- a/rtgui/partialpastedlg.h
+++ b/rtgui/partialpastedlg.h
@@ -34,7 +34,7 @@ class ProcParams;
 
 struct ParamsEdited;
 
-class PartialPasteDlg : public Gtk::Dialog
+class PartialPasteDlg final : public Gtk::Dialog
 {
 
 public:
diff --git a/rtgui/popupbutton.h b/rtgui/popupbutton.h
index 87b1b73a6..a67339a9f 100644
--- a/rtgui/popupbutton.h
+++ b/rtgui/popupbutton.h
@@ -24,7 +24,7 @@
 
 #include "popupcommon.h"
 
-class PopUpButton :
+class PopUpButton final :
     public Gtk::Button,
     public PopUpCommon
 {
diff --git a/rtgui/preferences.h b/rtgui/preferences.h
index 76a104ffa..a6cbe7939 100644
--- a/rtgui/preferences.h
+++ b/rtgui/preferences.h
@@ -29,7 +29,7 @@
 class RTWindow;
 class Splash;
 
-class Preferences :
+class Preferences final :
     public Gtk::Dialog,
     public ProfileStoreListener
 {
diff --git a/rtgui/previewhandler.h b/rtgui/previewhandler.h
index d9c91f6ad..4fd9a1e82 100644
--- a/rtgui/previewhandler.h
+++ b/rtgui/previewhandler.h
@@ -44,7 +44,7 @@ struct PreviewHandlerIdleHelper {
     int pending;
 };
 
-class PreviewHandler : public rtengine::PreviewImageListener, public rtengine::NonCopyable
+class PreviewHandler final : public rtengine::PreviewImageListener, public rtengine::NonCopyable
 {
 private:
     friend int setImageUI   (void* data);
diff --git a/rtgui/profilepanel.h b/rtgui/profilepanel.h
index b3c968682..c3a125c49 100644
--- a/rtgui/profilepanel.h
+++ b/rtgui/profilepanel.h
@@ -49,7 +49,7 @@ class PartialProfile;
 }
 class RTImage;
 
-class ProfilePanel :
+class ProfilePanel final :
     public Gtk::Grid,
     public PParamsChangeListener,
     public ProfileStoreListener,
diff --git a/rtgui/progressconnector.h b/rtgui/progressconnector.h
index f4d1d8f7e..90c9f67f0 100644
--- a/rtgui/progressconnector.h
+++ b/rtgui/progressconnector.h
@@ -28,7 +28,7 @@
 
 #undef THREAD_PRIORITY_NORMAL
 
-class PLDBridge :
+class PLDBridge final :
     public rtengine::ProgressListener
 {
 public:
diff --git a/rtgui/renamedlg.h b/rtgui/renamedlg.h
index f9447fbac..3c92965f8 100644
--- a/rtgui/renamedlg.h
+++ b/rtgui/renamedlg.h
@@ -26,7 +26,7 @@
 
 class CacheImageData;
 
-class RenameDialog :
+class RenameDialog final :
     public Gtk::Dialog
 {
 
diff --git a/rtgui/rtimage.h b/rtgui/rtimage.h
index b68becd82..eb1930d28 100644
--- a/rtgui/rtimage.h
+++ b/rtgui/rtimage.h
@@ -25,7 +25,7 @@
 /**
  * @brief A derived class of Gtk::Image in order to handle theme-related icon sets.
  */
-class RTImage : public Gtk::Image, public RTScalable
+class RTImage final : public Gtk::Image, public RTScalable
 {
     static double dpiBack; // used to keep track of master dpi change
     static int scaleBack;  // used to keep track of master scale change
diff --git a/rtgui/rtwindow.h b/rtgui/rtwindow.h
index 4791ac6c1..c493c2db4 100644
--- a/rtgui/rtwindow.h
+++ b/rtgui/rtwindow.h
@@ -35,7 +35,7 @@ class BatchQueuePanel;
 class EditorPanel;
 class FilePanel;
 class PLDBridge;
-class RTWindow :
+class RTWindow final :
     public Gtk::Window,
     public rtengine::ProgressListener,
     public rtengine::NonCopyable
diff --git a/rtgui/saveasdlg.h b/rtgui/saveasdlg.h
index 448b37fd7..445452a54 100644
--- a/rtgui/saveasdlg.h
+++ b/rtgui/saveasdlg.h
@@ -22,7 +22,7 @@
 
 #include "saveformatpanel.h"
 
-class SaveAsDialog :
+class SaveAsDialog final :
     public Gtk::Dialog,
     public FormatChangeListener
 {
diff --git a/rtgui/shcselector.h b/rtgui/shcselector.h
index 5c4421e0a..e03ca8e06 100644
--- a/rtgui/shcselector.h
+++ b/rtgui/shcselector.h
@@ -29,7 +29,7 @@ public:
     virtual void shcChanged() = 0;
 };
 
-class SHCSelector : public Gtk::DrawingArea, BackBuffer
+class SHCSelector final : public Gtk::DrawingArea, BackBuffer
 {
 
 protected:
diff --git a/rtgui/splash.h b/rtgui/splash.h
index 363c51489..bc63cef91 100644
--- a/rtgui/splash.h
+++ b/rtgui/splash.h
@@ -20,7 +20,7 @@
 
 #include <gtkmm.h>
 
-class SplashImage :
+class SplashImage final :
     public Gtk::DrawingArea
 {
 
@@ -39,7 +39,7 @@ public:
 };
 
 //class Splash : public Gtk::Window {
-class Splash : public Gtk::Dialog
+class Splash final : public Gtk::Dialog
 {
 
 private:
diff --git a/rtgui/thresholdselector.h b/rtgui/thresholdselector.h
index f948b56ad..886b82fbd 100644
--- a/rtgui/thresholdselector.h
+++ b/rtgui/thresholdselector.h
@@ -113,10 +113,10 @@ protected:
     void updateBackBuffer();
 
     Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const override;
-    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override;
-    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override;
-    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
+    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const override final;
+    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override final;
+    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override final;
+    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override final;
     void on_realize () override;
     bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
     bool on_button_press_event (GdkEventButton* event) override;
diff --git a/rtgui/thumbbrowserbase.h b/rtgui/thumbbrowserbase.h
index b4caac0a9..e1f24594c 100644
--- a/rtgui/thumbbrowserbase.h
+++ b/rtgui/thumbbrowserbase.h
@@ -60,10 +60,10 @@ class ThumbBrowserBase :
         bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
 
         Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const override;
-        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override;
-        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override;
-        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
+        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const override final;
+        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override final;
+        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override final;
+        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override final;
 
         bool on_button_press_event (GdkEventButton* event) override;
         bool on_button_release_event (GdkEventButton* event) override;
diff --git a/rtgui/toolpanel.h b/rtgui/toolpanel.h
index aecf1f39f..069654128 100644
--- a/rtgui/toolpanel.h
+++ b/rtgui/toolpanel.h
@@ -143,7 +143,7 @@ public:
         this->batchMode = batchMode;
     }
 
-    virtual Glib::ustring getToolName () {
+    Glib::ustring getToolName () {
         return toolName;
     }
 };
@@ -164,11 +164,11 @@ public:
 
     FoldableToolPanel(Gtk::Box* content, Glib::ustring toolName, Glib::ustring UILabel, bool need11 = false, bool useEnabled = false);
 
-    MyExpander* getExpander() override
+    MyExpander* getExpander() override final
     {
         return exp;
     }
-    void setExpanded (bool expanded) override
+    void setExpanded (bool expanded) override final
     {
         if (exp) {
             exp->set_expanded( expanded );
@@ -186,7 +186,7 @@ public:
             exp->show();
         }
     }
-    bool getExpanded () override
+    bool getExpanded () override final
     {
         if (exp) {
             return exp->get_expanded();
@@ -194,11 +194,11 @@ public:
 
         return false;
     }
-    void setParent (Gtk::Box* parent) override
+    void setParent (Gtk::Box* parent) override final
     {
         parentContainer = parent;
     }
-    Gtk::Box* getParent () override
+    Gtk::Box* getParent () override final
     {
         return parentContainer;
     }
diff --git a/rtgui/toolpanelcoord.h b/rtgui/toolpanelcoord.h
index 4313f6d12..17f14673d 100644
--- a/rtgui/toolpanelcoord.h
+++ b/rtgui/toolpanelcoord.h
@@ -318,7 +318,7 @@ public:
     int getSpotWBRectSize() const override;
     void cropSelectionReady() override;
     void rotateSelectionReady(double rotate_deg, Thumbnail* thm = nullptr) override;
-    ToolBar* getToolBar() const override;
+    ToolBar* getToolBar() const override final;
     CropGUIListener* startCropEditing(Thumbnail* thm = nullptr) override;
 
     void updateTPVScrollbar (bool hide);
@@ -326,7 +326,7 @@ public:
 
     // ToolBarListener interface
     void toolSelected (ToolMode tool) override;
-    void editModeSwitchedOff () override;
+    void editModeSwitchedOff () override final;
 
     void setEditProvider (EditDataProvider *provider);
 

From 0d2ffb5e611629872a7a985f466e9a0cda6a7f35 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 24 Jan 2020 14:53:41 +0100
Subject: [PATCH 6/8] more final fixes

---
 rtengine/camconst.h       |  4 ++--
 rtengine/cieimage.h       |  2 +-
 rtengine/dcrop.h          |  2 +-
 rtengine/dfmanager.h      |  4 ++--
 rtengine/ffmanager.h      |  4 ++--
 rtengine/iccstore.h       |  4 ++--
 rtengine/iimage.h         | 20 ++++++++++----------
 rtengine/imagesource.h    | 10 +++++-----
 rtengine/labimage.h       |  2 +-
 rtgui/thresholdselector.h |  8 ++++----
 rtgui/thumbbrowserbase.h  |  8 ++++----
 rtgui/toolpanel.h         | 10 +++++-----
 rtgui/toolpanelcoord.h    |  6 +++---
 13 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index 1096e1767..c014a3322 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -14,7 +14,7 @@ struct camera_const_levels {
     int levels[4];
 };
 
-class CameraConst
+class CameraConst final
 {
 private:
     std::string make_model;
@@ -56,7 +56,7 @@ public:
     void update_globalGreenEquilibration(bool other);
 };
 
-class CameraConstantsStore
+class CameraConstantsStore final
 {
 private:
     std::map<std::string, CameraConst *> mCameraConstants;
diff --git a/rtengine/cieimage.h b/rtengine/cieimage.h
index fea675cd4..329c0ba74 100644
--- a/rtengine/cieimage.h
+++ b/rtengine/cieimage.h
@@ -23,7 +23,7 @@
 namespace rtengine
 {
 
-class CieImage :
+class CieImage final :
     public NonCopyable
 {
 private:
diff --git a/rtengine/dcrop.h b/rtengine/dcrop.h
index c65c1e72f..8feeb4f90 100644
--- a/rtengine/dcrop.h
+++ b/rtengine/dcrop.h
@@ -35,7 +35,7 @@ using namespace procparams;
 
 class ImProcCoordinator;
 
-class Crop : public DetailedCrop, public PipetteBuffer
+class Crop final : public DetailedCrop, public PipetteBuffer
 {
 
 protected:
diff --git a/rtengine/dfmanager.h b/rtengine/dfmanager.h
index 216dcfc53..b23981ffb 100644
--- a/rtengine/dfmanager.h
+++ b/rtengine/dfmanager.h
@@ -31,7 +31,7 @@ namespace rtengine
 {
 
 class RawImage;
-class dfInfo
+class dfInfo final
 {
 public:
 
@@ -74,7 +74,7 @@ protected:
     void updateRawImage();
 };
 
-class DFManager
+class DFManager final
 {
 public:
     void init(const Glib::ustring &pathname);
diff --git a/rtengine/ffmanager.h b/rtengine/ffmanager.h
index 80ef5fa1c..f3303f12b 100644
--- a/rtengine/ffmanager.h
+++ b/rtengine/ffmanager.h
@@ -29,7 +29,7 @@ namespace rtengine
 {
 
 class RawImage;
-class ffInfo
+class ffInfo final
 {
 public:
 
@@ -71,7 +71,7 @@ protected:
     void updateRawImage();
 };
 
-class FFManager
+class FFManager final
 {
 public:
     void init(const Glib::ustring &pathname);
diff --git a/rtengine/iccstore.h b/rtengine/iccstore.h
index 731a155bb..117124122 100644
--- a/rtengine/iccstore.h
+++ b/rtengine/iccstore.h
@@ -39,7 +39,7 @@ namespace procparams
 
 typedef const double(*TMatrix)[3];
 
-class ProfileContent
+class ProfileContent final
 {
 public:
     ProfileContent();
@@ -54,7 +54,7 @@ private:
     std::string data;
 };
 
-class ICCStore
+class ICCStore final
 {
 public:
     enum class ProfileType {
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index f44c01320..c4e6b7d14 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -658,7 +658,7 @@ public:
 
     /* If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image */
-    void allocate (int W, int H) override final
+    void allocate (int W, int H) final
     {
 
         if (W == width && H == height) {
@@ -746,7 +746,7 @@ public:
         }
     }
 
-    void rotate (int deg) override final
+    void rotate (int deg) final
     {
 
         if (deg == 90) {
@@ -873,7 +873,7 @@ public:
         }
     }
 
-    void hflip () override final
+    void hflip () final
     {
         int width2 = width / 2;
 
@@ -905,7 +905,7 @@ public:
 #endif
     }
 
-    void vflip () override final
+    void vflip () final
     {
 
         int height2 = height / 2;
@@ -989,7 +989,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override final
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
@@ -1328,7 +1328,7 @@ public:
      * If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image or to free up it's memory with "allocate (0,0);"
      */
-    void allocate (int W, int H) override final
+    void allocate (int W, int H) final
     {
 
         if (W == width && H == height) {
@@ -1382,7 +1382,7 @@ public:
         memcpy (dest->data, data, 3 * width * height * sizeof(T));
     }
 
-    void rotate (int deg) override final
+    void rotate (int deg) final
     {
 
         if (deg == 90) {
@@ -1516,7 +1516,7 @@ public:
         }
     }
 
-    void hflip () override final
+    void hflip () final
     {
         int width2 = width / 2;
 
@@ -1552,7 +1552,7 @@ public:
         }
     }
 
-    void vflip () override final
+    void vflip () final
     {
 
         AlignedBuffer<T> lBuffer(3 * width);
@@ -1619,7 +1619,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override final
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index d1fa5896e..75b160e10 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -143,11 +143,11 @@ public:
 
     virtual void        setProgressListener (ProgressListener* pl) {}
 
-    void        increaseRef () override final
+    void        increaseRef () final
     {
         references++;
     }
-    void        decreaseRef () override final
+    void        decreaseRef () final
     {
         references--;
 
@@ -175,15 +175,15 @@ public:
         return dirpyrdenoiseExpComp;
     }
     // functions inherited from the InitialImage interface
-    Glib::ustring getFileName () override final
+    Glib::ustring getFileName () final
     {
         return fileName;
     }
-    cmsHPROFILE getEmbeddedProfile () override final
+    cmsHPROFILE getEmbeddedProfile () final
     {
         return embProfile;
     }
-    const FramesMetaData* getMetaData () override final
+    const FramesMetaData* getMetaData () final
     {
         return idata;
     }
diff --git a/rtengine/labimage.h b/rtengine/labimage.h
index 79f003b07..7140d9de0 100644
--- a/rtengine/labimage.h
+++ b/rtengine/labimage.h
@@ -23,7 +23,7 @@
 namespace rtengine
 {
 
-class LabImage
+class LabImage final
 {
 private:
     void allocLab(size_t w, size_t h);
diff --git a/rtgui/thresholdselector.h b/rtgui/thresholdselector.h
index 886b82fbd..4ae86560e 100644
--- a/rtgui/thresholdselector.h
+++ b/rtgui/thresholdselector.h
@@ -113,10 +113,10 @@ protected:
     void updateBackBuffer();
 
     Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const override final;
-    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override final;
-    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override final;
-    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override final;
+    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const final;
+    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const final;
+    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const final;
+    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const final;
     void on_realize () override;
     bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
     bool on_button_press_event (GdkEventButton* event) override;
diff --git a/rtgui/thumbbrowserbase.h b/rtgui/thumbbrowserbase.h
index e1f24594c..69017b183 100644
--- a/rtgui/thumbbrowserbase.h
+++ b/rtgui/thumbbrowserbase.h
@@ -60,10 +60,10 @@ class ThumbBrowserBase :
         bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
 
         Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const override final;
-        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override final;
-        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override final;
-        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override final;
+        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const final;
+        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const final;
+        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const final;
+        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const final;
 
         bool on_button_press_event (GdkEventButton* event) override;
         bool on_button_release_event (GdkEventButton* event) override;
diff --git a/rtgui/toolpanel.h b/rtgui/toolpanel.h
index 069654128..12b3eebcc 100644
--- a/rtgui/toolpanel.h
+++ b/rtgui/toolpanel.h
@@ -164,11 +164,11 @@ public:
 
     FoldableToolPanel(Gtk::Box* content, Glib::ustring toolName, Glib::ustring UILabel, bool need11 = false, bool useEnabled = false);
 
-    MyExpander* getExpander() override final
+    MyExpander* getExpander() final
     {
         return exp;
     }
-    void setExpanded (bool expanded) override final
+    void setExpanded (bool expanded) final
     {
         if (exp) {
             exp->set_expanded( expanded );
@@ -186,7 +186,7 @@ public:
             exp->show();
         }
     }
-    bool getExpanded () override final
+    bool getExpanded () final
     {
         if (exp) {
             return exp->get_expanded();
@@ -194,11 +194,11 @@ public:
 
         return false;
     }
-    void setParent (Gtk::Box* parent) override final
+    void setParent (Gtk::Box* parent) final
     {
         parentContainer = parent;
     }
-    Gtk::Box* getParent () override final
+    Gtk::Box* getParent () final
     {
         return parentContainer;
     }
diff --git a/rtgui/toolpanelcoord.h b/rtgui/toolpanelcoord.h
index 17f14673d..0fc1a9070 100644
--- a/rtgui/toolpanelcoord.h
+++ b/rtgui/toolpanelcoord.h
@@ -314,11 +314,11 @@ public:
 
     // imageareatoollistener interface
     void spotWBselected(int x, int y, Thumbnail* thm = nullptr) override;
-    void sharpMaskSelected(bool sharpMask) override;
+    void sharpMaskSelected(bool sharpMask) final;
     int getSpotWBRectSize() const override;
     void cropSelectionReady() override;
     void rotateSelectionReady(double rotate_deg, Thumbnail* thm = nullptr) override;
-    ToolBar* getToolBar() const override final;
+    ToolBar* getToolBar() const final;
     CropGUIListener* startCropEditing(Thumbnail* thm = nullptr) override;
 
     void updateTPVScrollbar (bool hide);
@@ -326,7 +326,7 @@ public:
 
     // ToolBarListener interface
     void toolSelected (ToolMode tool) override;
-    void editModeSwitchedOff () override final;
+    void editModeSwitchedOff () final;
 
     void setEditProvider (EditDataProvider *provider);
 

From 1150a4352f318ff80b94bad06291489f1846d9d0 Mon Sep 17 00:00:00 2001
From: Benitoite <kd6kxr@gmail.com>
Date: Fri, 7 Feb 2020 00:47:24 -0800
Subject: [PATCH 7/8] Re-enable a private LENSFUNDBDIR on Win32 and UNIX

Restores a few lines in CMakeLists.txt that commit 2cc8918 removed for non-APPLE systems.
---
 CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c60fa37c..6d51503d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,6 +252,12 @@ if(NOT DEFINED APPDATADIR)
     endif()
 endif()
 
+if(NOT APPLE)
+    if(DEFINED LENSFUNDBDIR AND NOT IS_ABSOLUTE "${LENSFUNDBDIR}")
+        set(LENSFUNDBDIR "${DATADIR}/${LENSFUNDBDIR}") # anchor a relative LENSFUNDBDIR under DATADIR
+    endif()
+endif()
+
 if(APPLE)
     if("${CODESIGNID}")
         set(CODESIGNID "${CODESIGNID}" CACHE STRING "Codesigning Identity")

From 4221fdf31e43658cef98ad660ad398b8ae81b8d7 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 7 Feb 2020 13:52:25 +0100
Subject: [PATCH 8/8] fix regression in Canon sRAW/mRAW black level
 computation, taken from ART, fixes #5638

---
 rtengine/dcraw.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index 812f122b3..0284b3f1d 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -9868,6 +9868,7 @@ void CLASS identify()
     filters = 0;
     tiff_samples = colors = 3;
     load_raw = &CLASS canon_sraw_load_raw;
+    FORC4 cblack[c] = 0; // ALB: zero cblack[] for Canon sRAW/mRAW (black level fix from ART, #5638)
   } else if (!strcmp(model,"PowerShot 600")) {
     height = 613;
     width  = 854;