From d82ea3af02021f5292c5a49acfb92942b82a25e3 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 17 Jan 2020 23:18:48 +0100
Subject: [PATCH 1/4] reduce some include dependencies

---
 rtengine/camconst.cc       |  4 ++--
 rtengine/camconst.h        | 12 +++++++++---
 rtengine/color.h           |  8 +++++++-
 rtengine/curves.cc         |  5 ++---
 rtengine/curves.h          | 12 ++++++++----
 rtengine/diagonalcurves.cc |  2 --
 rtengine/iimage.h          |  8 +++++++-
 rtengine/imagedata.h       |  8 +++++++-
 rtengine/improcfun.h       |  9 ++++++++-
 rtengine/previewimage.h    |  8 +++++++-
 rtengine/rtthumbnail.h     |  9 +++++++--
 rtgui/previewloader.h      |  9 +++++++--
 12 files changed, 71 insertions(+), 23 deletions(-)
diff --git a/rtengine/camconst.cc b/rtengine/camconst.cc
index d136d6a21..c5cfc26fa 100644
--- a/rtengine/camconst.cc
+++ b/rtengine/camconst.cc
@@ -638,7 +638,7 @@ CameraConst::update_globalGreenEquilibration(bool other)
 }
 
 bool
-CameraConstantsStore::parse_camera_constants_file(Glib::ustring filename_)
+CameraConstantsStore::parse_camera_constants_file(const Glib::ustring& filename_)
 {
     // read the file into a single long string
     const char *filename = filename_.c_str();
@@ -809,7 +809,7 @@ CameraConstantsStore::~CameraConstantsStore()
     }
 }
 
-void CameraConstantsStore::init(Glib::ustring baseDir, Glib::ustring userSettingsDir)
+void CameraConstantsStore::init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir)
 {
     parse_camera_constants_file(Glib::build_filename(baseDir, "camconst.json"));
 
diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index 1096e1767..fbc16cb13 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -3,10 +3,16 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 #include <map>
 #include <vector>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
@@ -62,11 +68,11 @@ private:
     std::map<std::string, CameraConst *> mCameraConstants;
 
     CameraConstantsStore();
-    bool parse_camera_constants_file(Glib::ustring filename);
+    bool parse_camera_constants_file(const Glib::ustring& filename);
 
 public:
     ~CameraConstantsStore();
-    void init(Glib::ustring baseDir, Glib::ustring userSettingsDir);
+    void init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir);
     static CameraConstantsStore *getInstance(void);
     CameraConst *get(const char make[], const char model[]);
 };
diff --git a/rtengine/color.h b/rtengine/color.h
index 211615de1..8e0015c42 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <array>
-#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 #include "LUT.h"
@@ -30,6 +29,13 @@
 
 #define SAT(a,b,c) ((float)max(a,b,c)-(float)min(a,b,c))/(float)max(a,b,c)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/curves.cc b/rtengine/curves.cc
index b2105a091..35a38d3b8 100644
--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@@ -21,8 +21,7 @@
 #include <memory>
 #include <cmath>
 #include <cstring>
-#include <glib.h>
-#include <glib/gstdio.h>
+#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 
@@ -2186,7 +2185,7 @@ void PerceptualToneCurve::init()
     }
 }
 
-void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const
+void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, const Glib::ustring &workingSpace) const
 {
 
     // Get the curve's contrast value, and convert to a chroma scaling
diff --git a/rtengine/curves.h b/rtengine/curves.h
index bc8193b76..27f5a8adc 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -22,12 +22,9 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
-
 #include "rt_math.h"
 #include "flatcurvetypes.h"
 #include "diagonalcurvetypes.h"
-#include "pipettebuffer.h"
 #include "noncopyable.h"
 #include "LUT.h"
 #include "sleef.h"
@@ -37,6 +34,13 @@
 
 #define CLIPI(a) ((a)>0?((a)<65534?(a):65534):0)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 using namespace std;
 
 namespace rtengine
@@ -940,7 +944,7 @@ private:
     float calculateToneCurveContrastValue() const;
 public:
     static void init();
-    void initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const;
+    void initApplyState(PerceptualToneCurveState & state, const Glib::ustring& workingSpace) const;
     void BatchApply(const size_t start, const size_t end, float *r, float *g, float *b, const PerceptualToneCurveState &state) const;
 };
 
diff --git a/rtengine/diagonalcurves.cc b/rtengine/diagonalcurves.cc
index bb20b7cc1..e81f2fe92 100644
--- a/rtengine/diagonalcurves.cc
+++ b/rtengine/diagonalcurves.cc
@@ -16,8 +16,6 @@
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
  */
-#include <glib.h>
-#include <glib/gstdio.h>
 #include "curves.h"
 #include <cmath>
 #include <vector>
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index 7309dd91f..c2e9dfd3b 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -20,7 +20,6 @@
 
 #include <vector>
 
-#include <glibmm/ustring.h>
 #include <lcms2.h>
 
 #include "alignedbuffer.h"
@@ -41,6 +40,13 @@
 
 #define CHECK_BOUNDS 0
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/imagedata.h b/rtengine/imagedata.h
index ff8ed4b86..5765d9aec 100644
--- a/rtengine/imagedata.h
+++ b/rtengine/imagedata.h
@@ -23,12 +23,18 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
 
 #include <libiptcdata/iptc-data.h>
 
 #include "imageio.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtexif
 {
 
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 8f4ae7771..6538bd6b1 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -19,11 +19,17 @@
 #pragma once
 
 #include <memory>
+#include <vector>
 
 #include "coord2d.h"
 #include "gamutwarning.h"
-#include "pipettebuffer.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
 template<typename T>
 class LUT;
 
@@ -44,6 +50,7 @@ class FramesMetaData;
 class LensCorrection;
 class NoiseCurve;
 class OpacityCurve;
+class PipetteBuffer;
 class ToneCurve;
 class WavCurve;
 class WavOpacityCurveBY;
diff --git a/rtengine/previewimage.h b/rtengine/previewimage.h
index e6c3ea070..2143509a3 100644
--- a/rtengine/previewimage.h
+++ b/rtengine/previewimage.h
@@ -18,10 +18,16 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 
 #include <cairomm/cairomm.h>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/rtthumbnail.h b/rtengine/rtthumbnail.h
index dcc9596f6..c8d657a62 100644
--- a/rtengine/rtthumbnail.h
+++ b/rtengine/rtthumbnail.h
@@ -18,8 +18,6 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
-
 #include <lcms2.h>
 
 #include "image16.h"
@@ -30,6 +28,13 @@
 
 #include "../rtgui/threadutils.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtgui/previewloader.h b/rtgui/previewloader.h
index 9a74ee2eb..a8032fcaf 100644
--- a/rtgui/previewloader.h
+++ b/rtgui/previewloader.h
@@ -20,10 +20,15 @@
 
 #include <set>
 
-#include <glibmm/ustring.h>
-
 #include "../rtengine/noncopyable.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 class FileBrowserEntry;
 
 class PreviewLoaderListener

From 54d8efc5f62724a1d07b543090f98401c04b5499 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 17 Jan 2020 23:56:17 +0100
Subject: [PATCH 2/4] reduce include dependecies

---
 rtengine/guidedfilter.cc | 1 +
 rtengine/imagesource.h   | 1 -
 rtengine/ipdehaze.cc     | 1 +
 rtengine/rescale.h       | 5 ++++-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index ad3beec51..f83560cfc 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -29,6 +29,7 @@
  * available at https://arxiv.org/abs/1505.00996
 */
 
+#include "array2D.h"
 #include "boxblur.h"
 #include "guidedfilter.h"
 #include "imagefloat.h"
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index e0c26aa9f..2ef2554bc 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -25,7 +25,6 @@
 
 #include "coord2d.h"
 #include "imagedata.h"
-#include "LUT.h"
 #include "rtengine.h"
 
 template<typename T>
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 28a0f2d57..38e35c612 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -32,6 +32,7 @@
 #include <iostream>
 #include <vector>
 
+#include "array2D.h"
 #include "color.h"
 #include "guidedfilter.h"
 #include "iccstore.h"
diff --git a/rtengine/rescale.h b/rtengine/rescale.h
index 70974aa48..2138cd8e8 100644
--- a/rtengine/rescale.h
+++ b/rtengine/rescale.h
@@ -20,9 +20,12 @@
 
 #pragma once
 
-#include "array2D.h"
 #include "rt_math.h"
 
+template<typename T>
+class array2D;
+
+
 namespace rtengine
 {
 

From c3a86befaaa8eab8aebbeddb46f3b92492f6f3d6 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 18 Jan 2020 11:57:31 +0100
Subject: [PATCH 3/4] Fix broken build

---
 rtengine/camconst.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index fbc16cb13..0c0618671 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <map>
+#include <string>
 #include <vector>
 
 namespace Glib

From bf301b7e4099337d1bbd37c8a02dc72876ecaa1f Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 18 Jan 2020 23:46:48 +0100
Subject: [PATCH 4/4] reduce include depenencies

---
 rtengine/curves.h            |    2 -
 rtengine/dcrop.cc            |    1 +
 rtengine/dcrop.h             |    5 +-
 rtengine/improccoordinator.h |    1 -
 rtengine/improcfun.cc        |    3 +-
 rtengine/opthelper.h         |   39 +-
 rtengine/rtengine.h          |    1 -
 rtengine/sleefsseavx.c       | 1483 --------------------------------
 rtengine/sleefsseavx.h       | 1558 +++++++++++++++++++++++++++++++---
 rtengine/utils.cc            |    1 -
 rtgui/cropwindow.cc          |    1 +
 rtgui/curveeditor.h          |    3 +-
 rtgui/guiutils.cc            |    1 -
 13 files changed, 1473 insertions(+), 1626 deletions(-)
 delete mode 100644 rtengine/sleefsseavx.c

diff --git a/rtengine/curves.h b/rtengine/curves.h
index 27f5a8adc..d7c35b619 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -30,8 +30,6 @@
 #include "sleef.h"
 #define CURVES_MIN_POLY_POINTS  1000
 
-#include "rt_math.h"
-
 #define CLIPI(a) ((a)>0?((a)<65534?(a):65534):0)
 
 namespace Glib
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 890003ee0..77392a552 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -24,6 +24,7 @@
 #include "dcrop.h"
 #include "image8.h"
 #include "imagefloat.h"
+#include "improccoordinator.h"
 #include "labimage.h"
 #include "mytime.h"
 #include "procparams.h"
diff --git a/rtengine/dcrop.h b/rtengine/dcrop.h
index c65c1e72f..6667800f9 100644
--- a/rtengine/dcrop.h
+++ b/rtengine/dcrop.h
@@ -18,11 +18,7 @@
  */
 #pragma once
 
-#include "improccoordinator.h"
 #include "rtengine.h"
-#include "improcfun.h"
-#include "imagesource.h"
-#include "procevents.h"
 #include "pipettebuffer.h"
 #include "../rtgui/threadutils.h"
 
@@ -30,6 +26,7 @@ namespace rtengine
 {
 
 class Image8;
+class CieImage;
 
 using namespace procparams;
 
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 96d1f80ce..8b5b37625 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -27,7 +27,6 @@
 #include "imagesource.h"
 #include "improcfun.h"
 #include "LUT.h"
-#include "procevents.h"
 #include "rtengine.h"
 
 #include "../rtgui/threadutils.h"
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index d5c69773e..6749ff305 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -31,15 +31,16 @@
 #include "cieimage.h"
 #include "clutstore.h"
 #include "color.h"
+#include "colortemp.h"
 #include "curves.h"
 #include "dcp.h"
 #include "EdgePreservingDecomposition.h"
 #include "iccmatrices.h"
 #include "iccstore.h"
 #include "imagesource.h"
-#include "improccoordinator.h"
 #include "improcfun.h"
 #include "labimage.h"
+#include "pipettebuffer.h"
 #include "procparams.h"
 #include "rt_math.h"
 #include "rtengine.h"
diff --git a/rtengine/opthelper.h b/rtengine/opthelper.h
index b65ede227..f431c0ec9 100644
--- a/rtengine/opthelper.h
+++ b/rtengine/opthelper.h
@@ -18,27 +18,24 @@
 //  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 //
 ////////////////////////////////////////////////////////////////
+#pragma once
 
-#ifndef OPTHELPER_H
-    #define OPTHELPER_H
+#define pow_F(a,b) (xexpf(b*xlogf(a)))
 
-    #define pow_F(a,b) (xexpf(b*xlogf(a)))
-
-    #ifdef __SSE2__
-        #include "sleefsseavx.c"
-    #endif
-
-    #ifdef __GNUC__
-        #define RESTRICT    __restrict__
-        #define LIKELY(x)   __builtin_expect (!!(x), 1)
-        #define UNLIKELY(x) __builtin_expect (!!(x), 0)
-        #define ALIGNED64 __attribute__ ((aligned (64)))
-        #define ALIGNED16 __attribute__ ((aligned (16)))
-    #else
-        #define RESTRICT
-        #define LIKELY(x)    (x)
-        #define UNLIKELY(x)  (x)
-        #define ALIGNED64
-        #define ALIGNED16
-    #endif
+#ifdef __SSE2__
+    #include "sleefsseavx.h"
+#endif
+
+#ifdef __GNUC__
+    #define RESTRICT    __restrict__
+    #define LIKELY(x)   __builtin_expect (!!(x), 1)
+    #define UNLIKELY(x) __builtin_expect (!!(x), 0)
+    #define ALIGNED64 __attribute__ ((aligned (64)))
+    #define ALIGNED16 __attribute__ ((aligned (16)))
+#else
+    #define RESTRICT
+    #define LIKELY(x)    (x)
+    #define UNLIKELY(x)  (x)
+    #define ALIGNED64
+    #define ALIGNED16
 #endif
diff --git a/rtengine/rtengine.h b/rtengine/rtengine.h
index 0473622c4..ca663c5bc 100644
--- a/rtengine/rtengine.h
+++ b/rtengine/rtengine.h
@@ -31,7 +31,6 @@
 #include "imageformat.h"
 #include "procevents.h"
 #include "rawmetadatalocation.h"
-#include "rt_math.h"
 #include "settings.h"
 
 #include "../rtgui/threadutils.h"
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
deleted file mode 100644
index 0af516f9b..000000000
--- a/rtengine/sleefsseavx.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-////////////////////////////////////////////////////////////////
-//
-//  this code was taken from http://shibatch.sourceforge.net/
-//  Many thanks to the author of original version: Naoki Shibata
-//
-//  This version contains modifications made by Ingo Weyrich
-//
-////////////////////////////////////////////////////////////////
-
-
-#ifndef SLEEFSSEAVX
-#define SLEEFSSEAVX
-
-#include <assert.h>
-#include "rt_math.h"
-#ifdef __SSE2__
-#include "helpersse2.h"
-
-#ifdef ENABLE_AVX
-#include "helperavx.h"
-#endif
-
-#ifdef __GNUC__
-#define INLINE __inline
-#else
-#define INLINE inline
-#endif
-
-#define PI4_A .7853981554508209228515625
-#define PI4_B .794662735614792836713604629039764404296875e-8
-#define PI4_C .306161699786838294306516483068750264552437361480769e-16
-#define M_4_PI 1.273239544735162542821171882678754627704620361328125
-
-#define L2U .69314718055966295651160180568695068359375
-#define L2L .28235290563031577122588448175013436025525412068e-12
-#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
-
-#define PI4_Af 0.78515625f
-#define PI4_Bf 0.00024127960205078125f
-#define PI4_Cf 6.3329935073852539062e-07f
-#define PI4_Df 4.9604681473525147339e-10f
-
-#define L2Uf 0.693145751953125f
-#define L2Lf 1.428606765330187045e-06f
-#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
-
-#define INFINITYf ((float)rtengine::RT_INFINITY)
-#define NANf ((float)rtengine::RT_NAN)
-
-static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
-    return vadd(vadd(v0, v1), v2);
-}
-
-static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vadd3(vadd(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vadd4(vadd(v0, v1), v2, v3, v4);
-}
-
-static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
-    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
-}
-
-static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
-    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
-}
-
-static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
-    return vsub(vsub(v0, v1), v2);
-}
-
-static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vsub3(vsub(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vsub4(vsub(v0, v1), v2, v3, v4);
-}
-
-//
-
-static INLINE vdouble2 normalize_d(vdouble2 t) {
-    vdouble2 s;
-
-    s.x = vadd(t.x, t.y);
-    s.y = vadd(vsub(t.x, s.x), t.y);
-
-    return s;
-}
-
-static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
-    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
-    return r;
-}
-
-static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    r.y = vadd(vsub(x, r.x), y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    vdouble v = vsub(r.x, x);
-    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
-
-    return r;
-}
-
-static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    r.y = vadd3(vsub(x.x, r.x), y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
-    r.y = vadd(r.y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y.x);
-    r.y = vadd3(vsub(x, r.x), y.x, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
-    // |x| >= |y|
-
-    vdouble2 r;
-
-    r.x = vadd(x.x, y.x);
-    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x  = vadd(x.x, y.x);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
-    r.y = vadd(r.y, vadd(x.y, y.y));
-
-    return r;
-}
-
-static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
-    vdouble t = vrec(d.x);
-    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
-    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
-    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
-
-    vdouble2 q;
-
-    q.x = vmul(n.x, t);
-
-    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
-            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
-
-    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
-
-    return q;
-}
-
-static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
-    vdouble xh = vupper(x), xl = vsub(x, xh);
-    vdouble yh = vupper(y), yl = vsub(y, yh);
-    vdouble2 r;
-
-    r.x = vmul(x, y);
-    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y);
-    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y.x);
-    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
-
-    return r;
-}
-
-static INLINE vdouble2 squ_d(vdouble2 x) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, x.x);
-    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
-
-    return r;
-}
-
-static INLINE vdouble2 rec_s(vdouble d) {
-    vdouble t = vrec(d);
-    vdouble dh = vupper(d), dl = vsub(d, dh);
-    vdouble th = vupper(t), tl = vsub(t, th);
-    vdouble2 q;
-
-    q.x = t;
-    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
-
-    return q;
-}
-
-static INLINE vdouble2 sqrt_d(vdouble2 d) {
-    vdouble t = vsqrt(vadd(d.x, d.y));
-    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
-}
-
-//
-
-static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
-
-static INLINE vint xilogb(vdouble d) {
-    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
-    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
-    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
-    return vrint_vi_vd(e);
-}
-
-static INLINE vdouble xsin(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble xcos(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
-    q = vaddi(vaddi(q, q), vcast_vi_i(1));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble2 xsincos(vdouble d) {
-    vint q;
-    vmask m;
-    vdouble u, s, t, rx, ry;
-    vdouble2 r;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vd_vi(q);
-    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
-
-    t = s;
-
-    s = vmul(s, s);
-
-    u = vcast_vd_d(1.58938307283228937328511e-10);
-    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
-    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
-    u = vmul(vmul(u, s), t);
-
-    rx = vadd(t, u);
-
-    u = vcast_vd_d(-1.13615350239097429531523e-11);
-    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
-    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
-    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
-    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
-    u = vmla(u, s, vcast_vd_d(-0.5));
-
-    ry = vadd(vcast_vd_d(1), vmul(s, u));
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
-    r.x = vsel(m, rx, ry);
-    r.y = vsel(m, ry, rx);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
-    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
-
-    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
-    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
-
-    m = vmask_isinf(d);
-    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
-    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vdouble xtan(vdouble d) {
-    vint q;
-    vdouble u, s, x;
-    vmask m;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    u = vcast_vd_vi(q);
-    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(x, x);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
-    x = vsel(m, vneg(x), x);
-
-    u = vcast_vd_d(1.01419718511083373224408e-05);
-    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
-    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
-    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
-    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
-    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
-    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
-    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
-    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
-    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
-    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
-    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
-    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
-    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
-    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
-
-    u = vmla(s, vmul(u, x), x);
-
-    u = vsel(m, vrec(u), u);
-
-    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
-
-    return u;
-}
-
-static INLINE vdouble atan2k(vdouble y, vdouble x) {
-    vdouble s, t, u;
-    vint q;
-    vmask p;
-
-    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
-    x = vabs(x);
-
-    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
-    p = vmask_lt(x, y);
-    s = vsel (p, vneg(x), y);
-    t = vmax (x, y);
-
-    s = vdiv(s, t);
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
-
-    return t;
-}
-
-static INLINE vdouble xatan2(vdouble y, vdouble x) {
-    vdouble r = atan2k(vabs(y), x);
-
-    r = vmulsign(r, x);
-    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
-    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
-    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
-
-    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
-}
-
-static INLINE vdouble xasin(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
-    return vmulsign(x, d);
-}
-
-static INLINE vdouble xacos(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vmulsign(atan2k(x, vabs(d)), d);
-    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
-    x = vadd(x, y);
-    return x;
-}
-
-static INLINE vdouble xatan(vdouble s) {
-    vdouble t, u;
-    vint q;
-
-    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
-    s = vabs(s);
-
-    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
-    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
-
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
-
-    return t;
-}
-
-static INLINE vdouble xlog(vdouble d) {
-    vdouble x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
-    x2 = vmul(x, x);
-
-    t = vcast_vd_d(0.148197055177935105296783);
-    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
-    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
-    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
-    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
-    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
-    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
-    t = vmla(t, x2, vcast_vd_d(2));
-
-    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
-
-    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xexp(vdouble d) {
-    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
-    vdouble s, u;
-
-    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    u = vcast_vd_d(2.08860621107283687536341e-09);
-    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
-    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
-    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
-    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
-    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
-    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
-    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
-    u = vmla(u, s, vcast_vd_d(0.5));
-
-    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
-
-    u = vldexp(u, q);
-
-    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
-
-    return u;
-}
-
-static INLINE vdouble2 logk(vdouble d) {
-    vdouble2 x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble expk(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-    u = vadd(t.x, t.y);
-    u = vldexp(u, q);
-
-    return u;
-}
-
-static INLINE vdouble xpow(vdouble x, vdouble y) {
-#if 1
-    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
-    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
-
-    vdouble result = expk(mul_ds(logk(vabs(x)), y));
-
-    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
-
-    result = vmul(result,
-            vsel(vmask_gt(x, vcast_vd_d(0)),
-                vcast_vd_d(1),
-                vsel(yisint,
-                    vsel(yisodd,
-                        vcast_vd_d(-1),
-                        vcast_vd_d(1)),
-                    vcast_vd_d(rtengine::RT_NAN))));
-
-    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
-
-    result = vsel(vmask_isinf(y),
-            vsel(vmask_lt(efx, vcast_vd_d(0)),
-                vcast_vd_d(0),
-                vsel(vmask_eq(efx, vcast_vd_d(0)),
-                    vcast_vd_d(1.0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
-            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
-                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
-                    vcast_vd_d(0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
-
-    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
-
-    return result;
-#else
-    return expk(mul_ds(logk(x), y));
-#endif
-}
-
-static INLINE vdouble2 expk2(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-
-    return dd(vldexp(t.x, q), vldexp(t.y, q));
-}
-
-static INLINE vdouble xsinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
-    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcosh(vdouble x) {
-    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
-    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xtanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
-    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
-    y = d.x + d.y;
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble2 logk2(vdouble2 d) {
-    vdouble2 x, x2, m;
-    vdouble t;
-    vint e;
-
-    d = normalize_d(d);
-    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
-    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
-
-    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble xasinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
-    y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xacosh(vdouble x) {
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
-    vdouble y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
-    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xatanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
-    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcbrt(vdouble d) {
-    vdouble x, y, q = vcast_vd_d(1.0);
-    vint e, qu, re;
-    vdouble t;
-
-    e = vilogbp1(vabs(d));
-    d = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
-    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
-    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
-
-    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
-    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
-    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
-
-    q = vmulsign(q, d);
-
-    d = vabs(d);
-
-    x = vcast_vd_d(-0.640245898480692909870982);
-    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
-    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
-    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
-    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
-    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
-
-    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
-    y = vmul(vmul(d, x), x);
-    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
-
-    return y;
-}
-
-static INLINE vdouble xexp2(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexp10(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexpm1(vdouble a) {
-    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
-    vdouble x = d.x + d.y;
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
-    return x;
-}
-
-static INLINE vdouble xlog10(vdouble a) {
-    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xlog1p(vdouble a) {
-    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-//
-
-typedef struct {
-    vfloat x, y;
-} vfloat2;
-
-static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
-static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
-
-#ifdef __SSE4_1__
-// only one instruction when using SSE4.1
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return _mm_blendv_ps(y,x,(vfloat)mask);
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return _mm_blendv_epi8(y,x,mask);
-}
-
-#else
-// three instructions when using SSE2
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-#endif
-
-static INLINE vfloat vselfzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vself(mask, x, ZEROV)
-    return _mm_and_ps((vfloat)mask, x);
-}
-static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vself(mask, ZEROV, x)
-    return _mm_andnot_ps((vfloat)mask, x);
-}
-
-static INLINE vint vselizero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vselc(mask, x, ZEROV)
-    return _mm_and_si128(mask, x);
-}
-static INLINE vint vselinotzero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vselc(mask, ZEROV, x)
-    return _mm_andnot_si128(mask, x);
-}
-
-static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
-    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
-    return vori2(vandi2(m2, x), vandnoti2(m2, y));
-}
-
-static INLINE vmask vsignbitf(vfloat f) {
-    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
-}
-
-static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
-    return (vfloat)vxorm((vmask)x, vsignbitf(y));
-}
-
-static INLINE vfloat vsignf(vfloat f) {
-    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
-}
-
-static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
-static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
-// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
-static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
-static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
-static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
-
-static INLINE vint2 vilogbp1f(vfloat d) {
-    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
-    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
-    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
-    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
-    return q;
-}
-
-static INLINE vfloat vldexpf(vfloat x, vint2 q) {
-    vfloat u;
-    vint2 m = vsrai2(q, 31);
-    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
-    q = vsubi2(q, vslli2(m, 2));
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
-    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
-    return vmulf(x, u);
-}
-
-static INLINE vfloat xsinf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat xcosf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
-    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat2 xsincosf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, t, rx, ry;
-    vfloat2 r;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vf_vi2(q);
-    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
-
-    t = s;
-
-    s = vmulf(s, s);
-
-    u = vcast_vf_f(-0.000195169282960705459117889f);
-    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
-    u = vmulf(vmulf(u, s), t);
-
-    rx = vaddf(t, u);
-
-    u = vcast_vf_f(-2.71811842367242206819355e-07f);
-    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
-    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
-    u = vmlaf(u, s, vcast_vf_f(-0.5));
-
-    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
-    r.x = vself(m, rx, ry);
-    r.y = vself(m, ry, rx);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
-
-    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
-
-    m = vmaskf_isinf(d);
-    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
-    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vfloat xtanf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, x;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
-
-    x = d;
-
-    u = vcast_vf_vi2(q);
-    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
-
-    s = vmulf(x, x);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
-    x = vself(m, vnegf(x), x);
-
-    u = vcast_vf_f(0.00927245803177356719970703f);
-    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
-    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
-    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
-    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
-    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
-
-    u = vmlaf(s, vmulf(u, x), x);
-
-    u = vself(m, vrecf(u), u);
-
-    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
-
-    return u;
-}
-
-static INLINE vfloat xatanf(vfloat s) {
-    vfloat t, u;
-    vint2 q;
-
-    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
-    s = vabsf(s);
-
-    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
-    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
-
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
-
-    return t;
-}
-
-static INLINE vfloat atan2kf(vfloat y, vfloat x) {
-    vfloat s, t, u;
-    vint2 q;
-    vmask p;
-
-    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
-    x = vabsf(x);
-
-    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
-    p = vmaskf_lt(x, y);
-    s = vself(p, vnegf(x), y);
-    t = vmaxf(x, y);
-
-    s = vdivf(s, t);
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
-
-    return t;
-}
-
-static INLINE vfloat xatan2f(vfloat y, vfloat x) {
-    vfloat r = atan2kf(vabsf(y), x);
-
-    r = vmulsignf(r, x);
-    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
-    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
-    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
-
-    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
-}
-
-static INLINE vfloat xasinf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
-    return vmulsignf(x, d);
-}
-
-static INLINE vfloat xacosf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vmulsignf(atan2kf(x, vabsf(d)), d);
-    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
-    x = vaddf(x, y);
-    return x;
-}
-
-static INLINE vfloat xlogf(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf0(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-}
-
-static INLINE vfloat xexpf(vfloat d) {
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    u = vldexpf(u, q);
-
-    // -104.0
-    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
-}
-
-static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    return vldexpf(u, q);
-}
-
-static INLINE vfloat xcbrtf(vfloat d) {
-    vfloat x, y, q = vcast_vf_f(1.0), t;
-    vint2 e, qu, re;
-
-    e = vilogbp1f(vabsf(d));
-    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
-    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
-    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
-
-    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
-    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
-    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
-
-    q = vmulsignf(q, d);
-    d = vabsf(d);
-
-    x = vcast_vf_f(-0.601564466953277587890625f);
-    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
-    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
-    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
-    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
-    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
-
-    y = vmulf(vmulf(d, x), x);
-    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
-
-    return y;
-}
-
-static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
-    // clamps value in [low;high], returns low if value is NaN
-    return vmaxf(vminf(high, value), low);
-}
-
-static INLINE vfloat SQRV(vfloat a){
-    return a * a;
-}
-
-static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
-    // conditional swap the elements of two vfloats
-    vfloat temp = vself(condition, a, b); // the values which fit to condition
-    a = vself(condition, b, a); // the values which fit to inverted condition
-    b = temp;
-}
-
-static inline float vhadd( vfloat a ) {
-    // returns a[0] + a[1] + a[2] + a[3]
-    a += _mm_movehl_ps(a, a);
-    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmin(vfloat a) {
-    // returns min(a[0], a[1], a[2], a[3])
-    a = vminf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmax(vfloat a) {
-    // returns max(a[0], a[1], a[2], a[3])
-    a = vmaxf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static INLINE vfloat vmul2f(vfloat a){
-    // fastest way to multiply by 2
-    return a + a;
-}
-
-static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
-    // calculate a * b + (1 - a) * c (interpolate two values)
-    // following is valid:
-    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
-    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
-    return a * (b-c) + c;
-}
-
-static INLINE vfloat vdup(vfloat a){
-    // returns { a[0],a[0],a[1],a[1] }
-    return _mm_unpacklo_ps( a, a );
-}
-
-static INLINE vfloat vaddc2vfu(float &a)
-{
-    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
-    vfloat a1 = _mm_loadu_ps( &a );
-    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
-    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
-}
-
-static INLINE vfloat vadivapb (vfloat a, vfloat b) {
-    return a / (a+b);
-}
-
-static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
-    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
-    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
-    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
-    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
-}
-
-#if defined( __SSE4_1__ ) && defined( __x86_64__ )
-static INLINE vfloat vceilf(vfloat x) {
-    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
-}
-
-#else
-
-static INLINE vfloat vceilf(vfloat x) {
-    __m128i zerov = _mm_setzero_si128();
-    zerov = _mm_cmpeq_epi32(zerov, zerov);
-    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
-    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
-    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
-}
-#endif
-
-#endif // __SSE2__
-#endif // SLEEFSSEAVX
diff --git a/rtengine/sleefsseavx.h b/rtengine/sleefsseavx.h
index 8fe20c54b..c10f4a0c2 100644
--- a/rtengine/sleefsseavx.h
+++ b/rtengine/sleefsseavx.h
@@ -1,124 +1,897 @@
-#include <immintrin.h>
-#include <stdint.h>
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+#pragma once
 
+#include "rt_math.h"
 #ifdef __SSE2__
-#define VECTLENDP 2
-#define VECTLENSP 4
-
-typedef __m128d vdouble;
-typedef __m128i vint;
-
-typedef __m128 vfloat;
-typedef __m128i vint2;
-typedef __m128i vmask;
-
-static vdouble vloadu(double *p)
-{
-    return _mm_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    _mm_storeu_pd(p, v);
-}
-
-static vfloat vloaduf(float *p)
-{
-    return _mm_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    _mm_storeu_ps(p, v);
-}
-
-static vint2 vloadui2(int32_t *p)
-{
-    return (vint2)_mm_loadu_si128((__m128i *)p);
-}
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *)p, (__m128i)v);
-}
-#endif
+#include "helpersse2.h"
 
 #ifdef ENABLE_AVX
-#define VECTLENDP 4
-#define VECTLENSP 8
+#include "helperavx.h"
+#endif
 
-typedef __m256d vdouble;
-typedef __m128i vint;
+#ifdef __GNUC__
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
 
+#define PI4_A .7853981554508209228515625
+#define PI4_B .794662735614792836713604629039764404296875e-8
+#define PI4_C .306161699786838294306516483068750264552437361480769e-16
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125
 
-typedef __m256 vfloat;
-typedef struct {
-    vint x, y;
-} vint2;
+#define L2U .69314718055966295651160180568695068359375
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
 
-static vdouble vloadu(double *p)
-{
-    return _mm256_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    return _mm256_storeu_pd(p, v);
+#define PI4_Af 0.78515625f
+#define PI4_Bf 0.00024127960205078125f
+#define PI4_Cf 6.3329935073852539062e-07f
+#define PI4_Df 4.9604681473525147339e-10f
+
+#define L2Uf 0.693145751953125f
+#define L2Lf 1.428606765330187045e-06f
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
+
+#define INFINITYf ((float)rtengine::RT_INFINITY)
+#define NANf ((float)rtengine::RT_NAN)
+
+static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
+    return vadd(vadd(v0, v1), v2);
 }
 
-static vfloat vloaduf(float *p)
-{
-    return _mm256_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    return _mm256_storeu_ps(p, v);
+static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+    return vadd3(vadd(v0, v1), v2, v3);
 }
 
-static vint2 vloadui2(int32_t *p)
-{
-    vint2 r;
-    r.x = _mm_loadu_si128((__m128i *) p     );
-    r.y = _mm_loadu_si128((__m128i *)(p + 4));
+static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+    return vadd4(vadd(v0, v1), v2, v3, v4);
+}
+
+static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
+    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
+}
+
+static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
+    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
+}
+
+static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
+    return vsub(vsub(v0, v1), v2);
+}
+
+static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+    return vsub3(vsub(v0, v1), v2, v3);
+}
+
+static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+    return vsub4(vsub(v0, v1), v2, v3, v4);
+}
+
+//
+
+static INLINE vdouble2 normalize_d(vdouble2 t) {
+    vdouble2 s;
+
+    s.x = vadd(t.x, t.y);
+    s.y = vadd(vsub(t.x, s.x), t.y);
+
+    return s;
+}
+
+static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
+    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
     return r;
 }
 
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *) p     , v.x);
-    _mm_storeu_si128((__m128i *)(p + 4), v.y);
+static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    r.y = vadd(vsub(x, r.x), y);
+
+    return r;
 }
-#endif
 
-typedef struct {
+static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    vdouble v = vsub(r.x, x);
+    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
+
+    return r;
+}
+
+static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    r.y = vadd3(vsub(x.x, r.x), y, x.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
+    r.y = vadd(r.y, x.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y.x);
+    r.y = vadd3(vsub(x, r.x), y.x, y.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
+    // |x| >= |y|
+
+    vdouble2 r;
+
+    r.x = vadd(x.x, y.x);
+    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
+    vdouble2 r;
+
+    r.x  = vadd(x.x, y.x);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
+    r.y = vadd(r.y, vadd(x.y, y.y));
+
+    return r;
+}
+
+static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
+    vdouble t = vrec(d.x);
+    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
+    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
+    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
+
+    vdouble2 q;
+
+    q.x = vmul(n.x, t);
+
+    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
+            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
+
+    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
+
+    return q;
+}
+
+static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
+    vdouble xh = vupper(x), xl = vsub(x, xh);
+    vdouble yh = vupper(y), yl = vsub(y, yh);
+    vdouble2 r;
+
+    r.x = vmul(x, y);
+    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
+
+    return r;
+}
+
+static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y);
+    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
+
+    return r;
+}
+
+static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y.x);
+    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
+
+    return r;
+}
+
+static INLINE vdouble2 squ_d(vdouble2 x) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, x.x);
+    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
+
+    return r;
+}
+
+static INLINE vdouble2 rec_s(vdouble d) {
+    vdouble t = vrec(d);
+    vdouble dh = vupper(d), dl = vsub(d, dh);
+    vdouble th = vupper(t), tl = vsub(t, th);
+    vdouble2 q;
+
+    q.x = t;
+    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
+
+    return q;
+}
+
+static INLINE vdouble2 sqrt_d(vdouble2 d) {
+    vdouble t = vsqrt(vadd(d.x, d.y));
+    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
+}
+
+//
+
+static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
+
+static INLINE vint xilogb(vdouble d) {
+    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
+    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
+    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
+    return vrint_vi_vd(e);
+}
+
+static INLINE vdouble xsin(vdouble d) {
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble xcos(vdouble d) {
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
+    q = vaddi(vaddi(q, q), vcast_vi_i(1));
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble2 xsincos(vdouble d) {
+    vint q;
+    vmask m;
+    vdouble u, s, t, rx, ry;
+    vdouble2 r;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
+
+    s = d;
+
+    u = vcast_vd_vi(q);
+    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
+    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
+    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
+
+    t = s;
+
+    s = vmul(s, s);
+
+    u = vcast_vd_d(1.58938307283228937328511e-10);
+    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
+    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
+    u = vmul(vmul(u, s), t);
+
+    rx = vadd(t, u);
+
+    u = vcast_vd_d(-1.13615350239097429531523e-11);
+    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
+    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
+    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
+    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
+    u = vmla(u, s, vcast_vd_d(-0.5));
+
+    ry = vadd(vcast_vd_d(1), vmul(s, u));
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
+    r.x = vsel(m, rx, ry);
+    r.y = vsel(m, ry, rx);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
+    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
+    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+    m = vmask_isinf(d);
+    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
+    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vdouble xtan(vdouble d) {
+    vint q;
+    vdouble u, s, x;
+    vmask m;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
+
+    u = vcast_vd_vi(q);
+    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(x, x);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
+    x = vsel(m, vneg(x), x);
+
+    u = vcast_vd_d(1.01419718511083373224408e-05);
+    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
+    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
+    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
+    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
+    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
+    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
+    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
+    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
+    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
+    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
+    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
+    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
+    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
+    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
+
+    u = vmla(s, vmul(u, x), x);
+
+    u = vsel(m, vrec(u), u);
+
+    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
+
+    return u;
+}
+
+static INLINE vdouble atan2k(vdouble y, vdouble x) {
+    vdouble s, t, u;
+    vint q;
+    vmask p;
+
+    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
+    x = vabs(x);
+
+    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
+    p = vmask_lt(x, y);
+    s = vsel (p, vneg(x), y);
+    t = vmax (x, y);
+
+    s = vdiv(s, t);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
+
+    return t;
+}
+
+static INLINE vdouble xatan2(vdouble y, vdouble x) {
+    vdouble r = atan2k(vabs(y), x);
+
+    r = vmulsign(r, x);
+    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
+    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
+    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
+
+    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
+}
+
+static INLINE vdouble xasin(vdouble d) {
     vdouble x, y;
-} vdouble2;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
+    return vmulsign(x, d);
+}
 
-vdouble xldexp(vdouble x, vint q);
-vint xilogb(vdouble d);
+static INLINE vdouble xacos(vdouble d) {
+    vdouble x, y;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vmulsign(atan2k(x, vabs(d)), d);
+    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
+    x = vadd(x, y);
+    return x;
+}
 
-vdouble xsin(vdouble d);
-vdouble xcos(vdouble d);
-vdouble2 xsincos(vdouble d);
-vdouble xtan(vdouble d);
-vdouble xasin(vdouble s);
-vdouble xacos(vdouble s);
-vdouble xatan(vdouble s);
-vdouble xatan2(vdouble y, vdouble x);
-vdouble xlog(vdouble d);
-vdouble xexp(vdouble d);
-vdouble xpow(vdouble x, vdouble y);
+static INLINE vdouble xatan(vdouble s) {
+    vdouble t, u;
+    vint q;
 
-vdouble xsinh(vdouble d);
-vdouble xcosh(vdouble d);
-vdouble xtanh(vdouble d);
-vdouble xasinh(vdouble s);
-vdouble xacosh(vdouble s);
-vdouble xatanh(vdouble s);
+    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
+    s = vabs(s);
 
-vdouble xcbrt(vdouble d);
+    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
+    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
 
-vdouble xexp2(vdouble a);
-vdouble xexp10(vdouble a);
-vdouble xexpm1(vdouble a);
-vdouble xlog10(vdouble a);
-vdouble xlog1p(vdouble a);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
+
+    return t;
+}
+
+static INLINE vdouble xlog(vdouble d) {
+    vdouble x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+    m = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
+    x2 = vmul(x, x);
+
+    t = vcast_vd_d(0.148197055177935105296783);
+    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
+    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
+    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
+    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
+    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
+    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
+    t = vmla(t, x2, vcast_vd_d(2));
+
+    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
+
+    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xexp(vdouble d) {
+    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
+    vdouble s, u;
+
+    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    u = vcast_vd_d(2.08860621107283687536341e-09);
+    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
+    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
+    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
+    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
+    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
+    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
+    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
+    u = vmla(u, s, vcast_vd_d(0.5));
+
+    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
+
+    u = vldexp(u, q);
+
+    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
+
+    return u;
+}
+
+static INLINE vdouble2 logk(vdouble d) {
+    vdouble2 x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+    m = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble expk(vdouble2 d) {
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u));
+
+    t = add_sd(vcast_vd_d(1), t);
+    u = vadd(t.x, t.y);
+    u = vldexp(u, q);
+
+    return u;
+}
+
+static INLINE vdouble xpow(vdouble x, vdouble y) {
+#if 1
+    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
+    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
+
+    vdouble result = expk(mul_ds(logk(vabs(x)), y));
+
+    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
+
+    result = vmul(result,
+            vsel(vmask_gt(x, vcast_vd_d(0)),
+                vcast_vd_d(1),
+                vsel(yisint,
+                    vsel(yisodd,
+                        vcast_vd_d(-1),
+                        vcast_vd_d(1)),
+                    vcast_vd_d(rtengine::RT_NAN))));
+
+    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
+
+    result = vsel(vmask_isinf(y),
+            vsel(vmask_lt(efx, vcast_vd_d(0)),
+                vcast_vd_d(0),
+                vsel(vmask_eq(efx, vcast_vd_d(0)),
+                    vcast_vd_d(1.0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
+            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
+                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
+                    vcast_vd_d(0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
+
+    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
+
+    return result;
+#else
+    return expk(mul_ds(logk(x), y));
+#endif
+}
+
+static INLINE vdouble2 expk2(vdouble2 d) {
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u));
+
+    t = add_sd(vcast_vd_d(1), t);
+
+    return dd(vldexp(t.x, q), vldexp(t.y, q));
+}
+
+static INLINE vdouble xsinh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
+    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcosh(vdouble x) {
+    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
+    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xtanh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
+    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
+    y = d.x + d.y;
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble2 logk2(vdouble2 d) {
+    vdouble2 x, x2, m;
+    vdouble t;
+    vint e;
+
+    d = normalize_d(d);
+    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
+    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
+
+    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble xasinh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
+    y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xacosh(vdouble x) {
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
+    vdouble y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
+    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xatanh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
+    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcbrt(vdouble d) {
+    vdouble x, y, q = vcast_vd_d(1.0);
+    vint e, qu, re;
+    vdouble t;
+
+    e = vilogbp1(vabs(d));
+    d = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
+    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
+    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
+
+    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
+    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
+    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
+
+    q = vmulsign(q, d);
+
+    d = vabs(d);
+
+    x = vcast_vd_d(-0.640245898480692909870982);
+    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
+    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
+    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
+    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
+    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
+
+    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
+    y = vmul(vmul(d, x), x);
+    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
+
+    return y;
+}
+
+static INLINE vdouble xexp2(vdouble a) {
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexp10(vdouble a) {
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexpm1(vdouble a) {
+    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
+    vdouble x = d.x + d.y;
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
+    return x;
+}
+
+static INLINE vdouble xlog10(vdouble a) {
+    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
+    vdouble x = d.x + d.y;
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xlog1p(vdouble a) {
+    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
+    vdouble x = d.x + d.y;
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
 
 //
 
@@ -126,15 +899,580 @@ typedef struct {
     vfloat x, y;
 } vfloat2;
 
-vfloat xsinf(vfloat d);
-vfloat xcosf(vfloat d);
-vfloat2 xsincosf(vfloat d);
-vfloat xtanf(vfloat d);
-vfloat xasinf(vfloat s);
-vfloat xacosf(vfloat s);
-vfloat xatanf(vfloat s);
-vfloat xatan2f(vfloat y, vfloat x);
-vfloat xlogf(vfloat d);
-vfloat xlogf0(vfloat d);
-vfloat xexpf(vfloat d);
-vfloat xcbrtf(vfloat s);
+static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
+static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
+
+#ifdef __SSE4_1__
+// only one instruction when using SSE4.1
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
+    return _mm_blendv_ps(y,x,(vfloat)mask);
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return _mm_blendv_epi8(y,x,mask);
+}
+
+#else
+// three instructions when using SSE2
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
+    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+#endif
+
+static INLINE vfloat vselfzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vself(mask, x, ZEROV)
+    return _mm_and_ps((vfloat)mask, x);
+}
+static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vself(mask, ZEROV, x)
+    return _mm_andnot_ps((vfloat)mask, x);
+}
+
+static INLINE vint vselizero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vselc(mask, x, ZEROV)
+    return _mm_and_si128(mask, x);
+}
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vselc(mask, ZEROV, x)
+    return _mm_andnot_si128(mask, x);
+}
+
+static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
+    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
+    return vori2(vandi2(m2, x), vandnoti2(m2, y));
+}
+
+static INLINE vmask vsignbitf(vfloat f) {
+    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
+}
+
+static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
+    return (vfloat)vxorm((vmask)x, vsignbitf(y));
+}
+
+static INLINE vfloat vsignf(vfloat f) {
+    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
+}
+
+static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
+static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
+// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
+static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
+static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
+static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
+
+static INLINE vint2 vilogbp1f(vfloat d) {
+    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
+    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
+    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
+    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
+    return q;
+}
+
+static INLINE vfloat vldexpf(vfloat x, vint2 q) {
+    vfloat u;
+    vint2 m = vsrai2(q, 31);
+    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
+    q = vsubi2(q, vslli2(m, 2));
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
+    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
+    return vmulf(x, u);
+}
+
+static INLINE vfloat xsinf(vfloat d) {
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
+
+    s = vmulf(d, d);
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f);
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d);
+
+    return u;
+}
+
+static INLINE vfloat xcosf(vfloat d) {
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
+    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
+
+    s = vmulf(d, d);
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f);
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d);
+
+    return u;
+}
+
+static INLINE vfloat2 xsincosf(vfloat d) {
+    vint2 q;
+    vmask m;
+    vfloat u, s, t, rx, ry;
+    vfloat2 r;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
+
+    s = d;
+
+    u = vcast_vf_vi2(q);
+    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
+
+    t = s;
+
+    s = vmulf(s, s);
+
+    u = vcast_vf_f(-0.000195169282960705459117889f);
+    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
+    u = vmulf(vmulf(u, s), t);
+
+    rx = vaddf(t, u);
+
+    u = vcast_vf_f(-2.71811842367242206819355e-07f);
+    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
+    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
+    u = vmlaf(u, s, vcast_vf_f(-0.5));
+
+    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
+    r.x = vself(m, rx, ry);
+    r.y = vself(m, ry, rx);
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
+    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+    m = vmaskf_isinf(d);
+    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
+    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vfloat xtanf(vfloat d) {
+    vint2 q;
+    vmask m;
+    vfloat u, s, x;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
+
+    x = d;
+
+    u = vcast_vf_vi2(q);
+    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
+
+    s = vmulf(x, x);
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
+    x = vself(m, vnegf(x), x);
+
+    u = vcast_vf_f(0.00927245803177356719970703f);
+    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
+    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
+    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
+    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
+    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
+
+    u = vmlaf(s, vmulf(u, x), x);
+
+    u = vself(m, vrecf(u), u);
+
+    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
+
+    return u;
+}
+
+static INLINE vfloat xatanf(vfloat s) {
+    vfloat t, u;
+    vint2 q;
+
+    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
+    s = vabsf(s);
+
+    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
+    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
+
+    t = vmulf(s, s);
+
+    u = vcast_vf_f(0.00282363896258175373077393f);
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u)));
+
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
+
+    return t;
+}
+
+static INLINE vfloat atan2kf(vfloat y, vfloat x) {
+    vfloat s, t, u;
+    vint2 q;
+    vmask p;
+
+    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
+    x = vabsf(x);
+
+    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
+    p = vmaskf_lt(x, y);
+    s = vself(p, vnegf(x), y);
+    t = vmaxf(x, y);
+
+    s = vdivf(s, t);
+    t = vmulf(s, s);
+
+    u = vcast_vf_f(0.00282363896258175373077393f);
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u)));
+    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
+
+    return t;
+}
+
+static INLINE vfloat xatan2f(vfloat y, vfloat x) {
+    vfloat r = atan2kf(vabsf(y), x);
+
+    r = vmulsignf(r, x);
+    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
+    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
+    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
+
+    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
+}
+
+static INLINE vfloat xasinf(vfloat d) {
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y);
+    x = vsqrtf(x);
+    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
+    return vmulsignf(x, d);
+}
+
+static INLINE vfloat xacosf(vfloat d) {
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y);
+    x = vsqrtf(x);
+    x = vmulsignf(atan2kf(x, vabsf(d)), d);
+    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
+    x = vaddf(x, y);
+    return x;
+}
+
+static INLINE vfloat xlogf(vfloat d) {
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
+    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogf0(vfloat d) {
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+}
+
+static INLINE vfloat xexpf(vfloat d) {
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f);
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
+
+    u = vldexpf(u, q);
+
+    // -104.0
+    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
+}
+
+static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f);
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
+
+    return vldexpf(u, q);
+}
+
+static INLINE vfloat xcbrtf(vfloat d) {
+    vfloat x, y, q = vcast_vf_f(1.0), t;
+    vint2 e, qu, re;
+
+    e = vilogbp1f(vabsf(d));
+    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
+    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
+    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
+
+    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
+    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
+    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
+
+    q = vmulsignf(q, d);
+    d = vabsf(d);
+
+    x = vcast_vf_f(-0.601564466953277587890625f);
+    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
+    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
+    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
+    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
+    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
+
+    y = vmulf(vmulf(d, x), x);
+    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
+
+    return y;
+}
+
+static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
+    // clamps value in [low;high], returns low if value is NaN
+    return vmaxf(vminf(high, value), low);
+}
+
+static INLINE vfloat SQRV(vfloat a){
+    return a * a;
+}
+
+static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
+    // conditional swap the elements of two vfloats
+    vfloat temp = vself(condition, a, b); // the values which fit to condition
+    a = vself(condition, b, a); // the values which fit to inverted condition
+    b = temp;
+}
+
+static inline float vhadd( vfloat a ) {
+    // returns a[0] + a[1] + a[2] + a[3]
+    a += _mm_movehl_ps(a, a);
+    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static inline float vhmin(vfloat a) {
+    // returns min(a[0], a[1], a[2], a[3])
+    a = vminf(a, _mm_movehl_ps(a, a));
+    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static inline float vhmax(vfloat a) {
+    // returns max(a[0], a[1], a[2], a[3])
+    a = vmaxf(a, _mm_movehl_ps(a, a));
+    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static INLINE vfloat vmul2f(vfloat a){
+    // fastest way to multiply by 2
+    return a + a;
+}
+
+static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
+    // calculate a * b + (1 - a) * c (interpolate two values)
+    // following is valid:
+    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
+    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c;
+}
+
+static INLINE vfloat vdup(vfloat a){
+    // returns { a[0],a[0],a[1],a[1] }
+    return _mm_unpacklo_ps( a, a );
+}
+
+static INLINE vfloat vaddc2vfu(float &a)
+{
+    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
+}
+
+static INLINE vfloat vadivapb (vfloat a, vfloat b) {
+    return a / (a+b);
+}
+
+static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
+    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
+    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
+    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
+    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
+}
+
+#if defined( __SSE4_1__ ) && defined( __x86_64__ )
+static INLINE vfloat vceilf(vfloat x) {
+    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
+}
+
+#else
+
+static INLINE vfloat vceilf(vfloat x) {
+    __m128i zerov = _mm_setzero_si128();
+    zerov = _mm_cmpeq_epi32(zerov, zerov);
+    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
+    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
+    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
+}
+#endif
+
+#endif // __SSE2__
diff --git a/rtengine/utils.cc b/rtengine/utils.cc
index a07a1235f..0674c9806 100644
--- a/rtengine/utils.cc
+++ b/rtengine/utils.cc
@@ -22,7 +22,6 @@
 #include "rt_math.h"
 
 #include "utils.h"
-#include "rt_math.h"
 
 using namespace std;
 
diff --git a/rtgui/cropwindow.cc b/rtgui/cropwindow.cc
index 0a9b81112..edc378700 100644
--- a/rtgui/cropwindow.cc
+++ b/rtgui/cropwindow.cc
@@ -37,6 +37,7 @@
 #include "rtsurface.h"
 
 #include "../rtengine/dcrop.h"
+#include "../rtengine/imagesource.h"
 #include "../rtengine/procparams.h"
 #include "../rtengine/rt_math.h"
 
diff --git a/rtgui/curveeditor.h b/rtgui/curveeditor.h
index baae8f492..abc0cc10a 100644
--- a/rtgui/curveeditor.h
+++ b/rtgui/curveeditor.h
@@ -18,8 +18,8 @@
  */
 #pragma once
 
-#include "coloredbar.h"
 #include "editcallbacks.h"
+#include "guiutils.h"
 
 #include "../rtengine/diagonalcurvetypes.h"
 #include "../rtengine/flatcurvetypes.h"
@@ -28,6 +28,7 @@
 
 class CurveEditorGroup;
 class CurveEditorSubGroup;
+class ColorProvider;
 class PopUpToggleButton;
 
 /*
diff --git a/rtgui/guiutils.cc b/rtgui/guiutils.cc
index 02a28607f..9a4e71ab4 100644
--- a/rtgui/guiutils.cc
+++ b/rtgui/guiutils.cc
@@ -20,7 +20,6 @@
 
 #include "guiutils.h"
 #include "options.h"
-#include "../rtengine/rt_math.h"
 #include "../rtengine/utils.h"
 #include "../rtengine/procparams.h"
 #include "rtimage.h"