From 2e43b2c2136b42b500ef4ad782172c3e2a3f26f8 Mon Sep 17 00:00:00 2001
From: Ingo <heckflosse@i-weyrich.de>
Date: Tue, 5 Mar 2013 23:16:41 +0100
Subject: [PATCH] Added modified Sleef-Library to repository

---
 rtengine/helperavx.h   |  271 +++++++++
 rtengine/helpersse2.h  |  239 ++++++++
 rtengine/sleef.c       | 1211 +++++++++++++++++++++++++++++++++++++
 rtengine/sleef.h       |   51 ++
 rtengine/sleefsseavx.c | 1295 ++++++++++++++++++++++++++++++++++++++++
 rtengine/sleefsseavx.h |  108 ++++
 6 files changed, 3175 insertions(+)
 create mode 100644 rtengine/helperavx.h
 create mode 100644 rtengine/helpersse2.h
 create mode 100644 rtengine/sleef.c
 create mode 100644 rtengine/sleef.h
 create mode 100644 rtengine/sleefsseavx.c
 create mode 100644 rtengine/sleefsseavx.h

diff --git a/rtengine/helperavx.h b/rtengine/helperavx.h
new file mode 100644
index 000000000..e1b3dadee
--- /dev/null
+++ b/rtengine/helperavx.h
@@ -0,0 +1,271 @@
+#ifndef __AVX__
+#error Please specify -mavx.
+#endif
+
+#ifdef __GNUC__
+#define INLINE __attribute__((always_inline))
+#else
+#define INLINE inline
+#endif
+
+#include <immintrin.h>
+#include <stdint.h>
+
+typedef __m256d vdouble; // four packed doubles
+typedef __m128i vint; // 128-bit integer vector, used as four 32-bit lanes (one per double lane)
+typedef __m256i vmask; // 256-bit bit mask
+
+typedef __m256 vfloat; // eight packed floats
+typedef struct { // integer companion of vfloat kept as two 128-bit halves (x = low half, y = high half)
+  vint x, y;
+} vint2;
+
+// Conversions between vdouble and vint, scalar broadcasts, and bit-pattern reinterpret casts
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
+static INLINE vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); }
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm;  }
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; }
+
+// Single-precision lane-wise helpers: broadcast, + - * /, reciprocal (1.0f / x), sqrt, max, min
+
+static INLINE vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); }
+
+static INLINE vfloat vaddf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
+static INLINE vfloat vsubf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
+static INLINE vfloat vmulf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
+static INLINE vfloat vdivf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
+static INLINE vfloat vrecf(vfloat x) { return vdivf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrtf(vfloat x) { return _mm256_sqrt_ps(x); }
+static INLINE vfloat vmaxf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
+static INLINE vfloat vminf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
+
+// Double-precision lane-wise helpers; vmla is x * y + z as a separate multiply and add, vabs/vneg clear/flip the sign bit with a -0.0 mask
+
+static INLINE vdouble vadd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
+static INLINE vdouble vsub(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
+static INLINE vdouble vmul(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
+static INLINE vdouble vdiv(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
+static INLINE vdouble vrec(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); }
+static INLINE vdouble vsqrt(vdouble x) { return _mm256_sqrt_pd(x); }
+static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z) { return vadd(vmul(x, y), z); }
+
+static INLINE vdouble vmax(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
+static INLINE vdouble vmin(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
+
+static INLINE vdouble vabs(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); }
+static INLINE vdouble vneg(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); }
+
+// 32-bit integer lane operations on vint: add/sub, bitwise logic (vandnoti is ~x & y), and shifts by c bits
+
+static INLINE vint vaddi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsubi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+
+static INLINE vint vandi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnoti(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vori(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxori(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vslli(vint x, int c) { return _mm_slli_epi32 (x, c); }
+static INLINE vint vsrli(vint x, int c) { return _mm_srli_epi32 (x, c); }
+static INLINE vint vsrai(vint x, int c) { return _mm_srai_epi32 (x, c); }
+
+// Bitwise logic on 256-bit masks, carried out with the double-precision logical intrinsics on the same bits
+
+static INLINE vmask vandm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vandnotm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vorm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vxorm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); }
+
+static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } // per-lane compares; a true lane is all ones
+static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } // unordered: true if a lane is NaN, matching C's != and the SSE2 helper's _mm_cmpneq_pd
+static INLINE vmask vmask_lt(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); }
+static INLINE vmask vmask_le(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); }
+static INLINE vmask vmask_gt(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); }
+static INLINE vmask vmask_ge(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); }
+
+static INLINE vmask vmaskf_eq(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } // per-lane compares; a true lane is all ones
+static INLINE vmask vmaskf_neq(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } // unordered: true if a lane is NaN, matching C's != and the SSE2 helper's _mm_cmpneq_ps
+static INLINE vmask vmaskf_lt(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); }
+static INLINE vmask vmaskf_le(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); }
+static INLINE vmask vmaskf_gt(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); }
+static INLINE vmask vmaskf_ge(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); }
+
+static INLINE vmask vmaski_eq(vint x, vint y) { // 32-bit lane equality widened to a 64-bit-lane mask by way of a 1.0/0.0 double vector
+  __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1)));
+  return vmask_eq(r, _mm256_set_pd(1, 1, 1, 1));
+}
+
+static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y) { // bitwise (mask & x) | (~mask & y)
+  return (__m256d)vorm(vandm(mask, (__m256i)x), vandnotm(mask, (__m256i)y));
+}
+
+static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y) { // per lane: x where d0 < d1, otherwise y
+  __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)));
+  mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1));
+  return vori(vandi(mask, x), vandnoti(mask, y));
+}
+
+// vint2 helpers: split/join a 256-bit value into two 128-bit halves and apply each vint operation to both halves
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) {
+  vint2 r;
+  r.x = _mm256_castsi256_si128(vm);
+  r.y = _mm256_extractf128_si256(vm, 1);
+  return r;
+}
+
+static INLINE vmask vcast_vm_vi2(vint2 vi) {
+  vmask m = _mm256_castsi128_si256(vi.x);
+  m = _mm256_insertf128_si256(m, vi.y, 1);
+  return m;
+}
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); }
+static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; }
+
+static INLINE vint2 vaddi2(vint2 x, vint2 y) { vint2 r; r.x = vaddi(x.x, y.x); r.y = vaddi(x.y, y.y); return r; }
+static INLINE vint2 vsubi2(vint2 x, vint2 y) { vint2 r; r.x = vsubi(x.x, y.x); r.y = vsubi(x.y, y.y); return r; }
+
+static INLINE vint2 vandi2(vint2 x, vint2 y) { vint2 r; r.x = vandi(x.x, y.x); r.y = vandi(x.y, y.y); return r; }
+static INLINE vint2 vandnoti2(vint2 x, vint2 y) { vint2 r; r.x = vandnoti(x.x, y.x); r.y = vandnoti(x.y, y.y); return r; }
+static INLINE vint2 vori2(vint2 x, vint2 y) { vint2 r; r.x = vori(x.x, y.x); r.y = vori(x.y, y.y); return r; }
+static INLINE vint2 vxori2(vint2 x, vint2 y) { vint2 r; r.x = vxori(x.x, y.x); r.y = vxori(x.y, y.y); return r; }
+
+static INLINE vint2 vslli2(vint2 x, int c) { vint2 r; r.x = vslli(x.x, c); r.y = vslli(x.y, c); return r; }
+static INLINE vint2 vsrli2(vint2 x, int c) { vint2 r; r.x = vsrli(x.x, c); r.y = vsrli(x.y, c); return r; }
+static INLINE vint2 vsrai2(vint2 x, int c) { vint2 r; r.x = vsrai(x.x, c); r.y = vsrai(x.y, c); return r; }
+
+static INLINE vmask vmaski2_eq(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpeq_epi32(x.x, y.x);
+  r.y = _mm_cmpeq_epi32(x.y, y.y);
+  return vcast_vm_vi2(r);
+}
+
+static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y) {
+  vint2 r, m2 = vcast_vi2_vm(m);
+  r.x = vori(vandi(m2.x, x.x), vandnoti(m2.x, y.x));
+  r.y = vori(vandi(m2.y, x.y), vandnoti(m2.y, y.y));
+  return r;
+}
+
+// Lane-0 extraction to scalars, plus sign-bit, infinity and NaN helpers for vdouble
+
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[4];
+  _mm256_storeu_pd(s, v);
+  return s[0];
+}
+
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[8];
+  _mm256_storeu_ps(s, v);
+  return s[0];
+}
+
+static INLINE vmask vsignbit(vdouble d) {
+  return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0));
+}
+
+static INLINE vdouble vsign(vdouble d) {
+  return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit(d));
+}
+
+static INLINE vdouble vmulsign(vdouble x, vdouble y) {
+  return (__m256d)vxorm((__m256i)x, vsignbit(y));
+}
+
+static INLINE vmask vmask_isinf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(vabs(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_ispinf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_isminf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_isnan(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ);
+}
+
+static INLINE vdouble visinf(vdouble d) {
+  return _mm256_and_pd((vdouble)vmask_isinf(d), vsign(d));
+}
+
+static INLINE vdouble visinf2(vdouble d, vdouble m) {
+  return _mm256_and_pd((vdouble)vmask_isinf(d), _mm256_or_pd((vdouble)vsignbit(d), m));
+}
+// 2^q per lane: the biased exponent q + 0x3ff is shifted left by 20 and placed in the high 32-bit word of each double.
+static INLINE vdouble vpow2i(vint q) {
+  vint r;
+  vdouble y;
+  q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q);
+  q = _mm_slli_epi32(q, 20);
+  r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0));
+  y = _mm256_castpd128_pd256((__m128d)r);
+  r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2));
+  y = _mm256_insertf128_pd(y, (__m128d)r, 1);
+  y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0));
+  return y;
+}
+// x * 2^q, evaluated as x * y^4 * 2^(q - 4m) with y = 2^m; both power-of-two factors come from vpow2i.
+static INLINE vdouble vldexp(vdouble x, vint q) {
+  vint m = _mm_srai_epi32(q, 31);
+  m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
+  q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
+  vdouble y = vpow2i(m);
+  return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
+}
+// High-word bits of d shifted right by 20, minus 0x3fe; lanes below 4.9e-91 (2^-300) are first scaled by 2.037e90 (2^300) and subtract 300 + 0x3fe instead.
+static INLINE vint vilogbp1(vdouble d) {
+  vint q, r, c;
+  vmask m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
+  d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
+  c = _mm256_cvtpd_epi32(vsel(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe)));
+  q = (__m128i)_mm256_castpd256_pd128(d);
+  q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1));
+  r = (__m128i)_mm256_extractf128_pd(d, 1);
+  r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0));
+  q = _mm_or_si128(q, r);
+  q = _mm_srli_epi32(q, 20);
+  q = _mm_sub_epi32(q, c);
+  return q;
+}
+// Clears the low 27 bits of every double (mask 0xf8000000 on the low 32-bit word, all ones on the high word).
+static INLINE vdouble vupper(vdouble d) {
+  return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
+}
+
+// vdouble2: a pair of vdouble values (dd(h, l) stores h in x and l in y), with mask select and absolute value
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+static INLINE vdouble2 dd(vdouble h, vdouble l) {
+  vdouble2 ret = {h, l};
+  return ret;
+}
+
+static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y) {
+  return dd((__m256d)vorm(vandm(mask, (__m256i)x.x), vandnotm(mask, (__m256i)y.x)),
+	    (__m256d)vorm(vandm(mask, (__m256i)x.y), vandnotm(mask, (__m256i)y.y)));
+}
+
+static INLINE vdouble2 abs_d(vdouble2 x) { // XORs the sign bit of x.x into both parts, so x.x becomes non-negative and x.y flips along with it
+  return dd((__m256d)_mm256_xor_pd(_mm256_and_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), x.x), x.x),
+	    (__m256d)_mm256_xor_pd(_mm256_and_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), x.x), x.y));
+}
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
new file mode 100644
index 000000000..d9a7d6d03
--- /dev/null
+++ b/rtengine/helpersse2.h
@@ -0,0 +1,239 @@
+#ifndef __SSE2__
+#error Please specify -msse2.
+#endif
+
+#ifdef __GNUC__
+#define INLINE __inline
+//#define INLINE __attribute__((always_inline))
+#else
+#define INLINE inline
+#endif
+
+#include <intrin.h>
+#include <stdint.h>
+
+typedef __m128d vdouble; // two packed doubles
+typedef __m128i vint; // 128-bit integer vector; vcast_vi_i fills only the two low 32-bit lanes
+typedef __m128i vmask; // 128-bit bit mask
+
+typedef __m128 vfloat; // four packed floats
+typedef __m128i vint2; // four 32-bit integer lanes, one per float lane
+
+// Conversions between vdouble and vint, scalar broadcasts, and bit-pattern reinterpret casts
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
+static INLINE vdouble vcast_vd_d(double d) { return _mm_set_pd(d, d); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m128i)vd; }
+static INLINE vdouble vreinterpret_vd_vm(vint vm) { return (__m128d)vm; }
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m128i)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; }
+
+// Single-precision lane-wise helpers: broadcast, + - * /, reciprocal (1.0f / x), sqrt, max, min
+
+static INLINE vfloat vcast_vf_f(float f) { return _mm_set_ps(f, f, f, f); }
+
+static INLINE vfloat vaddf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
+static INLINE vfloat vsubf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
+static INLINE vfloat vmulf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
+static INLINE vfloat vdivf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
+static INLINE vfloat vrecf(vfloat x) { return vdivf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrtf(vfloat x) { return _mm_sqrt_ps(x); }
+static INLINE vfloat vmaxf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
+static INLINE vfloat vminf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
+
+// Double-precision lane-wise helpers; vmla is x * y + z as a separate multiply and add, vabs/vneg clear/flip the sign bit with a -0.0 mask
+
+static INLINE vdouble vadd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
+static INLINE vdouble vsub(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
+static INLINE vdouble vmul(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
+static INLINE vdouble vdiv(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
+static INLINE vdouble vrec(vdouble x) { return _mm_div_pd(_mm_set_pd(1, 1), x); }
+static INLINE vdouble vsqrt(vdouble x) { return _mm_sqrt_pd(x); }
+static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z) { return vadd(vmul(x, y), z); }
+
+static INLINE vdouble vmax(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
+static INLINE vdouble vmin(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
+
+static INLINE vdouble vabs(vdouble d) { return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0,-0.0), d); }
+static INLINE vdouble vneg(vdouble d) { return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0,-0.0), d); }
+
+// 32-bit integer lane operations on vint: add/sub, bitwise logic (vandnoti is ~x & y), and shifts by c bits
+
+static INLINE vint vaddi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsubi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+
+static INLINE vint vandi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnoti(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vori(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxori(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vslli(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrli(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsrai(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+// Mask logic, per-lane comparisons that yield bit masks, and mask-driven selection between two vectors
+
+static INLINE vmask vandm(vmask x, vmask y) { return _mm_and_si128(x, y); }
+static INLINE vmask vandnotm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vmask vorm(vmask x, vmask y) { return _mm_or_si128(x, y); }
+static INLINE vmask vxorm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); }
+static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); }
+static INLINE vmask vmask_lt(vdouble x, vdouble y) { return (__m128i)_mm_cmplt_pd(x, y); }
+static INLINE vmask vmask_le(vdouble x, vdouble y) { return (__m128i)_mm_cmple_pd(x, y); }
+static INLINE vmask vmask_gt(vdouble x, vdouble y) { return (__m128i)_mm_cmpgt_pd(x, y); }
+static INLINE vmask vmask_ge(vdouble x, vdouble y) { return (__m128i)_mm_cmpge_pd(x, y); }
+
+static INLINE vmask vmaskf_eq(vfloat x, vfloat y) { return (__m128i)_mm_cmpeq_ps(x, y); }
+static INLINE vmask vmaskf_neq(vfloat x, vfloat y) { return (__m128i)_mm_cmpneq_ps(x, y); }
+static INLINE vmask vmaskf_lt(vfloat x, vfloat y) { return (__m128i)_mm_cmplt_ps(x, y); }
+static INLINE vmask vmaskf_le(vfloat x, vfloat y) { return (__m128i)_mm_cmple_ps(x, y); }
+static INLINE vmask vmaskf_gt(vfloat x, vfloat y) { return (__m128i)_mm_cmpgt_ps(x, y); }
+static INLINE vmask vmaskf_ge(vfloat x, vfloat y) { return (__m128i)_mm_cmpge_ps(x, y); }
+
+static INLINE vmask vmaski_eq(vint x, vint y) { // duplicates each of the two low 32-bit compare results so it covers a whole 64-bit lane
+  __m128 s = (__m128)_mm_cmpeq_epi32(x, y);
+  return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));
+}
+
+static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y) { // bitwise (mask & x) | (~mask & y)
+  return (__m128d)vorm(vandm(mask, (__m128i)x), vandnotm(mask, (__m128i)y));
+}
+
+static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y) { // mask here marks lanes where d0 < d1 is false, hence andnot picks x and and picks y
+  vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vmask_lt(d0, d1)), _mm_set_ps(0, 0, 0, 0));
+  return vori(vandnoti(mask, x), vandi(mask, y));
+}
+
+// vint2 helpers: here vint2 is a plain __m128i, so each one forwards to the matching vint or mask operation
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm_set_epi32(i, i, i, i); }
+
+static INLINE vint2 vaddi2(vint2 x, vint2 y) { return vaddi(x, y); }
+static INLINE vint2 vsubi2(vint2 x, vint2 y) { return vsubi(x, y); }
+
+static INLINE vint2 vandi2(vint2 x, vint2 y) { return vandi(x, y); }
+static INLINE vint2 vandnoti2(vint2 x, vint2 y) { return vandnoti(x, y); }
+static INLINE vint2 vori2(vint2 x, vint2 y) { return vori(x, y); }
+static INLINE vint2 vxori2(vint2 x, vint2 y) { return vxori(x, y); }
+
+static INLINE vint2 vslli2(vint2 x, int c) { return vslli(x, c); }
+static INLINE vint2 vsrli2(vint2 x, int c) { return vsrli(x, c); }
+static INLINE vint2 vsrai2(vint2 x, int c) { return vsrai(x, c); }
+
+static INLINE vmask vmaski2_eq(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y) { return vorm(vandm(m, x), vandnotm(m, y)); }
+
+// Lane-0 extraction to scalars, plus sign-bit, infinity and NaN helpers for vdouble
+
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[2];
+  _mm_storeu_pd(s, v);
+  return s[0];
+}
+
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[4];
+  _mm_storeu_ps(s, v);
+  return s[0];
+}
+
+static INLINE vmask vsignbit(vdouble d) {
+  return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0));
+}
+
+static INLINE vdouble vsign(vdouble d) {
+  return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)));
+}
+
+static INLINE vdouble vmulsign(vdouble x, vdouble y) {
+  return (__m128d)vxori((__m128i)x, vsignbit(y));
+}
+
+static INLINE vmask vmask_isinf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(vabs(d), _mm_set_pd(INFINITY, INFINITY));
+}
+
+static INLINE vmask vmask_ispinf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY));
+}
+
+static INLINE vmask vmask_isminf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY));
+}
+
+static INLINE vmask vmask_isnan(vdouble d) {
+  return (vmask)_mm_cmpneq_pd(d, d);
+}
+
+static INLINE vdouble visinf(vdouble d) {
+  return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)_mm_set_pd(1, 1)));
+}
+
+static INLINE vdouble visinf2(vdouble d, vdouble m) {
+  return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)m));
+}
+
+// Power-of-two construction: vpow2i adds the bias 0x3ff to the two low lanes of q, moves them into the high words and shifts left by 20
+
+static INLINE vdouble vpow2i(vint q) {
+  q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q);
+  q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,3,0,3));
+  return (__m128d)_mm_slli_epi32(q, 20);
+}
+
+static INLINE vdouble vldexp(vdouble x, vint q) { // x * 2^q, evaluated as x * y^4 * 2^(q - 4m) with y = 2^m
+  vint m = _mm_srai_epi32(q, 31);
+  m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
+  q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
+  vdouble y = vpow2i(m);
+  return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
+}
+// Top 12 bits of each double (sign + exponent field) minus 0x3fe; lanes below 4.9e-91 (2^-300) are first scaled by 2.037e90 (2^300) and subtract 300 + 0x3fe instead.
+static INLINE vint vilogbp1(vdouble d) {
+  vint m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
+  d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
+  __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32((int)0xfff00000U, 0, (int)0xfff00000U, 0)); // same bits as ((1 << 12)-1) << 20, which overflows a signed int shift
+  q = _mm_srli_epi32(q, 20);
+  q = vorm(vandm   (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))),
+	   vandnotm(m, _mm_sub_epi32(q, _mm_set_epi32(      0x3fe, 0,       0x3fe, 0))));
+  q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0,0,3,1)); // gather both results into the two low lanes
+  return q;
+}
+// Clears the low 27 bits of every double (mask 0xf8000000 on the low 32-bit word, all ones on the high word).
+static INLINE vdouble vupper(vdouble d) {
+  return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
+}
+
+// vdouble2: a pair of vdouble values (dd(h, l) stores h in x and l in y), with mask select and absolute value
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+static INLINE vdouble2 dd(vdouble h, vdouble l) {
+  vdouble2 ret = {h, l};
+  return ret;
+}
+
+static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y) {
+  return dd((__m128d)vorm(vandm(mask, (__m128i)x.x), vandnotm(mask, (__m128i)y.x)),
+	    (__m128d)vorm(vandm(mask, (__m128i)x.y), vandnotm(mask, (__m128i)y.y)));
+}
+
+static INLINE vdouble2 abs_d(vdouble2 x) { // XORs the sign bit of x.x into both parts, so x.x becomes non-negative and x.y flips along with it
+  return dd((__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0,-0.0), x.x), x.x),
+	    (__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0,-0.0), x.x), x.y));
+}
diff --git a/rtengine/sleef.c b/rtengine/sleef.c
new file mode 100644
index 000000000..7d486bd4a
--- /dev/null
+++ b/rtengine/sleef.c
@@ -0,0 +1,1211 @@
+#include <assert.h>
+#include <stdint.h>
+#include <math.h>
+//#include <bits/nan.h>
+//#include <bits/inf.h>
+
+#define PI4_A .7853981554508209228515625 // PI4_A + PI4_B + PI4_C add up to pi/4, split into three parts of decreasing size
+#define PI4_B .794662735614792836713604629039764404296875e-8
+#define PI4_C .306161699786838294306516483068750264552437361480769e-16
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125 // 4/pi
+
+#define L2U .69314718055966295651160180568695068359375 // L2U + L2L add up to ln(2): upper part and small correction
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 // 1/ln(2)
+
+__inline int64_t doubleToRawLongBits(double d) {
+  // Type-pun through a union: store the double, then read the same
+  // storage back as a signed 64-bit integer.
+  union { double as_double; int64_t as_bits; } pun;
+  pun.as_double = d;
+  int64_t bits = pun.as_bits;
+  return bits;
+}
+
+__inline double longBitsToDouble(int64_t i) {
+  // Reverse direction of the union pun: store the integer bit pattern,
+  // then read the same storage back as a double.
+  union { int64_t as_bits; double as_double; } pun;
+  pun.as_bits = i;
+  double value = pun.as_double;
+  return value;
+}
+// |x|, obtained by clearing bit 63 (the sign bit) of the raw bit pattern.
+__inline double xfabs(double x) {
+  return longBitsToDouble(doubleToRawLongBits(x) & INT64_MAX);
+}
+// Returns x with its sign bit XOR-ed with the sign bit of y, so x is negated exactly when y has its sign bit set (including -0.0).
+__inline double mulsign(double x, double y) {
+  return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & INT64_MIN)); // INT64_MIN is the bit-63 mask; 1LL << 63 overflows a signed shift (undefined behaviour)
+}
+// sign(d): 1.0 carrying d's sign bit. mla: x * y + z. xrint: rounds half away from zero by truncating x -/+ 0.5 through an int cast.
+__inline double sign(double d) { return mulsign(1, d); }
+__inline double mla(double x, double y, double z) { return x * y + z; }
+__inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
+// Classification predicates; each yields 1 when the property holds and 0 otherwise.
+__inline int xisnan(double x) { return !(x == x); }
+__inline int xisinf(double x) { return !(x != INFINITY && x != -INFINITY); }
+__inline int xisminf(double x) { return -INFINITY == x; }
+__inline int xispinf(double x) { return INFINITY == x; }
+// Computes x * 2^q as x * (2^m)^4 * 2^(q - 4m); each power of two is built directly from its exponent bits.
+__inline double ldexpk(double x, int q) {
+  double u;
+  int m;
+  m = q >> 31; // sign of q: 0, or -1 where >> on a negative int is an arithmetic shift
+  m = (((m + q) >> 9) - m) * 128; // was "<< 7": left-shifting a negative int is undefined behaviour
+  q = q - m * 4; // was "m << 2", same reason; exponent left over after four factors of 2^m
+  u = longBitsToDouble(((int64_t)(m + 0x3ff)) << 52); // u = 2^m: biased exponent moved up to bit 52
+  x = x * u * u * u * u;
+  u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); // u = 2^q for the remaining exponent
+  return x * u;
+}
+// Forwards to ldexpk unchanged.
+__inline double xldexp(double x, int q) { return ldexpk(x, q); }
+// Exponent field of d minus 0x3fe; inputs below 4.9e-91 (2^-300) are first scaled by 2.037e90 (2^300) and subtract 300 more.
+__inline int ilogbp1(double d) {
+  const int tiny = d < 4.9090934652977266E-91;
+  if (tiny) d = 2.037035976334486E90 * d;
+  int exponent = (doubleToRawLongBits(d) >> 52) & 0x7ff;
+  exponent -= tiny ? (300 + 0x03fe) : 0x03fe;
+  return exponent;
+}
+// Unbiased exponent of |d| as an int, with sentinel results for zero and infinity.
+__inline int xilogb(double d) {
+  // The special inputs are mutually exclusive, so testing them first gives the same result.
+  if (d == INFINITY || d == -INFINITY) return 2147483647;
+  if (d == 0) return -2147483648;
+  return ilogbp1(xfabs(d)) - 1;
+}
+// Clears the 27 least significant bits of d's bit pattern; used by the double-double routines to split a value as upper(v) and v - upper(v).
+__inline double upper(double d) {
+  return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
+}
+// double2: a pair of doubles; dd(h, l) stores h in x and l in y.
+typedef struct {
+  double x, y;
+} double2;
+// float2: the same two-member layout with float members.
+typedef struct {
+  float x, y;
+} float2;
+// Packs two doubles into a double2.
+__inline double2 dd(double h, double l) {
+  // Aggregate initialisation follows member order: x = h, y = l.
+  double2 pair = { h, l };
+  return pair;
+}
+// Re-splits t: s.x is the rounded sum t.x + t.y and s.y is (t.x - s.x) + t.y, the remainder relative to that sum.
+__inline double2 normalize_d(double2 t) {
+  double2 s;
+
+  s.x = t.x + t.y;
+  s.y = t.x - s.x + t.y;
+
+  return s;
+}
+// Multiplies both parts of d by the scalar s.
+__inline double2 scale_d(double2 d, double s) {
+  double2 r;
+
+  r.x = d.x * s;
+  r.y = d.y * s;
+
+  return r;
+}
+// double + double into a double2: r.x is the rounded sum, r.y an error term rebuilt from both operands (no ordering of |x| and |y| is asserted).
+__inline double2 add2_ss(double x, double y) {
+  double2 r;
+
+  r.x = x + y;
+  double v = r.x - x;
+  r.y = (x - (r.x - v)) + (y - v);
+
+  return r;
+}
+// double2 + double. The assert checks |x.x| >= |y| unless either is NaN (active only when NDEBUG is not defined).
+__inline double2 add_ds(double2 x, double y) {
+  // Precondition: |x| >= |y|
+
+  double2 r;
+
+  assert(xisnan(x.x) || xisnan(y) || xfabs(x.x) >= xfabs(y));
+
+  r.x = x.x + y;
+  r.y = x.x - r.x + y + x.y;
+
+  return r;
+}
+// double2 + double using the longer error-term form of add2_ss, with x.y added to the error part at the end.
+__inline double2 add2_ds(double2 x, double y) {
+  // |x| >= |y|  NOTE(review): unlike add_ds there is no assert here; confirm whether this precondition still applies
+
+  double2 r;
+
+  r.x  = x.x + y;
+  double v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y - v);
+  r.y += x.y;
+
+  return r;
+}
+// double + double2. The assert checks |x| >= |y.x| unless either is NaN (active only when NDEBUG is not defined).
+__inline double2 add_sd(double x, double2 y) {
+  // Precondition: |x| >= |y|
+
+  double2 r;
+
+  assert(xisnan(x) || xisnan(y.x) || xfabs(x) >= xfabs(y.x));
+
+  r.x = x + y.x;
+  r.y = x - r.x + y.x + y.y;
+
+  return r;
+}
+// double2 + double2. The assert checks |x.x| >= |y.x| unless either is NaN (active only when NDEBUG is not defined).
+__inline double2 add_dd(double2 x, double2 y) {
+  // Precondition: |x| >= |y|
+
+  double2 r;
+
+  assert(xisnan(x.x) || xisnan(y.x) || xfabs(x.x) >= xfabs(y.x));
+
+  r.x = x.x + y.x;
+  r.y = x.x - r.x + y.x + x.y + y.y;
+
+  return r;
+}
+// double2 + double2 using the longer error-term form of add2_ss on the x parts, then adding both y parts to the error part.
+__inline double2 add2_dd(double2 x, double2 y) {
+  double2 r;
+
+  r.x  = x.x + y.x;
+  double v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y.x - v);
+  r.y += x.y + y.y;
+
+  return r;
+}
+// double2 / double2: q.x = n.x * (1 / d.x); q.y is a correction built from the upper()/remainder splits of d.x, 1/d.x and n.x plus the low parts n.y and d.y.
+__inline double2 div_dd(double2 n, double2 d) {
+  double t = 1.0 / d.x;
+  double dh  = upper(d.x), dl  = d.x - dh;
+  double th  = upper(t  ), tl  = t   - th;
+  double nhh = upper(n.x), nhl = n.x - nhh;
+
+  double2 q;
+
+  q.x = n.x * t;
+
+  double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
+    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  q.y = t * (n.y - q.x * d.y) + u;
+
+  return q;
+}
+// double * double into a double2: r.x is the rounded product, r.y sums the partial products of the upper()/remainder halves minus r.x.
+__inline double2 mul_ss(double x, double y) {
+  double xh = upper(x), xl = x - xh;
+  double yh = upper(y), yl = y - yh;
+  double2 r;
+
+  r.x = x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
+
+  return r;
+}
+// double2 * double: same split-product scheme as mul_ss applied to x.x and y, with x.y * y added to the error part.
+__inline double2 mul_ds(double2 x, double y) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double yh = upper(y  ), yl = y   - yh;
+  double2 r;
+
+  r.x = x.x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
+
+  return r;
+}
+// double2 * double2: split product of x.x and y.x, with the cross terms x.x * y.y and x.y * y.x added to the error part (x.y * y.y is not included).
+__inline double2 mul_dd(double2 x, double2 y) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double yh = upper(y.x), yl = y.x - yh;
+  double2 r;
+
+  r.x = x.x * y.x;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
+
+  return r;
+}
+// Square of a double2: split product of x.x with itself, with the cross term x.x * (2 * x.y) added to the error part.
+__inline double2 squ_d(double2 x) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double2 r;
+
+  r.x = x.x * x.x;
+  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
+
+  return r;
+}
+// Reciprocal of a double as a double2: q.x = 1 / d, q.y = q.x * (1 - d * q.x) with the product expanded over upper()/remainder halves.
+__inline double2 rec_s(double d) {
+  double t = 1.0 / d;
+  double dh = upper(d), dl = d - dh;
+  double th = upper(t), tl = t - th;
+  double2 q;
+
+  q.x = t;
+  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  return q;
+}
+// Square root of a double2: t = sqrt(d.x + d.y), then (d + t*t) * (1/t) * 0.5 evaluated with the double2 helpers.
+__inline double2 sqrt_d(double2 d) {
+  double t = sqrt(d.x + d.y);
+  return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), 0.5);
+}
+// Arctangent kernel: makes x non-negative, swaps the arguments when y > x, evaluates a polynomial in s = y / x and adds q * (pi/2) for the adjustments made. Callers in this file pass a non-negative y.
+__inline double atan2k(double y, double x) {
+  double s, t, u;
+  int q = 0;
+
+  if (x < 0) { x = -x; q = -2; }
+  if (y > x) { t = x; x = y; y = -t; q += 1; }
+
+  s = y / x;
+  t = s * s;
+
+  u = -1.88796008463073496563746e-05;
+  u = u * t + (0.000209850076645816976906797);
+  u = u * t + (-0.00110611831486672482563471);
+  u = u * t + (0.00370026744188713119232403);
+  u = u * t + (-0.00889896195887655491740809);
+  u = u * t + (0.016599329773529201970117);
+  u = u * t + (-0.0254517624932312641616861);
+  u = u * t + (0.0337852580001353069993897);
+  u = u * t + (-0.0407629191276836500001934);
+  u = u * t + (0.0466667150077840625632675);
+  u = u * t + (-0.0523674852303482457616113);
+  u = u * t + (0.0587666392926673580854313);
+  u = u * t + (-0.0666573579361080525984562);
+  u = u * t + (0.0769219538311769618355029);
+  u = u * t + (-0.090908995008245008229153);
+  u = u * t + (0.111111105648261418443745);
+  u = u * t + (-0.14285714266771329383765);
+  u = u * t + (0.199999999996591265594148);
+  u = u * t + (-0.333333333333311110369124);
+
+  t = u * t * s + s;
+  t = q * (M_PI/2) + t;
+
+  return t;
+}
+// atan2(y, x): runs atan2k on |y|, then overrides the result for infinite or zero x, infinite y and zero y. The sign of y is applied last; NAN is returned if either input is NaN.
+__inline double xatan2(double y, double x) {
+  double r = atan2k(xfabs(y), x);
+
+  r = mulsign(r, x);
+  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI  /2)) : 0);
+  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
+  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);
+
+  return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y);
+}
+// asin(d) computed as atan2k(|d|, sqrt((1 + d) * (1 - d))) with the sign of d restored.
+__inline double xasin(double d) {
+  return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d);
+}
+// acos(d) computed as atan2k(sqrt((1 + d) * (1 - d)), |d|) carrying the sign of d, plus pi when d is negative.
+__inline double xacos(double d) {
+  return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? M_PI : 0);
+}
+// atan(s): negates negative s (q bit 1), replaces s > 1 by 1/s (q bit 0), evaluates the polynomial, then undoes both steps using pi/2 - t and -t.
+__inline double xatan(double s) {
+  double t, u;
+  int q = 0;
+
+  if (s < 0) { s = -s; q = 2; }
+  if (s > 1) { s = 1.0 / s; q |= 1; }
+
+  t = s * s;
+
+  u = -1.88796008463073496563746e-05;
+  u = u * t + (0.000209850076645816976906797);
+  u = u * t + (-0.00110611831486672482563471);
+  u = u * t + (0.00370026744188713119232403);
+  u = u * t + (-0.00889896195887655491740809);
+  u = u * t + (0.016599329773529201970117);
+  u = u * t + (-0.0254517624932312641616861);
+  u = u * t + (0.0337852580001353069993897);
+  u = u * t + (-0.0407629191276836500001934);
+  u = u * t + (0.0466667150077840625632675);
+  u = u * t + (-0.0523674852303482457616113);
+  u = u * t + (0.0587666392926673580854313);
+  u = u * t + (-0.0666573579361080525984562);
+  u = u * t + (0.0769219538311769618355029);
+  u = u * t + (-0.090908995008245008229153);
+  u = u * t + (0.111111105648261418443745);
+  u = u * t + (-0.14285714266771329383765);
+  u = u * t + (0.199999999996591265594148);
+  u = u * t + (-0.333333333333311110369124);
+
+  t = s + s * (t * u);
+
+  if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
+  if ((q & 2) != 0) t = -t;
+
+  return t;
+}
+// sin(d): q = d / pi rounded by xrint; q * pi is subtracted in three steps (4*PI4_A, 4*PI4_B, 4*PI4_C); d is negated for odd q; then a polynomial in s = d * d is applied.
+__inline double xsin(double d) {
+  int q;
+  double u, s;
+
+  q = (int)xrint(d * M_1_PI);
+
+  d = mla(q, -PI4_A*4, d);
+  d = mla(q, -PI4_B*4, d);
+  d = mla(q, -PI4_C*4, d);
+
+  s = d * d;
+
+  if ((q & 1) != 0) d = -d;
+
+  u = -7.97255955009037868891952e-18;
+  u = mla(u, s, 2.81009972710863200091251e-15);
+  u = mla(u, s, -7.64712219118158833288484e-13);
+  u = mla(u, s, 1.60590430605664501629054e-10);
+  u = mla(u, s, -2.50521083763502045810755e-08);
+  u = mla(u, s, 2.75573192239198747630416e-06);
+  u = mla(u, s, -0.000198412698412696162806809);
+  u = mla(u, s, 0.00833333333333332974823815);
+  u = mla(u, s, -0.166666666666666657414808);
+
+  u = mla(s, u * d, d);
+
+  return u;
+}
+
+__inline double xcos(double d) { // cosine of d: shift by an odd multiple q of 2*(PI4_A+PI4_B+PI4_C), then reuse the sine polynomial
+  int q;
+  double u, s;
+
+  q = 1 + 2*(int)xrint(d * M_1_PI - 0.5); // q is always odd
+
+  d = mla(q, -PI4_A*2, d); // three-step subtraction of q * 2 * (PI4_A + PI4_B + PI4_C)
+  d = mla(q, -PI4_B*2, d);
+  d = mla(q, -PI4_C*2, d);
+
+  s = d * d;
+
+  if ((q & 2) == 0) d = -d; // bit 1 of q selects the sign of the result
+
+  u = -7.97255955009037868891952e-18; // same coefficients as xsin
+  u = mla(u, s, 2.81009972710863200091251e-15);
+  u = mla(u, s, -7.64712219118158833288484e-13);
+  u = mla(u, s, 1.60590430605664501629054e-10);
+  u = mla(u, s, -2.50521083763502045810755e-08);
+  u = mla(u, s, 2.75573192239198747630416e-06);
+  u = mla(u, s, -0.000198412698412696162806809);
+  u = mla(u, s, 0.00833333333333332974823815);
+  u = mla(u, s, -0.166666666666666657414808);
+
+  u = mla(s, u * d, d); // d + d * s * poly(s)
+
+  return u;
+}
+
+__inline double2 xsincos(double d) { // returns sine in .x and cosine in .y; both NaN for infinite d
+  int q;
+  double u, s, t;
+  double2 r;
+
+  q = (int)xrint(d * (2 * M_1_PI)); // d / (pi/2) rounded to an integer by xrint
+
+  s = d;
+
+  s = mla(-q, PI4_A*2, s); // three-step subtraction of q * 2 * (PI4_A + PI4_B + PI4_C)
+  s = mla(-q, PI4_B*2, s);
+  s = mla(-q, PI4_C*2, s);
+
+  t = s; // keep the reduced argument; s becomes its square
+
+  s = s * s;
+
+  u = 1.58938307283228937328511e-10; // sine polynomial in s
+  u = mla(u, s, -2.50506943502539773349318e-08);
+  u = mla(u, s, 2.75573131776846360512547e-06);
+  u = mla(u, s, -0.000198412698278911770864914);
+  u = mla(u, s, 0.0083333333333191845961746);
+  u = mla(u, s, -0.166666666666666130709393);
+  u = u * s * t;
+
+  r.x = t + u;
+
+  u = -1.13615350239097429531523e-11; // cosine polynomial in s
+  u = mla(u, s, 2.08757471207040055479366e-09);
+  u = mla(u, s, -2.75573144028847567498567e-07);
+  u = mla(u, s, 2.48015872890001867311915e-05);
+  u = mla(u, s, -0.00138888888888714019282329);
+  u = mla(u, s, 0.0416666666666665519592062);
+  u = mla(u, s, -0.5);
+
+  r.y = u * s + 1;
+
+  if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } // odd q: the two results trade places
+  if ((q & 2) != 0) { r.x = -r.x; } // sign of the sine slot
+  if (((q+1) & 2) != 0) { r.y = -r.y; } // sign of the cosine slot
+
+  if (xisinf(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+__inline double xtan(double d) { // tangent of d; NaN for infinite d
+  int q;
+  double u, s, x;
+
+  q = (int)xrint(d * (2 * M_1_PI)); // d / (pi/2) rounded to an integer by xrint
+
+  x = mla(q, -PI4_A*2, d); // three-step subtraction of q * 2 * (PI4_A + PI4_B + PI4_C)
+  x = mla(q, -PI4_B*2, x);
+  x = mla(q, -PI4_C*2, x);
+
+  s = x * x;
+
+  if ((q & 1) != 0) x = -x; // odd q: negate here and take the reciprocal below
+
+  u = 1.01419718511083373224408e-05; // Horner evaluation of the polynomial in s = x*x
+  u = mla(u, s, -2.59519791585924697698614e-05);
+  u = mla(u, s, 5.23388081915899855325186e-05);
+  u = mla(u, s, -3.05033014433946488225616e-05);
+  u = mla(u, s, 7.14707504084242744267497e-05);
+  u = mla(u, s, 8.09674518280159187045078e-05);
+  u = mla(u, s, 0.000244884931879331847054404);
+  u = mla(u, s, 0.000588505168743587154904506);
+  u = mla(u, s, 0.00145612788922812427978848);
+  u = mla(u, s, 0.00359208743836906619142924);
+  u = mla(u, s, 0.00886323944362401618113356);
+  u = mla(u, s, 0.0218694882853846389592078);
+  u = mla(u, s, 0.0539682539781298417636002);
+  u = mla(u, s, 0.133333333333125941821962);
+  u = mla(u, s, 0.333333333333334980164153);
+
+  u = mla(s, u * x, x); // x + x * s * poly(s)
+
+  if ((q & 1) != 0) u = 1.0 / u; // together with the negation above this yields -1/tan(reduced argument)
+
+  if (xisinf(d)) u = NAN;
+
+  return u;
+}
+
+__inline double xlog(double d) { // natural logarithm: INFINITY for infinite d, NaN for d < 0, -INFINITY for d == 0
+  double x, x2, t, m;
+  int e;
+
+  e = ilogbp1(d * 0.7071); // exponent used to scale d; the 0.7071 factor shifts where the exponent steps
+  m = ldexpk(d, -e); // presumably d * 2^-e - confirm against ldexpk
+
+  x = (m-1) / (m+1);
+  x2 = x * x;
+
+  t = 0.148197055177935105296783; // Horner evaluation of the polynomial in x2
+  t = mla(t, x2, 0.153108178020442575739679);
+  t = mla(t, x2, 0.181837339521549679055568);
+  t = mla(t, x2, 0.22222194152736701733275);
+  t = mla(t, x2, 0.285714288030134544449368);
+  t = mla(t, x2, 0.399999999989941956712869);
+  t = mla(t, x2, 0.666666666666685503450651);
+  t = mla(t, x2, 2);
+
+  x = x * t + 0.693147180559945286226764 * e; // x * poly + e * ln(2)
+
+  if (xisinf(d)) x = INFINITY; // later checks override earlier ones, so -INFINITY input ends up NaN
+  if (d < 0) x = NAN;
+  if (d == 0) x = -INFINITY;
+
+  return x;
+}
+
+__inline double xexp(double d) { // e^d; returns 0 for d == -INFINITY
+  int q = (int)xrint(d * R_LN2); // multiple of (L2U + L2L) removed from d
+  double s, u;
+
+  s = mla(q, -L2U, d); // two-step subtraction of q * (L2U + L2L)
+  s = mla(q, -L2L, s);
+
+  u = 2.08860621107283687536341e-09; // Horner evaluation of the polynomial in s
+  u = mla(u, s, 2.51112930892876518610661e-08);
+  u = mla(u, s, 2.75573911234900471893338e-07);
+  u = mla(u, s, 2.75572362911928827629423e-06);
+  u = mla(u, s, 2.4801587159235472998791e-05);
+  u = mla(u, s, 0.000198412698960509205564975);
+  u = mla(u, s, 0.00138888888889774492207962);
+  u = mla(u, s, 0.00833333333331652721664984);
+  u = mla(u, s, 0.0416666666666665047591422);
+  u = mla(u, s, 0.166666666666666851703837);
+  u = mla(u, s, 0.5);
+
+  u = s * s * u + s + 1; // 1 + s + s^2 * poly(s)
+  u = ldexpk(u, q); // presumably scales by 2^q - confirm against ldexpk
+
+  if (xisminf(d)) u = 0;
+
+  return u;
+}
+
+__inline double2 logk(double d) { // logarithm kernel returning a double2 (double-double) result; no special-case handling here
+  double2 x, x2;
+  double m, t;
+  int e;
+
+  e = ilogbp1(d * 0.7071); // same scaling scheme as xlog
+  m = ldexpk(d, -e);
+
+  x = div_dd(add2_ss(-1, m), add2_ss(1, m)); // (m - 1) / (m + 1) in double-double
+  x2 = squ_d(x);
+
+  t = 0.134601987501262130076155; // polynomial in the high part of x^2 only
+  t = mla(t, x2.x, 0.132248509032032670243288);
+  t = mla(t, x2.x, 0.153883458318096079652524);
+  t = mla(t, x2.x, 0.181817427573705403298686);
+  t = mla(t, x2.x, 0.222222231326187414840781);
+  t = mla(t, x2.x, 0.285714285651261412873718);
+  t = mla(t, x2.x, 0.400000000000222439910458);
+  t = mla(t, x2.x, 0.666666666666666371239645);
+
+  return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), // e * ln(2), ln(2) given as a high/low pair
+		 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t))); // 2*x + x^3 * poly
+}
+
+__inline double expk(double2 d) { // exponential of a double2 (double-double) argument, returned as a plain double
+  int q = (int)rint((d.x + d.y) * R_LN2); // uses libm rint here, whereas xexp uses xrint
+  double2 s, t;
+  double u;
+
+  s = add2_ds(d, q * -L2U); // two-step subtraction of q * (L2U + L2L)
+  s = add2_ds(s, q * -L2L);
+
+  s = normalize_d(s);
+
+  u = 2.51069683420950419527139e-08; // polynomial in the high part of s only
+  u = mla(u, s.x, 2.76286166770270649116855e-07);
+  u = mla(u, s.x, 2.75572496725023574143864e-06);
+  u = mla(u, s.x, 2.48014973989819794114153e-05);
+  u = mla(u, s.x, 0.000198412698809069797676111);
+  u = mla(u, s.x, 0.0013888888939977128960529);
+  u = mla(u, s.x, 0.00833333333332371417601081);
+  u = mla(u, s.x, 0.0416666666665409524128449);
+  u = mla(u, s.x, 0.166666666666666740681535);
+  u = mla(u, s.x, 0.500000000000000999200722);
+
+  t = add_dd(s, mul_ds(squ_d(s), u)); // s + s^2 * poly
+
+  t = add_sd(1, t); // 1 + s + s^2 * poly
+  return ldexpk(t.x + t.y, q); // collapse to double, then apply the exponent q
+}
+
+__inline double xpow(double x, double y) { // x^y via exp(y * log|x|) in double-double, followed by sign and special-case fixups
+  int yisint = floor(y) == y;                          // was (int)y == y: undefined, and wrong, once |y| >= 2^31
+  int yisodd = yisint && fmod(xfabs(y), 2.0) == 1.0;   // parity without narrowing y to int
+  // floor/fmod come from <math.h>, which this file already relies on for sqrt/rint
+  double result = expk(mul_ds(logk(xfabs(x)), y));
+
+  result = xisnan(result) ? INFINITY : result;
+  result *=  (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1))); // negative base: NaN unless y is an integer; sign follows parity
+
+  double efx = mulsign(xfabs(x) - 1, y);
+  if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY);
+  if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITY);
+  if (xisnan(x) || xisnan(y)) result = NAN;
+  if (y == 0 || x == 1) result = 1; // checked last, so it wins over the NaN rule
+
+  return result;
+}
+
+__inline double2 expk2(double2 d) { // same computation as expk, but the result stays a double2 (double-double)
+  int q = (int)rint((d.x + d.y) * R_LN2);
+  double2 s, t;
+  double u;
+
+  s = add2_ds(d, q * -L2U); // two-step subtraction of q * (L2U + L2L)
+  s = add2_ds(s, q * -L2L);
+
+  s = normalize_d(s);
+
+  u = 2.51069683420950419527139e-08; // polynomial in the high part of s only
+  u = mla(u, s.x, 2.76286166770270649116855e-07);
+  u = mla(u, s.x, 2.75572496725023574143864e-06);
+  u = mla(u, s.x, 2.48014973989819794114153e-05);
+  u = mla(u, s.x, 0.000198412698809069797676111);
+  u = mla(u, s.x, 0.0013888888939977128960529);
+  u = mla(u, s.x, 0.00833333333332371417601081);
+  u = mla(u, s.x, 0.0416666666665409524128449);
+  u = mla(u, s.x, 0.166666666666666740681535);
+  u = mla(u, s.x, 0.500000000000000999200722);
+
+  t = add_dd(s, mul_ds(squ_d(s), u)); // s + s^2 * poly
+
+  t = add_sd(1, t); // 1 + s + s^2 * poly
+  return dd(ldexpk(t.x, q), ldexpk(t.y, q)); // apply the exponent q to both halves
+}
+
+__inline double xsinh(double x) { // sinh(x) = (e^|x| - e^-|x|) / 2 with the sign of x re-applied
+  const double ax = xfabs(x);
+  const double2 e = expk2(dd(ax, 0));                       // e^|x| as a double-double
+  const double2 diff = add2_dd(e, div_dd(dd(-1, 0), e));    // e^|x| + (-1 / e^|x|)
+  double r = (diff.x + diff.y) * 0.5;
+
+  // NaN input stays NaN; infinite input or a NaN intermediate becomes a signed INFINITY
+  if (xisnan(x)) return NAN;
+  if (xisinf(x) || xisnan(r)) r = INFINITY;
+
+  return mulsign(r, x);
+}
+
+__inline double xcosh(double x) { // cosh(x) = (e^x + e^-x) / 2; NaN input stays NaN, infinite input or NaN intermediate gives INFINITY
+  const double2 e = expk2(dd(x, 0));
+  const double2 sum = add2_dd(e, div_dd(dd(1, 0), e));
+  const double r = (sum.x + sum.y) * 0.5;
+
+  if (xisnan(x)) return NAN;
+  if (xisinf(x) || xisnan(r)) return INFINITY;
+
+  return r;
+}
+
+__inline double xtanh(double x) { // tanh(x) = (e^|x| - e^-|x|) / (e^|x| + e^-|x|) with the sign of x re-applied
+  const double ax = xfabs(x);
+  const double2 ep = expk2(dd(ax, 0));            // e^|x|
+  const double2 em = div_dd(dd(1, 0), ep);        // e^-|x|
+  const double2 q = div_dd(add2_dd(ep, scale_d(em, -1)), add2_dd(ep, em));
+  double r = q.x + q.y;
+
+  // NaN input stays NaN; infinite input or a NaN quotient saturates to 1 before the sign is applied
+  if (xisnan(x)) return NAN;
+  if (xisinf(x) || xisnan(r)) r = 1.0;
+
+  return mulsign(r, x);
+}
+
+__inline double2 logk2(double2 d) { // logarithm kernel taking and returning a double2 (double-double)
+  double2 x, x2, m;
+  double t;
+  int e;
+
+  d = normalize_d(d);
+  e = ilogbp1(d.x * 0.7071); // exponent taken from the high part only
+  m = scale_d(d, ldexpk(1, -e)); // scale both halves by ldexpk(1, -e)
+
+  x = div_dd(add2_ds(m, -1), add2_ds(m, 1)); // (m - 1) / (m + 1)
+  x2 = squ_d(x);
+
+  t = 0.134601987501262130076155; // same coefficients as logk
+  t = mla(t, x2.x, 0.132248509032032670243288);
+  t = mla(t, x2.x, 0.153883458318096079652524);
+  t = mla(t, x2.x, 0.181817427573705403298686);
+  t = mla(t, x2.x, 0.222222231326187414840781);
+  t = mla(t, x2.x, 0.285714285651261412873718);
+  t = mla(t, x2.x, 0.400000000000222439910458);
+  t = mla(t, x2.x, 0.666666666666666371239645);
+
+  return add2_dd(mul_ds(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), // e * ln(2), ln(2) given as a high/low pair
+		 add2_dd(scale_d(x, 2), mul_ds(mul_dd(x2, x), t))); // 2*x + x^3 * poly
+}
+
+__inline double xasinh(double x) { // asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)), evaluated in double-double
+  const double ax = xfabs(x);
+  const double2 root = sqrt_d(add2_ds(mul_ss(ax, ax),  1));
+  const double2 lg = logk2(add2_ds(root, ax));
+  double r = lg.x + lg.y;
+
+  if (xisnan(x)) return NAN;
+  if (xisinf(x) || xisnan(r)) r = INFINITY;
+
+  return mulsign(r, x);
+}
+
+__inline double xacosh(double x) { // acosh(x) = log(x + sqrt(x^2 - 1)); NaN for x < 1 or NaN, 0 at x == 1
+  const double2 root = sqrt_d(add2_ds(mul_ss(x, x), -1));
+  const double2 lg = logk2(add2_ds(root, x));
+  const double r = lg.x + lg.y;
+
+  if (xisnan(x) || x < 1.0) return NAN;
+  if (x == 1.0) return 0.0;
+  if (xisinf(x) || xisnan(r)) return INFINITY;
+
+  return r;
+}
+
+__inline double xatanh(double x) { // atanh(x) = sign(x) * 0.5 * log((1 + |x|) / (1 - |x|)); infinite at |x| == 1, NaN beyond
+  const double ax = xfabs(x);
+  const double2 lg = logk2(div_dd(add2_ss(1, ax), add2_ss(1, -ax)));
+  double r;
+
+  if (xisnan(x)) return NAN;
+  if (ax > 1.0) r = NAN; else if (ax == 1.0) r = INFINITY; else r = (lg.x + lg.y) * 0.5;
+  if (xisinf(x) || xisnan(r)) r = NAN;
+
+  return mulsign(r, x);
+}
+
+//
+
+__inline double xfma(double x, double y, double z) { // software x*y + z that tracks the rounding error of the product and of the sum
+  union {
+    double f;
+    long long int i;
+  } tmp; // used to edit the bit pattern of a double
+
+  tmp.f = x;
+  tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; // add 2^26, then clear the low 27 bits: xh keeps only the upper bits of x
+  double xh = tmp.f, xl = x - xh;
+
+  tmp.f = y;
+  tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; // same high/low split for y
+  double yh = tmp.f, yl = y - yh;
+
+  double h = x * y; // rounded product
+  double l = xh * yh - h + xl * yh + xh * yl + xl * yl; // correction term rebuilt from the split halves
+
+  double h2, l2, v;
+
+  h2 = h + z; // rounded sum
+  v = h2 - h;
+  l2 = (h - (h2 - v)) + (z - v) + l; // error of the sum plus the product correction
+
+  return h2 + l2;
+}
+
+__inline double xsqrt(double d) { // max error : 0.5 ulp
+  double q = 1; // final scale factor, changed only for very small inputs
+
+  if (d < 8.636168555094445E-78) { // below about 2^-256: scale up first
+    d *= 1.157920892373162E77; // about 2^256
+    q = 2.9387358770557188E-39; // about 2^-128, the square root of the scale's inverse
+  }
+
+  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+  double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1)); // bit-level first guess for 1/sqrt(d)
+
+  x = x * (1.5 - 0.5 * d * x * x); // three refinement steps of the reciprocal square root
+  x = x * (1.5 - 0.5 * d * x * x);
+  x = x * (1.5 - 0.5 * d * x * x);
+
+  // You can change xfma to fma if fma is correctly implemented
+  x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x; // d*x approximates sqrt(d); correct it using the residual (d*x)^2 - d
+
+  return d == INFINITY ? INFINITY : x * q;
+}
+
+__inline double xcbrt(double d) { // max error : 2 ulps
+  double x, y, q = 1.0;
+  int e, r;
+
+  e = ilogbp1(d); // exponent removed from d below
+  d = ldexpk(d, -e);
+  r = (e + 6144) % 3; // 6144 is a multiple of 3 and keeps the operand of % non-negative for the exponents expected here
+  q = (r == 1) ? 1.2599210498948731647672106 : q; // about 2^(1/3)
+  q = (r == 2) ? 1.5874010519681994747517056 : q; // about 2^(2/3)
+  q = ldexpk(q, (e + 6144) / 3 - 2048); // 6144 / 3 == 2048, so this applies the exponent e / 3
+
+  q = mulsign(q, d); // carry the sign in q and continue with |d|
+  d = xfabs(d);
+
+  x = -0.640245898480692909870982; // polynomial first guess in d
+  x = x * d + 2.96155103020039511818595;
+  x = x * d + -5.73353060922947843636166;
+  x = x * d + 6.03990368989458747961407;
+  x = x * d + -3.85841935510444988821632;
+  x = x * d + 2.2307275302496609725722;
+
+  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); // refinement step using x^4
+  y = d * x * x;
+  y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q; // final correction, then apply scale and sign
+
+  return y;
+}
+
+__inline double xexp2(double a) { // 2^a = exp(a * ln 2), with ln 2 supplied as a double-double constant
+  const double2 ln2 = dd(0.69314718055994528623, 2.3190468138462995584e-17);
+  const double u = expk(mul_ds(ln2, a));
+  if (xisminf(a)) return 0;
+  return xispinf(a) ? INFINITY : u;
+}
+
+__inline double xexp10(double a) { // 10^a = exp(a * ln 10), with ln 10 supplied as a double-double constant
+  const double2 ln10 = dd(2.3025850929940459011, -2.1707562233822493508e-16);
+  const double u = expk(mul_ds(ln10, a));
+  if (xisminf(a)) return 0;
+  return xispinf(a) ? INFINITY : u;
+}
+
+__inline double xexpm1(double a) { // e^a - 1, with the subtraction done in double-double before collapsing to double
+  const double2 em1 = add2_ds(expk2(dd(a, 0)), -1.0);
+  if (xisminf(a)) return -1;
+  if (xispinf(a)) return INFINITY;
+
+  return em1.x + em1.y;
+}
+
+__inline double xlog10(double a) { // log10(a) = ln(a) * log10(e), multiplied out in double-double
+  const double2 log10e = dd(0.43429448190325176116, 6.6494347733425473126e-17);
+  const double2 p = mul_dd(logk(a), log10e);
+
+  if (a == 0) return -INFINITY;
+  if (a < 0) return NAN;
+  if (xisinf(a)) return INFINITY;
+
+  return p.x + p.y;
+}
+
+__inline double xlog1p(double a) { // log(1 + a), forming a + 1 as a double-double sum before taking the logarithm
+  const double2 lg = logk2(add2_ss(a, 1));
+
+  // -INFINITY at a == -1, NaN below -1, INFINITY for +INFINITY
+  if (a == -1) return -INFINITY;
+  if (a < -1) return NAN;
+  if (xisinf(a)) return INFINITY;
+
+  return lg.x + lg.y;
+}
+
+///////////////////////////////////////////
+
+#define PI4_Af 0.78515625f // PI4_Af + PI4_Bf + PI4_Cf + PI4_Df add up to pi/4 (0.785398...), largest part first
+#define PI4_Bf 0.00024127960205078125f
+#define PI4_Cf 6.3329935073852539062e-07f
+#define PI4_Df 4.9604681473525147339e-10f
+
+#define L2Uf 0.693145751953125f // L2Uf + L2Lf add up to ln(2) (0.693147...)
+#define L2Lf 1.428606765330187045e-06f
+
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f // 1 / ln(2)
+#define M_PIf ((float)M_PI)
+
+#define INFINITYf ((float)INFINITY)
+#define NANf ((float)NAN)
+
+__inline int32_t floatToRawIntBits(float d) { // returns the 32-bit pattern stored in d, read back as int32_t
+  union {
+    float as_float;
+    int32_t as_bits;
+  } pun = { d }; // a brace initializer sets the first member (as_float)
+
+  return pun.as_bits;
+}
+
+__inline float intBitsToFloat(int32_t i) { // returns the float whose 32-bit pattern is i
+  union {
+    int32_t as_bits;
+    float as_float;
+  } pun = { i }; // a brace initializer sets the first member (as_bits)
+
+  return pun.as_float;
+}
+
+__inline float xfabsf(float x) { // absolute value computed on the bit pattern
+  return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x)); // the mask keeps every bit except the top (sign) bit
+}
+
+__inline float mulsignf(float x, float y) { // returns x with its sign bit XORed with the sign bit of y
+  return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & ~0x7fffffff)); // ~0x7fffffff selects the sign bit; the former 1 << 31 overflows int, which is undefined in C
+}
+
+__inline float signf(float d) { return mulsignf(1, d); } // 1.0f with the sign bit of d applied
+__inline float mlaf(float x, float y, float z) { return x * y + z; } // multiply-add written as a plain expression
+__inline float xrintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } // rounds by adding -/+0.5f and truncating; the value passes through int, so it must fit in int
+
+__inline int xisnanf(float x) { return x != x; } // only NaN compares unequal to itself
+__inline int xisinff(float x) { return x == INFINITYf || x == -INFINITYf; } // true for either infinity
+__inline int xisminff(float x) { return x == -INFINITYf; } // true only for negative infinity
+__inline int xispinff(float x) { return x == INFINITYf; } // true only for positive infinity
+
+__inline int ilogbp1f(float d) { // exponent field of d minus 0x7e; inputs below 2^-64 are scaled by 2^64 first
+  const int tiny = d < 5.421010862427522E-20f;
+  int bias = 0x7e;
+  if (tiny) { d = 1.8446744073709552E19f * d; bias += 64; }  // compensate for the scaling in the bias
+  const int expo = (floatToRawIntBits(d) >> 23) & 0xff;      // the 8 exponent bits
+  return expo - bias;
+}
+
+__inline float ldexpkf(float x, int q) { // scales x by a power of two, split into four coarse factors and one remainder factor
+  float u;
+  int m;
+  m = q >> 31; // 0 for non-negative q, -1 for negative q (arithmetic shift assumed)
+  m = (((m + q) >> 6) - m) << 4; // coarse exponent, a multiple of 16
+  q = q - (m << 2); // remainder left after four coarse factors
+  u = intBitsToFloat(((int32_t)(m + 0x7f)) << 23); // float with exponent field m + 0x7f, i.e. 2^m
+  x = x * u * u * u * u;
+  u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23); // 2^q for the remainder
+  return x * u;
+}
+
+__inline float xcbrtf(float d) { // max error : 2 ulps
+  float x, y, q = 1.0f;
+  int e, r;
+
+  e = ilogbp1f(d); // exponent removed from d below
+  d = ldexpkf(d, -e);
+  r = (e + 6144) % 3; // 6144 is a multiple of 3 and keeps the operand of % non-negative
+  q = (r == 1) ? 1.2599210498948731647672106f : q; // about 2^(1/3)
+  q = (r == 2) ? 1.5874010519681994747517056f : q; // about 2^(2/3)
+  q = ldexpkf(q, (e + 6144) / 3 - 2048); // 6144 / 3 == 2048, so this applies the exponent e / 3
+
+  q = mulsignf(q, d); // carry the sign in q and continue with |d|
+  d = xfabsf(d);
+
+  x = -0.601564466953277587890625f; // polynomial first guess in d
+  x = mlaf(x, d, 2.8208892345428466796875f);
+  x = mlaf(x, d, -5.532182216644287109375f);
+  x = mlaf(x, d, 5.898262500762939453125f);
+  x = mlaf(x, d, -3.8095417022705078125f);
+  x = mlaf(x, d, 2.2241256237030029296875f);
+
+  y = d * x * x;
+  y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; // one correction step, then apply scale and sign
+
+  return y;
+}
+
+__inline float xsinf(float d) { // single-precision sine: remove q multiples of pi, then evaluate an odd polynomial
+  int q;
+  float u, s;
+
+  q = (int)xrintf(d * (float)M_1_PI); // d / pi rounded to an integer
+
+  d = mlaf(q, -PI4_Af*4, d); // four-step subtraction of q * 4 * (PI4_Af + ... + PI4_Df), i.e. q * pi
+  d = mlaf(q, -PI4_Bf*4, d);
+  d = mlaf(q, -PI4_Cf*4, d);
+  d = mlaf(q, -PI4_Df*4, d);
+
+  s = d * d;
+
+  if ((q & 1) != 0) d = -d; // odd q flips the sign of the result
+
+  u = 2.6083159809786593541503e-06f; // Horner evaluation of the polynomial in s
+  u = mlaf(u, s, -0.0001981069071916863322258f);
+  u = mlaf(u, s, 0.00833307858556509017944336f);
+  u = mlaf(u, s, -0.166666597127914428710938f);
+
+  u = mlaf(s, u * d, d); // d + d * s * poly(s)
+
+  return u;
+}
+
+__inline float xcosf(float d) { // single-precision cosine: shift by an odd multiple q of pi/2, then reuse the sine polynomial
+  int q;
+  float u, s;
+
+  q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); // q is always odd
+
+  d = mlaf(q, -PI4_Af*2, d); // four-step subtraction of q * 2 * (PI4_Af + ... + PI4_Df), i.e. q * pi/2
+  d = mlaf(q, -PI4_Bf*2, d);
+  d = mlaf(q, -PI4_Cf*2, d);
+  d = mlaf(q, -PI4_Df*2, d);
+
+  s = d * d;
+
+  if ((q & 2) == 0) d = -d; // bit 1 of q selects the sign of the result
+
+  u = 2.6083159809786593541503e-06f; // same coefficients as xsinf
+  u = mlaf(u, s, -0.0001981069071916863322258f);
+  u = mlaf(u, s, 0.00833307858556509017944336f);
+  u = mlaf(u, s, -0.166666597127914428710938f);
+
+  u = mlaf(s, u * d, d); // d + d * s * poly(s)
+
+  return u;
+}
+
+__inline float2 xsincosf(float d) { // returns sine in .x and cosine in .y; both NaN for infinite d
+  int q;
+  float u, s, t;
+  float2 r;
+
+  q = (int)rint(d * ((float)(2 * M_1_PI))); // uses libm rint here, whereas xsinf/xtanf use xrintf
+
+  s = d;
+
+  s = mlaf(q, -PI4_Af*2, s); // four-step subtraction of q * pi/2
+  s = mlaf(q, -PI4_Bf*2, s);
+  s = mlaf(q, -PI4_Cf*2, s);
+  s = mlaf(q, -PI4_Df*2, s);
+
+  t = s; // keep the reduced argument; s becomes its square
+
+  s = s * s;
+
+  u = -0.000195169282960705459117889f; // sine polynomial in s
+  u = mlaf(u, s, 0.00833215750753879547119141f);
+  u = mlaf(u, s, -0.166666537523269653320312f);
+  u = u * s * t;
+
+  r.x = t + u;
+
+  u = -2.71811842367242206819355e-07f; // cosine polynomial in s
+  u = mlaf(u, s, 2.47990446951007470488548e-05f);
+  u = mlaf(u, s, -0.00138888787478208541870117f);
+  u = mlaf(u, s, 0.0416666641831398010253906f);
+  u = mlaf(u, s, -0.5f);
+
+  r.y = u * s + 1;
+
+  if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } // odd q: the two results trade places
+  if ((q & 2) != 0) { r.x = -r.x; } // sign of the sine slot
+  if (((q+1) & 2) != 0) { r.y = -r.y; } // sign of the cosine slot
+
+  if (xisinff(d)) { r.x = r.y = NANf; }
+
+  return r;
+}
+
+__inline float xtanf(float d) { // single-precision tangent; NaN for infinite d
+  int q;
+  float u, s, x;
+
+  q = (int)xrintf(d * (float)(2 * M_1_PI)); // d / (pi/2) rounded to an integer
+
+  x = d;
+
+  x = mlaf(q, -PI4_Af*2, x); // four-step subtraction of q * pi/2
+  x = mlaf(q, -PI4_Bf*2, x);
+  x = mlaf(q, -PI4_Cf*2, x);
+  x = mlaf(q, -PI4_Df*2, x);
+
+  s = x * x;
+
+  if ((q & 1) != 0) x = -x; // odd q: negate here and take the reciprocal below
+
+  u = 0.00927245803177356719970703f; // Horner evaluation of the polynomial in s
+  u = mlaf(u, s, 0.00331984995864331722259521f);
+  u = mlaf(u, s, 0.0242998078465461730957031f);
+  u = mlaf(u, s, 0.0534495301544666290283203f);
+  u = mlaf(u, s, 0.133383005857467651367188f);
+  u = mlaf(u, s, 0.333331853151321411132812f);
+
+  u = mlaf(s, u * x, x); // x + x * s * poly(s)
+
+  if ((q & 1) != 0) u = 1.0f / u; // together with the negation above this yields -1/tan(reduced argument)
+
+  if (xisinff(d)) u = NANf;
+
+  return u;
+}
+
+__inline float xatanf(float s) { // single-precision arctangent: fold into [0, 1], evaluate a polynomial, undo the folds
+  float t, u;
+  int q = 0; // bit 1: input was negative; bit 0: input was inverted
+
+  if (s < 0) { s = -s; q = 2; } // work on |s| and negate the result at the end
+  if (s > 1) { s = 1.0f / s; q |= 1; } // for s > 1 evaluate at 1/s and subtract from the pi/2 constant below
+
+  t = s * s;
+
+  u = 0.00282363896258175373077393f; // Horner evaluation of the polynomial in t
+  u = mlaf(u, t, -0.0159569028764963150024414f);
+  u = mlaf(u, t, 0.0425049886107444763183594f);
+  u = mlaf(u, t, -0.0748900920152664184570312f);
+  u = mlaf(u, t, 0.106347933411598205566406f);
+  u = mlaf(u, t, -0.142027363181114196777344f);
+  u = mlaf(u, t, 0.199926957488059997558594f);
+  u = mlaf(u, t, -0.333331018686294555664062f);
+
+  t = s + s * (t * u); // s + s^3 * poly(s^2)
+
+  if ((q & 1) != 0) t = 1.570796326794896557998982f - t; // undo the inversion (constant is pi/2)
+  if ((q & 2) != 0) t = -t; // undo the sign fold
+
+  return t;
+}
+
+__inline float atan2kf(float y, float x) { // atan2 kernel; the callers in this file pass a non-negative y and fix signs afterwards
+  float s, t, u;
+  int q = 0; // number of pi/2 steps added to the polynomial result
+
+  if (x < 0) { x = -x; q = -2; } // mirror negative x; accounted for with -2 steps
+  if (y > x) { t = x; x = y; y = -t; q += 1; } // swap and negate so that |y / x| <= 1, adding one step
+
+  s = y / x;
+  t = s * s;
+
+  u = 0.00282363896258175373077393f; // same coefficients as xatanf
+  u = mlaf(u, t, -0.0159569028764963150024414f);
+  u = mlaf(u, t, 0.0425049886107444763183594f);
+  u = mlaf(u, t, -0.0748900920152664184570312f);
+  u = mlaf(u, t, 0.106347933411598205566406f);
+  u = mlaf(u, t, -0.142027363181114196777344f);
+  u = mlaf(u, t, 0.199926957488059997558594f);
+  u = mlaf(u, t, -0.333331018686294555664062f);
+
+  t = u * t * s + s; // s + s^3 * poly(s^2)
+  t = q * (float)(M_PI/2) + t; // add q * pi/2
+
+  return t;
+}
+
+__inline float xatan2f(float y, float x) { // atan2 for floats: kernel on |y|, special cases, then the sign of y is applied
+  const int x_inf = xisinff(x), y_inf = xisinff(y);
+  float r = mulsignf(atan2kf(xfabsf(y), x), x);
+
+  if (x_inf || x == 0) r = M_PIf/2 - (x_inf ? (signf(x) * (float)(M_PI  /2)) : 0);
+  if (y_inf          ) r = M_PIf/2 - (x_inf ? (signf(x) * (float)(M_PI*1/4)) : 0);
+  if (        y == 0 ) r = (signf(x) == -1 ? M_PIf : 0);
+  if (xisnanf(x) || xisnanf(y)) return NANf;
+  return mulsignf(r, y);
+}
+
+__inline float xasinf(float d) { // single-precision arcsine, built on the atan2kf kernel
+  return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d); // atan2kf(|d|, sqrt((1+d)(1-d))), then d's sign bit is re-applied
+}
+
+__inline float xacosf(float d) { // single-precision arccosine: r = atan2kf(sqrt((1+d)(1-d)), |d|); result is r for d >= 0 and pi - r for d < 0
+  return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d < 0 ? -1.0f : 1.0f) + (d < 0 ? (float)M_PI : 0.0f); // sign is taken from (d < 0), not from d's sign bit, so acosf(-0.0f) is +pi/2 instead of -pi/2
+}
+
+__inline float xlogf(float d) { // single-precision natural logarithm: INFINITY for infinite d, NaN for d < 0, -INFINITY for d == 0
+  float x, x2, t, m;
+  int e;
+
+  e = ilogbp1f(d * 0.7071f); // exponent used to scale d; the 0.7071 factor shifts where the exponent steps
+  m = ldexpkf(d, -e); // d scaled by 2^-e
+
+  x = (m-1.0f) / (m+1.0f);
+  x2 = x * x;
+
+  t = 0.2371599674224853515625f; // Horner evaluation of the polynomial in x2
+  t = mlaf(t, x2, 0.285279005765914916992188f);
+  t = mlaf(t, x2, 0.400005519390106201171875f);
+  t = mlaf(t, x2, 0.666666567325592041015625f);
+  t = mlaf(t, x2, 2.0f);
+
+  x = x * t + 0.693147180559945286226764f * e; // x * poly + e * ln(2)
+
+  if (xisinff(d)) x = INFINITYf; // later checks override earlier ones, so -INFINITY input ends up NaN
+  if (d < 0) x = NANf;
+  if (d == 0) x = -INFINITYf;
+
+  return x;
+}
+
+__inline float xexpf(float d) { // single-precision e^d
+  if(d<=-104.0f) return 0.0f; // early exit for very negative input; this also covers -INFINITY
+
+  int q = (int)xrintf(d * R_LN2f); // multiple of ln(2) removed from d
+  float s, u;
+
+  s = mlaf(q, -L2Uf, d); // two-step subtraction of q * (L2Uf + L2Lf)
+  s = mlaf(q, -L2Lf, s);
+
+  u = 0.00136324646882712841033936f; // Horner evaluation of the polynomial in s
+  u = mlaf(u, s, 0.00836596917361021041870117f);
+  u = mlaf(u, s, 0.0416710823774337768554688f);
+  u = mlaf(u, s, 0.166665524244308471679688f);
+  u = mlaf(u, s, 0.499999850988388061523438f);
+
+  u = s * s * u + s + 1.0f; // 1 + s + s^2 * poly(s)
+  u = ldexpkf(u, q); // scale by 2^q
+
+// No separate -INFINITY check is needed here: the d <= -104 early return above already yields 0.
+  return u;
+}
diff --git a/rtengine/sleef.h b/rtengine/sleef.h
new file mode 100644
index 000000000..ab42eda40
--- /dev/null
+++ b/rtengine/sleef.h
@@ -0,0 +1,51 @@
+typedef struct {
+  double x, y;
+} double2;
+
+typedef struct {
+  float x, y;
+} float2;
+
+double xsin(double d);
+double xcos(double d);
+double2 xsincos(double d);
+double xtan(double d);
+double xasin(double s);
+double xacos(double s);
+double xatan(double s);
+double xatan2(double y, double x);
+double xlog(double d);
+double xexp(double d);
+double xpow(double x, double y);
+
+double xsinh(double x);
+double xcosh(double x);
+double xtanh(double x);
+double xasinh(double x);
+double xacosh(double x);
+double xatanh(double x);
+double xldexp(double x, int q);
+int xilogb(double d);
+
+double xfma(double x, double y, double z);
+double xsqrt(double d);
+double xcbrt(double d);
+
+double xexp2(double a);
+double xexp10(double a);
+double xexpm1(double a);
+double xlog10(double a);
+double xlog1p(double a);
+
+float xsinf(float d);
+float xcosf(float d);
+float2 xsincosf(float d);
+float xtanf(float d);
+float xasinf(float s);
+float xacosf(float s);
+float xatanf(float s);
+float xatan2f(float y, float x);
+float xlogf(float d);
+float xexpf(float d);
+float xpowf(float x, float y);
+float xcbrtf(float d);
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
new file mode 100644
index 000000000..976a57c08
--- /dev/null
+++ b/rtengine/sleefsseavx.c
@@ -0,0 +1,1295 @@
+#ifndef SLEEFSSEAVX
+#define SLEEFSSEAVX
+
+#include <assert.h>
+#include <math.h>
+//#include <bits/nan.h>
+//#include <bits/inf.h>
+//#include "sleefsseavx.h"
+#ifdef __SSE2__
+#include "helpersse2.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "helperavx.h"
+#endif
+#undef INLINE  /* helperavx.h (and presumably helpersse2.h) already defines INLINE differently; drop it so the redefinition below is well-formed */
+#ifdef __GNUC__
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+//
+
+#define PI4_A .7853981554508209228515625 // PI4_A + PI4_B + PI4_C add up to pi/4 (0.785398...), largest part first
+#define PI4_B .794662735614792836713604629039764404296875e-8
+#define PI4_C .306161699786838294306516483068750264552437361480769e-16
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125 // 4 / pi
+
+#define L2U .69314718055966295651160180568695068359375 // L2U + L2L add up to ln(2) (0.693147...)
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 // 1 / ln(2)
+
+//
+
+#define PI4_Af 0.78515625f // PI4_Af + PI4_Bf + PI4_Cf + PI4_Df add up to pi/4 (0.785398...), largest part first
+#define PI4_Bf 0.00024127960205078125f
+#define PI4_Cf 6.3329935073852539062e-07f
+#define PI4_Df 4.9604681473525147339e-10f
+
+#define L2Uf 0.693145751953125f // L2Uf + L2Lf add up to ln(2) (0.693147...)
+#define L2Lf 1.428606765330187045e-06f
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f // 1 / ln(2)
+
+#define INFINITYf ((float)INFINITY)
+#define NANf ((float)NAN)
+
+//
+
+static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) { // (v0 + v1) + v2, added left to right
+  return vadd(vadd(v0, v1), v2);
+}
+
+static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { // ((v0 + v1) + v2) + v3, added left to right
+  return vadd(vadd3(v0, v1, v2), v3);
+}
+
+static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { // five-term sum, added left to right
+  return vadd(vadd4(v0, v1, v2, v3), v4);
+}
+
+static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { // six-term sum, added left to right
+  return vadd(vadd5(v0, v1, v2, v3, v4), v5);
+}
+
+static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) { // seven-term sum, added left to right
+  return vadd(vadd6(v0, v1, v2, v3, v4, v5), v6);
+}
+
+static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) { // (v0 - v1) - v2, subtracted left to right
+  return vsub(vsub(v0, v1), v2);
+}
+
+static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { // ((v0 - v1) - v2) - v3, subtracted left to right
+  return vsub(vsub3(v0, v1, v2), v3);
+}
+
+static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { // v0 minus the other four, subtracted left to right
+  return vsub(vsub4(v0, v1, v2, v3), v4);
+}
+
+//
+
+static INLINE vdouble2 normalize_d(vdouble2 t) { // re-splits t: .x becomes the rounded sum t.x + t.y, .y becomes (t.x - .x) + t.y
+  vdouble2 out;
+  vdouble hi = vadd(t.x, t.y);
+
+  out.x = hi;
+  out.y = vadd(vsub(t.x, hi), t.y);
+  return out;
+}
+
+static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) { // multiplies both components of d by s
+  vdouble2 r = {vmul(d.x, s), vmul(d.y, s)}; // positional initializer: .x first, then .y
+  return r;
+}
+
+static INLINE vdouble2 add_ss(vdouble x, vdouble y) { // x + y as a two-component value
+  vdouble sum = vadd(x, y);
+  vdouble lo = vadd(vsub(x, sum), y);   // (x - sum) + y
+  vdouble2 r = { sum, lo };
+
+  // .x is the rounded sum, .y the leftover term computed above
+  return r;
+}
+
+static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd(x, y);
+  vdouble v = vsub(r.x, x);
+  r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
+
+  return r;
+}
+
+static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) { // two-component x plus plain y
+  vdouble sum = vadd(x.x, y);
+  vdouble lo = vadd3(vsub(x.x, sum), y, x.y);   // ((x.x - sum) + y) + x.y
+  vdouble2 r = { sum, lo };
+
+  // .x is the rounded high sum, .y the leftover term plus the low part of x
+  return r;
+}
+
+static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd(x.x, y);
+  vdouble v = vsub(r.x, x.x);
+  r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
+  r.y = vadd(r.y, x.y);
+
+  return r;
+}
+
+static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) { // plain x plus two-component y
+  vdouble sum = vadd(x, y.x);
+  vdouble lo = vadd3(vsub(x, sum), y.x, y.y);   // ((x - sum) + y.x) + y.y
+  vdouble2 r = { sum, lo };
+
+  // .x is the rounded high sum, .y the leftover term plus the low part of y
+  return r;
+}
+
+static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) { // sum of two two-component values
+  // Precondition noted by the original author: |x| >= |y|
+
+  vdouble sum = vadd(x.x, y.x);
+  vdouble lo = vadd4(vsub(x.x, sum), y.x, x.y, y.y);   // (((x.x - sum) + y.x) + x.y) + y.y
+  vdouble2 r = { sum, lo };
+
+  // .x is the rounded sum of the high parts, .y the leftover term plus both low parts
+  return r;
+}
+
+static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
+  vdouble2 r;
+
+  r.x  = vadd(x.x, y.x);
+  vdouble v = vsub(r.x, x.x);
+  r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
+  r.y = vadd(r.y, vadd(x.y, y.y));
+
+  return r;
+}
+
+static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) { // quotient n / d of two two-component values
+  vdouble t = vrec(d.x); // reciprocal of the high part of d (vrec is defined in the helper header)
+  vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh); // split each operand into vupper(v) and the remainder v - vupper(v)
+  vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
+  vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
+
+  vdouble2 q;
+
+  q.x = vmul(n.x, t); // high part: n.x * (1 / d.x)
+
+  vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl), // n.x * t rebuilt from the halves, minus q.x
+		    vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)))); // plus q.x * (1 - d.x * t)
+
+  q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u); // low parts of n and d enter here
+
+  return q;
+}
+
+static INLINE vdouble2 mul_ss(vdouble x, vdouble y) { // product x * y as a two-component value
+  vdouble xh = vupper(x), xl = vsub(x, xh); // split x into vupper(x) and the remainder
+  vdouble yh = vupper(y), yl = vsub(y, yh); // same split for y
+  vdouble2 r;
+
+  r.x = vmul(x, y); // rounded product
+  r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl)); // partial products minus r.x
+
+  return r;
+}
+
+static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) { // two-component x times plain y
+  vdouble xh = vupper(x.x), xl = vsub(x.x, xh); // split the high part of x
+  vdouble yh = vupper(y  ), yl = vsub(y  , yh); // split y
+  vdouble2 r;
+
+  r.x = vmul(x.x, y); // rounded product of the high part with y
+  r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y)); // partial products minus r.x, plus x.y * y
+
+  return r;
+}
+
+static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) { // product of two two-component values
+  vdouble xh = vupper(x.x), xl = vsub(x.x, xh); // split the high part of x
+  vdouble yh = vupper(y.x), yl = vsub(y.x, yh); // split the high part of y
+  vdouble2 r;
+
+  r.x = vmul(x.x, y.x); // rounded product of the high parts
+  r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x)); // partial products minus r.x, plus the two cross terms; x.y * y.y is not included
+
+  return r;
+}
+
+static INLINE vdouble2 squ_d(vdouble2 x) { // square of a two-component value
+  vdouble xh = vupper(x.x), xl = vsub(x.x, xh); // split the high part of x
+  vdouble2 r;
+
+  r.x = vmul(x.x, x.x); // rounded square of the high part
+  r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y))); // xh^2 - r.x + 2*xh*xl + xl^2 + 2*x.x*x.y
+
+  return r;
+}
+
+static INLINE vdouble2 rec_s(vdouble d) { // reciprocal of d as a two-component value
+  vdouble t = vrec(d); // reciprocal from the helper header's vrec
+  vdouble dh = vupper(d), dl = vsub(d, dh); // split d
+  vdouble th = vupper(t), tl = vsub(t, th); // split t
+  vdouble2 q;
+
+  q.x = t;
+  q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))); // t * (1 - d * t), with d * t rebuilt from the halves
+
+  return q;
+}
+
+static INLINE vdouble2 sqrt_d(vdouble2 d) { // square root of a two-component value
+  vdouble t = vsqrt(vadd(d.x, d.y)); // plain square root of the collapsed value
+  return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5)); // (d + t*t) * (1/t) * 0.5
+}
+
+//
+
+static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); } // forwards to the helper header's vldexp
+
+static INLINE vint xilogb(vdouble d) { // per-lane vilogbp1(|d|) - 1, with fixed results for zero and infinite lanes
+  vdouble mag = vabs(d);
+  vdouble e = vcast_vd_vi(vsubi(vilogbp1(mag), vcast_vi_i(1)));
+  e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);   // zero lanes
+  return vrint_vi_vd(vsel(vmask_eq(mag, vcast_vd_d(INFINITY)), vcast_vd_d(2147483647), e));   // infinite lanes
+}
+
+static INLINE vdouble xsin(vdouble d) { // vector sine: remove q multiples of pi per lane, then evaluate an odd polynomial
+  vint q;
+  vdouble u, s;
+
+  q = vrint_vi_vd(vmul(d, vcast_vd_d(M_1_PI))); // d / pi rounded to an integer
+
+  u = vcast_vd_vi(q);
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4))); // three-step subtraction of q * 4 * (PI4_A + PI4_B + PI4_C), i.e. q * pi
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
+
+  s = vmul(d, d);
+
+  d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d); // lanes with odd q are negated
+
+  u = vcast_vd_d(-7.97255955009037868891952e-18); // Horner evaluation of the polynomial in s
+  u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+  u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+  u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+  u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+  u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+  u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+  u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+  u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+  u = vmla(s, vmul(u, d), d); // presumably d + d * s * poly(s), if vmla(x, y, z) is x*y + z - confirm in the helper header
+
+  return u;
+}
+
+static INLINE vdouble xcos(vdouble d) { // vector cosine: shift each lane by an odd multiple q of pi/2, then reuse the sine polynomial
+  vint q;
+  vdouble u, s;
+
+  q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(M_1_PI)), vcast_vd_d(0.5)));
+  q = vaddi(vaddi(q, q), vcast_vi_i(1)); // q = 2*q + 1, always odd
+
+  u = vcast_vd_vi(q);
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2))); // three-step subtraction of q * 2 * (PI4_A + PI4_B + PI4_C), i.e. q * pi/2
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
+  d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+  s = vmul(d, d);
+
+  d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d); // lanes whose q has bit 1 clear are negated
+
+  u = vcast_vd_d(-7.97255955009037868891952e-18); // same coefficients as xsin
+  u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+  u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+  u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+  u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+  u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+  u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+  u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+  u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+  u = vmla(s, vmul(u, d), d); // presumably d + d * s * poly(s), if vmla(x, y, z) is x*y + z - confirm in the helper header
+
+  return u;
+}
+
+static INLINE vdouble2 xsincos(vdouble d) { // vector sincos: sine in .x, cosine in .y; NaN in lanes where d is infinite
+  vint q;
+  vmask m;
+  vdouble u, s, t, rx, ry;
+  vdouble2 r;
+
+  q = vrint_vi_vd(vmul(d, vcast_vd_d(M_2_PI))); // d / (pi/2) rounded to an integer
+
+  s = d;
+
+  u = vcast_vd_vi(q);
+  s = vmla(u, vcast_vd_d(-PI4_A*2), s); // three-step subtraction of q * pi/2
+  s = vmla(u, vcast_vd_d(-PI4_B*2), s);
+  s = vmla(u, vcast_vd_d(-PI4_C*2), s);
+
+  t = s; // keep the reduced argument; s becomes its square
+
+  s = vmul(s, s);
+
+  u = vcast_vd_d(1.58938307283228937328511e-10); // sine polynomial in s
+  u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
+  u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
+  u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
+  u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
+  u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
+  u = vmul(vmul(u, s), t);
+
+  rx = vadd(t, u);
+
+  u = vcast_vd_d(-1.13615350239097429531523e-11); // cosine polynomial in s
+  u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
+  u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
+  u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
+  u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
+  u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
+  u = vmla(u, s, vcast_vd_d(-0.5));
+
+  ry = vadd(vcast_vd_d(1), vmul(s, u));
+
+  m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0)); // lanes with even q keep rx/ry in place, odd lanes swap them
+  r.x = vsel(m, rx, ry);
+  r.y = vsel(m, ry, rx);
+
+  m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
+  r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); // flip the sign bit of .x in masked lanes (-0.0 is the sign-bit pattern)
+
+  m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
+  r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); // same for .y, keyed on q + 1
+
+  m = vmask_isinf(d);
+  r.x = vsel(m, vcast_vd_d(NAN), r.x);
+  r.y = vsel(m, vcast_vd_d(NAN), r.y);
+
+  return r;
+}
+
+static INLINE vdouble xtan(vdouble d) { // per-lane tangent of d; infinite inputs give NaN
+  vint q;
+  vdouble u, s, x;
+  vmask m;
+  // q = d * (2/pi) converted to an integer by vrint_vi_vd.
+  q = vrint_vi_vd(vmul(d, vcast_vd_d(M_2_PI)));
+
+  u = vcast_vd_vi(q);
+  x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2))); // x = d - q * 2*(PI4_A + PI4_B + PI4_C), subtracted part by part
+  x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
+  x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+  s = vmul(x, x);
+  // For odd q the reduced argument is negated here and the result is inverted after the polynomial.
+  m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
+  x = vsel(m, vneg(x), x);
+  // Horner evaluation of the polynomial in s = x * x.
+  u = vcast_vd_d(1.01419718511083373224408e-05);
+  u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
+  u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
+  u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
+  u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
+  u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
+  u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
+  u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
+  u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
+  u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
+  u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
+  u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
+  u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
+  u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
+  u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
+  // u = x + x * s * poly
+  u = vmla(s, vmul(u, x), x);
+  // Lanes with odd q take the reciprocal (vrec) of the value computed from the negated argument.
+  u = vsel(m, vrec(u), u);
+
+  u = vsel(vmask_isinf(d), vcast_vd_d(NAN), u);
+
+  return u;
+}
+
+static INLINE vdouble atan2k(vdouble y, vdouble x) { // arctangent kernel shared by xatan2, xasin and xacos; callers apply signs and special cases
+  vdouble s, t, u;
+  vint q;
+  vmask p;
+  // q counts multiples of pi/2 to add at the end: -2 where x < 0, otherwise 0.
+  q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
+  x = vabs(x);
+  // Where |x| < y: q is incremented and the ratio becomes -|x| / y; otherwise it is y / |x|.
+  q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
+  p = vmask_lt(x, y);
+  s = vsel (p, vneg(x), y);
+  t = vmax (x, y);
+
+  s = vdiv(s, t);
+  t = vmul(s, s);
+  // Horner evaluation of the polynomial in t = s * s.
+  u = vcast_vd_d(-1.88796008463073496563746e-05);
+  u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+  u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+  u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+  u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+  u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+  u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+  u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+  u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+  u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+  u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+  u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+  u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+  u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+  u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+  u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+  u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+  u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+  u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+  // Result = s + s * t * poly, plus q * (pi/2).
+  t = vadd(s, vmul(s, vmul(t, u)));
+  t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(M_PI/2)));
+
+  return t;
+}
+
+static INLINE vdouble xatan2(vdouble y, vdouble x) { // per-lane atan2(y, x): atan2k on |y| plus explicit special cases
+  vdouble r = atan2k(vabs(y), x);
+  // Later selects override earlier ones. visinf2 is defined outside this chunk; assumed to behave like visinf2f (second argument with the sign of x where x is infinite, else 0).
+  r = vmulsign(r, x);
+  r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(M_PI/2), visinf2(x, vmulsign(vcast_vd_d(M_PI/2), x))), r); // x infinite or zero
+  r = vsel(vmask_isinf(y), vsub(vcast_vd_d(M_PI/2), visinf2(x, vmulsign(vcast_vd_d(M_PI/4), x))), r); // y infinite
+  r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(M_PI), vcast_vd_d(0)), r); // y == 0: pi where vsign(x) is -1, else 0
+  // NaN in either input gives NaN; otherwise the sign of y is applied to r.
+  return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(NAN), vmulsign(r, y));
+}
+
+static INLINE vdouble xasin(vdouble d) {
+  // asin(d) = atan2(|d|, sqrt((1 + d) * (1 - d))); the sign of d is restored at the end.
+  const vdouble one = vcast_vd_d(1);
+  const vdouble root = vsqrt(vmul(vadd(one, d), vsub(one, d)));
+  const vdouble angle = atan2k(vabs(d), root);
+  // A NaN square root (for example when the product is negative) forces a NaN result.
+  const vdouble guarded = vsel(vmask_isnan(root), vcast_vd_d(NAN), angle);
+  return vmulsign(guarded, d);
+}
+
+static INLINE vdouble xacos(vdouble d) {
+  // acos(d) = sign(d) * atan2(sqrt((1 + d) * (1 - d)), |d|), plus pi in lanes where d < 0.
+  const vdouble one = vcast_vd_d(1);
+  const vdouble root = vsqrt(vmul(vadd(one, d), vsub(one, d)));
+  const vdouble angle = vmulsign(atan2k(root, vabs(d)), d);
+  // AND-ing the bit pattern of pi with the (d < 0) mask keeps pi only in the selected lanes.
+  const vmask negative = vmask_lt(d, vcast_vd_d(0));
+  const vdouble shift = (vdouble)vandm(negative, (vmask)vcast_vd_d(M_PI));
+
+  return vadd(angle, shift);
+}
+
+static INLINE vdouble xatan(vdouble s) { // per-lane arctangent of s
+  vdouble t, u;
+  vint q;
+  // Bit 1 of q records a negative input; the rest of the computation works on |s|.
+  q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
+  s = vabs(s);
+  // Bit 0 of q records |s| > 1, in which case s is replaced by 1 / s.
+  q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
+  s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
+
+  t = vmul(s, s);
+  // Horner evaluation of the polynomial in t = s * s (same coefficients as atan2k).
+  u = vcast_vd_d(-1.88796008463073496563746e-05);
+  u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+  u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+  u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+  u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+  u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+  u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+  u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+  u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+  u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+  u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+  u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+  u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+  u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+  u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+  u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+  u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+  u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+  u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+  // t = s + s * t * poly
+  t = vadd(s, vmul(s, vmul(t, u)));
+  // Undo the reductions: pi/2 - t where the reciprocal was taken, then negate where the input was negative.
+  t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(M_PI/2), t), t);
+  t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
+
+  return t;
+}
+
+static INLINE vdouble xlog(vdouble d) { // per-lane natural logarithm of d
+  vdouble x, x2;
+  vdouble t, m;
+  vint e;
+  // Split d into m * 2^e: e comes from vilogbp1(d * 0.7071), m = vldexp(d, -e).
+  e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+  m = vldexp(d, vsubi(vcast_vi_i(0), e));
+  // x = (m - 1) / (m + 1); the polynomial below is in x2 = x * x.
+  x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
+  x2 = vmul(x, x);
+
+  t = vcast_vd_d(0.148197055177935105296783);
+  t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
+  t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
+  t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
+  t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
+  t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
+  t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
+  t = vmla(t, x2, vcast_vd_d(2));
+  // Result = x * t + e * 0.693147... (the constant is ln 2).
+  x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
+  // Special cases, later selects winning: +Inf -> +Inf, d < 0 -> NaN, d == 0 -> -Inf.
+  x = vsel(vmask_ispinf(d), vcast_vd_d(INFINITY), x);
+  x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(NAN), x);
+  x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x);
+
+  return x;
+}
+
+static INLINE vdouble xexp(vdouble d) { // per-lane e^d; only the -Inf input is special-cased here
+  vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2))); // q = d * R_LN2 converted to an integer (R_LN2 defined outside this chunk; presumably 1 / ln 2 -- confirm)
+  vdouble s, u;
+  // s = d - q * L2U - q * L2L (L2U / L2L defined outside this chunk; presumably the upper and lower parts of ln 2 -- confirm).
+  s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+  s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+  // Horner evaluation of the polynomial in s.
+  u = vcast_vd_d(2.08860621107283687536341e-09);
+  u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
+  u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
+  u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
+  u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
+  u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
+  u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
+  u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
+  u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
+  u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
+  u = vmla(u, s, vcast_vd_d(0.5));
+  // u = 1 + s + s * s * poly
+  u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
+  // Scale by 2^q.
+  u = vldexp(u, q);
+  // -Inf input gives 0.
+  u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
+
+  return u;
+}
+
+static INLINE vdouble2 logk(vdouble d) { // natural logarithm of d returned as a vdouble2 pair; no special-case handling here
+  vdouble2 x, x2;
+  vdouble t, m;
+  vint e;
+  // Split d into m * 2^e, as in xlog.
+  e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+  m = vldexp(d, vsubi(vcast_vi_i(0), e));
+  // x = (m - 1) / (m + 1) and x2 = x * x, both computed with the vdouble2 helpers.
+  x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
+  x2 = squ_d(x);
+  x2 = normalize_d(x2);
+  // Horner evaluation in x2.x, the leading component of x2.
+  t = vcast_vd_d(0.134601987501262130076155);
+  t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+  t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+  t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+  t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+  t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+  t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+  t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+  // Result = e * ln2 (two-part constant) + 2 * x + x2 * x * t.
+  return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+		       vcast_vd_vi(e)),
+		add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble expk(vdouble2 d) { // e^(d.x + d.y) for a vdouble2 argument, returned as a single vdouble
+  vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+  vint q = vrint_vi_vd(u);
+  vdouble2 s, t;
+  // s = d - q * L2U - q * L2L, kept as a vdouble2 pair.
+  s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+  s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+  // Recompute q from u clamped to [-2047.49, 2047.49]; this clamped q is the one passed to vldexp below.
+  q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+  s = normalize_d(s);
+  // Horner evaluation in s.x, the leading component of s.
+  u = vcast_vd_d(2.51069683420950419527139e-08);
+  u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+  u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+  u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+  u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+  u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+  u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+  u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+  u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+  u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+  // t = s + s^2 * poly, then 1 is added.
+  t = add_dd(s, mul_ds(squ_d(s), u));
+
+  t = add_sd(vcast_vd_d(1), t);
+  u = vadd(t.x, t.y); // collapse the pair to one vdouble
+  u = vldexp(u, q);   // scale by 2^q
+
+  return u;
+}
+
+static INLINE vdouble xpow(vdouble x, vdouble y) { // per-lane x raised to the power y, with explicit special-case handling
+#if 1
+  vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y); // y equals its own integer conversion
+  vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint); // that integer is odd
+  // Magnitude: exp(y * log(|x|)), with the logarithm carried as a vdouble2 pair.
+  vdouble result = expk(mul_ds(logk(vabs(x)), y));
+
+  //result = vsel(vmask_isnan(result), vcast_vd_d(INFINITY), result);
+  // Factor: 1 where x > 0; otherwise -1 for odd integer y, 1 for even integer y, NaN for non-integer y.
+  result = vmul(result,
+		vsel(vmask_gt(x, vcast_vd_d(0)),
+		     vcast_vd_d(1),
+		     vsel(yisint,
+			  vsel(yisodd,
+			       vcast_vd_d(-1),
+			       vcast_vd_d(1)),
+			  vcast_vd_d(NAN))));
+  // efx = |x| - 1 with its sign bit XOR-ed with the sign bit of y.
+  vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
+  // Infinite y: 0 where efx < 0, 1 where efx == 0, +Inf otherwise.
+  result = vsel(vmask_isinf(y),
+		vsel(vmask_lt(efx, vcast_vd_d(0)),
+		     vcast_vd_d(0),
+		     vsel(vmask_eq(efx, vcast_vd_d(0)),
+			  vcast_vd_d(1.0),
+			  vcast_vd_d(INFINITY))),
+		result);
+  // Infinite or zero x: 0 or +Inf chosen from the sign of y (negated when x == 0), times vsign(x) when y is an odd integer.
+  result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
+		vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
+		     vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
+			  vcast_vd_d(0),
+			  vcast_vd_d(INFINITY))),
+		result);
+  // NaN in either input gives NaN ...
+  result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(NAN), result);
+  // ... except that y == 0 or x == 1 gives 1; this select is applied last and therefore wins.
+  result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
+
+  return result;
+#else
+  return expk(mul_ds(logk(x), y));
+#endif
+}
+
+static INLINE vdouble2 expk2(vdouble2 d) { // same computation as expk, but the result stays a vdouble2 pair
+  vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+  vint q = vrint_vi_vd(u);
+  vdouble2 s, t;
+  // s = d - q * L2U - q * L2L, kept as a vdouble2 pair.
+  s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+  s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+  // Recompute q from u clamped to [-2047.49, 2047.49]; this clamped q is the one passed to vldexp below.
+  q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+  s = normalize_d(s);
+  // Horner evaluation in s.x, the leading component of s.
+  u = vcast_vd_d(2.51069683420950419527139e-08);
+  u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+  u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+  u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+  u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+  u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+  u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+  u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+  u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+  u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+  // t = s + s^2 * poly, then 1 is added.
+  t = add_dd(s, mul_ds(squ_d(s), u));
+
+  t = add_sd(vcast_vd_d(1), t);
+  // Both components are scaled by 2^q.
+  return dd(vldexp(t.x, q), vldexp(t.y, q));
+}
+
+static INLINE vdouble xsinh(vdouble x) {
+  // sinh(|x|) = (e^|x| - 1 / e^|x|) / 2, with the exponential carried as a vdouble2 pair.
+  const vdouble2 ex = expk2(dd(vabs(x), vcast_vd_d(0)));
+  const vdouble2 minus_inv = div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), ex);
+  const vdouble2 diff = add2_dd(ex, minus_inv);
+  const vdouble half = vmul(vadd(diff.x, diff.y), vcast_vd_d(0.5));
+  // Infinite input or a NaN intermediate becomes +Inf before the sign of x is applied.
+  const vmask overflow = vorm(vmask_isinf(x), vmask_isnan(half));
+  const vdouble with_sign = vmulsign(vsel(overflow, vcast_vd_d(INFINITY), half), x);
+
+  return vsel(vmask_isnan(x), vcast_vd_d(NAN), with_sign);
+}
+
+static INLINE vdouble xcosh(vdouble x) {
+  // cosh(x) = (e^x + 1 / e^x) / 2, with the exponential carried as a vdouble2 pair.
+  const vdouble2 ex = expk2(dd(x, vcast_vd_d(0)));
+  const vdouble2 inv = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), ex);
+  const vdouble2 sum = add2_dd(ex, inv);
+  const vdouble half = vmul(vadd(sum.x, sum.y), vcast_vd_d(0.5));
+  // Infinite input or a NaN intermediate gives +Inf; a NaN input gives NaN.
+  const vdouble capped = vsel(vorm(vmask_isinf(x), vmask_isnan(half)), vcast_vd_d(INFINITY), half);
+  return vsel(vmask_isnan(x), vcast_vd_d(NAN), capped);
+}
+
+static INLINE vdouble xtanh(vdouble x) { // per-lane hyperbolic tangent of x
+  vdouble y = vabs(x);
+  vdouble2 d = expk2(dd(y, vcast_vd_d(0)));                 // d = e^|x| as a vdouble2 pair
+  vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d); // e = 1 / d
+  d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e)); // (d - e) / (d + e)
+  y = vadd(d.x, d.y); // collapse the pair with vadd, as the sibling functions do, instead of the compiler-specific '+' on vector types
+  // Infinite input or a NaN intermediate becomes 1 before the sign of x is applied.
+  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
+  y = vmulsign(y, x);
+  y = vsel(vmask_isnan(x), vcast_vd_d(NAN), y);
+
+  return y;
+}
+
+static INLINE vdouble2 logk2(vdouble2 d) { // natural logarithm of a vdouble2 argument, returned as a vdouble2 pair
+  vdouble2 x, x2, m;
+  vdouble t;
+  vint e;
+  // Split d into m * 2^e using the leading component d.x; m keeps both components.
+  d = normalize_d(d);
+  e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
+  m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
+  // x = (m - 1) / (m + 1) and x2 = x * x, both computed with the vdouble2 helpers.
+  x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
+  x2 = squ_d(x);
+  x2 = normalize_d(x2);
+  // Horner evaluation in x2.x (same coefficients as logk).
+  t = vcast_vd_d(0.134601987501262130076155);
+  t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+  t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+  t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+  t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+  t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+  t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+  t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+  // Result = e * ln2 (two-part constant) + 2 * x + x2 * x * t.
+  return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+		       vcast_vd_vi(e)),
+		add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble xasinh(vdouble x) {
+  // asinh(|x|) = log(sqrt(x^2 + 1) + |x|), evaluated with the vdouble2 helpers.
+  const vdouble ax = vabs(x);
+  const vdouble2 hyp = sqrt_d(add2_ds(mul_ss(ax, ax), vcast_vd_d(1)));
+  const vdouble2 lg = logk2(add2_ds(hyp, ax));
+  const vdouble mag = vadd(lg.x, lg.y);
+  // Infinite input or a NaN intermediate becomes +Inf before the sign of x is applied.
+  const vdouble capped = vsel(vorm(vmask_isinf(x), vmask_isnan(mag)), vcast_vd_d(INFINITY), mag);
+  const vdouble with_sign = vmulsign(capped, x);
+  return vsel(vmask_isnan(x), vcast_vd_d(NAN), with_sign);
+}
+
+static INLINE vdouble xacosh(vdouble x) {
+  // acosh(x) = log(sqrt(x^2 - 1) + x), evaluated with the vdouble2 helpers.
+  const vdouble2 root = sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1)));
+  const vdouble2 lg = logk2(add2_ds(root, x));
+  vdouble r = vadd(lg.x, lg.y);
+  // Later selects win: infinite x or NaN intermediate -> +Inf, x == 1 -> 0, x < 1 -> NaN, NaN input -> NaN.
+  r = vsel(vorm(vmask_isinf(x), vmask_isnan(r)), vcast_vd_d(INFINITY), r);
+  r = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), r);
+  r = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(NAN), r);
+  return vsel(vmask_isnan(x), vcast_vd_d(NAN), r);
+}
+
+static INLINE vdouble xatanh(vdouble x) { // per-lane inverse hyperbolic tangent of x
+  vdouble y = vabs(x);
+  vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), vneg(y)))); // log((1 + |x|) / (1 - |x|)); vneg replaces the compiler-specific unary minus on a vector type
+  y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5)))); // |x| > 1 -> NaN, |x| == 1 -> +Inf, else half the logarithm
+  // Infinite input or a NaN intermediate gives NaN; the sign of x is applied afterwards.
+  y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(NAN), y);
+  y = vmulsign(y, x);
+  y = vsel(vmask_isnan(x), vcast_vd_d(NAN), y);
+
+  return y;
+}
+
+static INLINE vdouble xcbrt(vdouble d) { // per-lane cube root of d
+  vdouble x, y, q = vcast_vd_d(1.0);
+  vint e, qu, re;
+  vdouble t;
+  // e = vilogbp1(|d|); d is then rescaled by 2^-e.
+  e = vilogbp1(vabs(d));
+  d = vldexp(d, vsubi(vcast_vi_i(0), e));
+  // t = e + 6144; qu = t / 3 truncated and re = t - 3 * qu (6144 = 3 * 2048, undone by the -2048 below).
+  t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
+  qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
+  re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
+  // q = 1, 1.2599... or 1.5874... for re = 0, 1, 2 (numerically 2^(1/3) and 2^(2/3)), scaled by 2^(qu - 2048).
+  q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
+  q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
+  q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
+  // The sign of d moves onto q; the polynomial works on |d|.
+  q = vmulsign(q, d);
+
+  d = vabs(d);
+  // Polynomial in d giving the starting value x.
+  x = vcast_vd_d(-0.640245898480692909870982);
+  x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
+  x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
+  x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
+  x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
+  x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
+  // Refinement: x -= (d * x^4 - x) / 3, then y = d * x * x and one correction of y, finally scaled by q.
+  y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
+  y = vmul(vmul(d, x), x);
+  y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
+
+  return y;
+}
+
+static INLINE vdouble xexp2(vdouble a) {
+  // 2^a = exp(a * ln 2); ln 2 is passed to mul_ds as a two-part vdouble2 constant.
+  const vdouble2 ln2 = dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17));
+  const vdouble r = vsel(vmask_ispinf(a), vcast_vd_d(INFINITY), expk(mul_ds(ln2, a)));
+  return vsel(vmask_isminf(a), vcast_vd_d(0), r); // -Inf -> 0, applied after the +Inf -> +Inf select
+}
+
+static INLINE vdouble xexp10(vdouble a) {
+  // 10^a = exp(a * ln 10); ln 10 is passed to mul_ds as a two-part vdouble2 constant.
+  const vdouble2 ln10 = dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16));
+  const vdouble r = vsel(vmask_ispinf(a), vcast_vd_d(INFINITY), expk(mul_ds(ln10, a)));
+  return vsel(vmask_isminf(a), vcast_vd_d(0), r); // -Inf -> 0, applied after the +Inf -> +Inf select
+}
+
+static INLINE vdouble xexpm1(vdouble a) { // per-lane e^a - 1
+  vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); // subtract 1 while e^a is still a vdouble2 pair
+  vdouble x = vadd(d.x, d.y); // vadd, as in the sibling functions, instead of the compiler-specific '+' on vector types
+  x = vsel(vmask_ispinf(a), vcast_vd_d(INFINITY), x); // +Inf -> +Inf
+  x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);       // -Inf -> -1
+  return x;
+}
+
+static INLINE vdouble xlog10(vdouble a) { // per-lane base-10 logarithm of a
+  vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17))); // log(a) times a two-part constant (numerically 1 / ln 10)
+  vdouble x = vadd(d.x, d.y); // vadd, as in the sibling functions, instead of the compiler-specific '+' on vector types
+  // Special cases, later selects winning: +Inf -> +Inf, a < 0 -> NaN, a == 0 -> -Inf.
+  x = vsel(vmask_ispinf(a), vcast_vd_d(INFINITY), x);
+  x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(NAN), x);
+  x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x);
+
+  return x;
+}
+
+static INLINE vdouble xlog1p(vdouble a) { // per-lane log(1 + a)
+  vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1))); // a + 1 is formed as a vdouble2 pair before the logarithm
+  vdouble x = vadd(d.x, d.y); // vadd, as in the sibling functions, instead of the compiler-specific '+' on vector types
+  // Special cases, later selects winning: +Inf -> +Inf, a < -1 -> NaN, a == -1 -> -Inf.
+  x = vsel(vmask_ispinf(a), vcast_vd_d(INFINITY), x);
+  x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(NAN), x);
+  x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), x);
+
+  return x;
+}
+
+//
+
+typedef struct { // pair of float vectors; return type of xsincosf
+  vfloat x, y;
+} vfloat2;
+
+static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); } // x * y + z as a separate multiply and add
+static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); } // and-not with the bit pattern of -0.0f, i.e. f without its sign bit
+static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); } // XOR with the bit pattern of -0.0f, i.e. f with its sign bit flipped
+
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { // bitwise select: bits of x where mask is set, bits of y elsewhere
+  return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+
+static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) { // integer select driven by the float comparison f0 < f1
+  vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1)); // comparison mask converted to the integer vector type
+  return vori2(vandi2(m2, x), vandnoti2(m2, y)); // bits of x where the mask is set, bits of y elsewhere
+}
+
+static INLINE vmask vsignbitf(vfloat f) { // mask holding only the bits f shares with -0.0f (its sign bit)
+  return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
+}
+
+static INLINE vfloat vmulsignf(vfloat x, vfloat y) { // flips the sign bit of x in lanes where the sign bit of y is set
+  return (vfloat)vxorm((vmask)x, vsignbitf(y));
+}
+
+static INLINE vfloat vsignf(vfloat f) { // bit pattern of 1.0f OR-ed with the sign bit of f: +1.0f or -1.0f per lane
+  return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
+}
+
+static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); } // mask of lanes where |d| equals INFINITYf
+static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); } // mask of lanes where d equals +INFINITYf
+static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); } // mask of lanes where d equals -INFINITYf
+static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); } // a lane that compares unequal to itself is NaN
+static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); } // where d is infinite: bits of m OR-ed with the sign bit of d; all-zero bits elsewhere
+static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); } // visinf2f with m = 1.0f: 1.0f carrying the sign of d where d is infinite, zero bits elsewhere
+
+static INLINE vint2 vilogbp1f(vfloat d) { // exponent of d plus one, read from the float bit pattern; inputs below 2^-64 are rescaled first
+  vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f)); // threshold is 2^-64
+  d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d); // multiply those lanes by 2^64
+  vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff)); // 8-bit biased exponent field
+  q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e))); // subtract 0x7e (bias 0x7f minus one), plus 64 for rescaled lanes
+  return q;
+}
+
+static INLINE vfloat vldexpf(vfloat x, vint2 q) { // x * 2^q, applied as four multiplications by 2^m followed by one by 2^(q - 4m)
+  vfloat u;
+  vint2 m = vsrai2(q, 31); // arithmetic shift by 31: -1 for negative q, 0 otherwise
+  m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4); // m = (((m + q) >> 6) - m) << 4
+  q = vsubi2(q, vslli2(m, 2)); // remaining exponent q - 4 * m
+  u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23))); // 2^m built by writing m + 0x7f into the exponent field
+  x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
+  u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23))); // 2^q for the remaining exponent, built the same way
+  return vmulf(x, u);
+}
+
+static INLINE vfloat xsinf(vfloat d) { // per-lane single-precision sine of d
+  vint2 q;
+  vfloat u, s;
+  // q = d * (1/pi) converted to an integer by vrint_vi2_vf.
+  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)M_1_PI)));
+  // d -= q * 4*(PI4_Af + PI4_Bf + PI4_Cf + PI4_Df), part by part (constants defined outside this chunk; presumably a split of pi/4 -- confirm).
+  u = vcast_vf_vi2(q);
+  d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
+
+  s = vmulf(d, d);
+  // Odd q: negate the reduced argument.
+  d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
+  // Horner evaluation of the polynomial in s = d * d.
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+  u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+  // u = d + d * s * poly
+  u = vmlaf(s, vmulf(u, d), d);
+
+  return u;
+}
+
+static INLINE vfloat xcosf(vfloat d) { // per-lane single-precision cosine of d
+  vint2 q;
+  vfloat u, s;
+  // q = 2 * rint(d * (1/pi) - 0.5) + 1, which is always odd.
+  q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
+  q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
+  // d -= q * 2*(PI4_Af + PI4_Bf + PI4_Cf + PI4_Df), part by part (constants defined outside this chunk).
+  u = vcast_vf_vi2(q);
+  d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
+  d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
+
+  s = vmulf(d, d);
+  // Keep d where bit 1 of q is set; negate it elsewhere.
+  d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
+  // Horner evaluation of the polynomial in s (same coefficients as xsinf).
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+  u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+  // u = d + d * s * poly
+  u = vmlaf(s, vmulf(u, d), d);
+
+  return u;
+}
+
+static INLINE vfloat2 xsincosf(vfloat d) { // per-lane single-precision sine (result .x) and cosine (result .y) of d
+  vint2 q;
+  vmask m;
+  vfloat u, s, t, rx, ry;
+  vfloat2 r;
+  // q = d * (2/pi) converted to an integer by vrint_vi2_vf; its low bits select the quadrant handling below.
+  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)M_2_PI)));
+
+  s = d;
+  // Reduce by q * 2*(PI4_Af + PI4_Bf + PI4_Cf + PI4_Df), part by part (constants defined outside this chunk).
+  u = vcast_vf_vi2(q);
+  s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
+  s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
+  s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
+  s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
+
+  t = s; // keep the reduced argument; s is replaced by its square next
+
+  s = vmulf(s, s);
+  // Odd polynomial: rx = t + t * s * P(s).
+  u = vcast_vf_f(-0.000195169282960705459117889f);
+  u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
+  u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
+  u = vmulf(vmulf(u, s), t);
+
+  rx = vaddf(t, u);
+  // Even polynomial: ry = 1 + s * Q(s).
+  u = vcast_vf_f(-2.71811842367242206819355e-07f);
+  u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
+  u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
+  u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
+  u = vmlaf(u, s, vcast_vf_f(-0.5f)); // float literal, matching the other coefficients
+
+  ry = vaddf(vcast_vf_f(1), vmulf(s, u));
+  // Even q: r.x = rx and r.y = ry; odd q: the two are swapped.
+  m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
+  r.x = vself(m, rx, ry);
+  r.y = vself(m, ry, rx);
+  // Flip the sign bit of r.x in lanes where bit 1 of q is set.
+  m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(r.x)));
+  // Flip the sign bit of r.y in lanes where bit 1 of (q + 1) is set.
+  m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(r.y)));
+  // Infinite inputs give NaN in both outputs (NANf, as in the other single-precision functions).
+  m = vmaskf_isinf(d);
+  r.x = vself(m, vcast_vf_f(NANf), r.x);
+  r.y = vself(m, vcast_vf_f(NANf), r.y);
+
+  return r;
+}
+
+static INLINE vfloat xtanf(vfloat d) { // per-lane single-precision tangent of d; infinite inputs give NaN
+  vint2 q;
+  vmask m;
+  vfloat u, s, x;
+  // q = d * (2/pi) converted to an integer by vrint_vi2_vf.
+  q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * M_1_PI))));
+
+  x = d;
+  // x -= q * 2*(PI4_Af + PI4_Bf + PI4_Cf + PI4_Df), part by part (constants defined outside this chunk).
+  u = vcast_vf_vi2(q);
+  x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
+  x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
+  x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
+  x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
+
+  s = vmulf(x, x);
+  // For odd q the reduced argument is negated here and the result is inverted after the polynomial.
+  m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
+  x = vself(m, vnegf(x), x);
+  // Horner evaluation of the polynomial in s = x * x.
+  u = vcast_vf_f(0.00927245803177356719970703f);
+  u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
+  u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
+  u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
+  u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
+  u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
+  // u = x + x * s * poly
+  u = vmlaf(s, vmulf(u, x), x);
+  // Lanes with odd q take the reciprocal (vrecf).
+  u = vself(m, vrecf(u), u);
+
+  u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
+
+  return u;
+}
+
+static INLINE vfloat xatanf(vfloat s) { // per-lane single-precision arctangent of s
+  vfloat t, u;
+  vint2 q;
+  // Bit 1 of q records a negative input; the rest of the computation works on |s|.
+  q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
+  s = vabsf(s);
+  // Bit 0 of q records |s| > 1, in which case s is replaced by 1 / s.
+  q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
+  s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
+
+  t = vmulf(s, s);
+  // Horner evaluation of the polynomial in t = s * s.
+  u = vcast_vf_f(0.00282363896258175373077393f);
+  u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+  u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+  u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+  u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+  u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+  u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+  u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+  // t = s + s * t * poly
+  t = vaddf(s, vmulf(s, vmulf(t, u)));
+  // Undo the reductions: pi/2 - t where the reciprocal was taken, then negate where the input was negative.
+  t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(M_PI/2)), t), t);
+  t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
+
+  return t;
+}
+
+static INLINE vfloat atan2kf(vfloat y, vfloat x) { // single-precision arctangent kernel shared by xatan2f, xasinf and xacosf
+  vfloat s, t, u;
+  vint2 q;
+  vmask p;
+  // q counts multiples of pi/2 to add at the end: -2 where x < 0, otherwise 0.
+  q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
+  x = vabsf(x);
+  // Where |x| < y: q is incremented and the ratio becomes -|x| / y; otherwise it is y / |x|.
+  q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
+  p = vmaskf_lt(x, y);
+  s = vself(p, vnegf(x), y);
+  t = vmaxf(x, y);
+
+  s = vdivf(s, t);
+  t = vmulf(s, s);
+  // Horner evaluation of the polynomial in t = s * s (same coefficients as xatanf).
+  u = vcast_vf_f(0.00282363896258175373077393f);
+  u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+  u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+  u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+  u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+  u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+  u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+  u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+  // Result = s + s * t * poly, plus q * (pi/2).
+  t = vaddf(s, vmulf(s, vmulf(t, u)));
+  t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2))));
+
+  return t;
+}
+
+static INLINE vfloat xatan2f(vfloat y, vfloat x) { // per-lane single-precision atan2(y, x): atan2kf on |y| plus explicit special cases
+  vfloat r = atan2kf(vabsf(y), x);
+  // Later selects override earlier ones. visinf2f(x, m) is m with the sign of x where x is infinite, zero bits elsewhere.
+  r = vmulsignf(r, x);
+  r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/2)), x))), r); // x infinite or zero
+  r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/4)), x))), r); // y infinite
+  r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vself(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)M_PI), vcast_vf_f(0.0f)), r); // y == 0: pi where vsignf(x) is -1, else 0
+  // NaN in either input gives NaN; otherwise the sign of y is applied to r.
+  return vself(vorm(vmaskf_isnan(x), vmaskf_isnan(y)), vcast_vf_f(NANf), vmulsignf(r, y));
+}
+
+static INLINE vfloat xasinf(vfloat d) {
+  // asin(d) = atan2(|d|, sqrt((1 + d) * (1 - d))); the sign of d is restored at the end.
+  const vfloat one = vcast_vf_f(1.0f);
+  const vfloat root = vsqrtf(vmulf(vaddf(one, d), vsubf(one, d)));
+  const vfloat angle = atan2kf(vabsf(d), root);
+  // A NaN square root (for example when the product is negative) forces a NaN result.
+  const vfloat guarded = vself(vmaskf_isnan(root), vcast_vf_f(NANf), angle);
+  return vmulsignf(guarded, d);
+}
+
+static INLINE vfloat xacosf(vfloat d) {
+  // acos(d) = sign(d) * atan2(sqrt((1 + d) * (1 - d)), |d|), plus pi in lanes where d < 0.
+  const vfloat one = vcast_vf_f(1.0f);
+  const vfloat root = vsqrtf(vmulf(vaddf(one, d), vsubf(one, d)));
+  const vfloat angle = vmulsignf(atan2kf(root, vabsf(d)), d);
+  // AND-ing the bit pattern of pi with the (d < 0) mask keeps pi only in the selected lanes.
+  const vmask negative = vmaskf_lt(d, vcast_vf_f(0.0f));
+  const vfloat shift = (vfloat)vandm(negative, (vmask)vcast_vf_f((float)M_PI));
+
+  return vaddf(angle, shift);
+}
+
+static INLINE vfloat xlogf(vfloat d) { // per-lane single-precision natural logarithm of d
+  vfloat x, x2, t, m;
+  vint2 e;
+  // Split d into m * 2^e: e = vilogbp1f(d * 0.7071f), m = vldexpf(d, -e).
+  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+  // x = (m - 1) / (m + 1); the polynomial below is in x2 = x * x.
+  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+  x2 = vmulf(x, x);
+
+  t = vcast_vf_f(0.2371599674224853515625f);
+  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+  t = vmlaf(t, x2, vcast_vf_f(2.0f));
+  // Result = x * t + e * 0.693147... (the constant is ln 2).
+  x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+  // Special cases, later selects winning: +Inf -> +Inf, d < 0 -> NaN, d == 0 -> -Inf.
+  x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
+  x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
+  x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
+
+  return x;
+}
+
+static INLINE vfloat xlogf0(vfloat d) { // same computation as xlogf, but +Inf, negative and zero inputs all return 0
+  vfloat x, x2, t, m;
+  vint2 e;
+  // Split d into m * 2^e: e = vilogbp1f(d * 0.7071f), m = vldexpf(d, -e).
+  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+  // x = (m - 1) / (m + 1); the polynomial below is in x2 = x * x.
+  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+  x2 = vmulf(x, x);
+
+  t = vcast_vf_f(0.2371599674224853515625f);
+  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+  t = vmlaf(t, x2, vcast_vf_f(2.0f));
+  // Result = x * t + e * 0.693147... (the constant is ln 2).
+  x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+  // Unlike xlogf, every special case is replaced by 0 rather than Inf or NaN.
+  x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
+  x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
+  x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
+
+  return x;
+}
+
+
+static INLINE vfloat xexpf(vfloat d) { // per-lane single-precision e^d; inputs below -104 (including -Inf) return 0
+  vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); // q = d * R_LN2f converted to an integer (R_LN2f defined outside this chunk; presumably 1 / ln 2 -- confirm)
+  vfloat s, u;
+  // s = d - q * L2Uf - q * L2Lf (constants defined outside this chunk; presumably the upper and lower parts of ln 2 -- confirm).
+  s = vaddf(d, vmulf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
+  s = vaddf(s, vmulf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
+  // Horner evaluation of the polynomial in s.
+  u = vcast_vf_f(0.00136324646882712841033936f);
+  u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+  u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+  u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+  u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+  // u = 1 + s + s * s * poly
+  u = vaddf(vcast_vf_f(1.0f), vaddf(s, vmulf(vmulf(s, s), u)));
+  // Scale by 2^q.
+  u = vldexpf(u, q);
+  // -Inf input gives 0.
+  u = vself(vmaskf_isminf(d), vcast_vf_f(0.0f), u);
+  // Any input below -104 is also forced to 0.
+  u = vself(vmaskf_gt(vcast_vf_f(-104), d), vcast_vf_f(0), u);
+  return u;
+}
+
+static INLINE vfloat xcbrtf(vfloat d) { // per-lane single-precision cube root of d
+  vfloat x, y, q = vcast_vf_f(1.0), t;
+  vint2 e, qu, re;
+  // e = vilogbp1f(|d|); d is then rescaled by 2^-e.
+  e = vilogbp1f(vabsf(d));
+  d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+  // t = e + 6144; qu = t / 3 truncated and re = t - 3 * qu (6144 = 3 * 2048, undone by the -2048 below).
+  t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
+  qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
+  re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
+  // q = 1, 1.2599... or 1.5874... for re = 0, 1, 2 (numerically 2^(1/3) and 2^(2/3)), scaled by 2^(qu - 2048).
+  q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
+  q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
+  q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
+  // The sign of d moves onto q; the polynomial works on |d|.
+  q = vmulsignf(q, d);
+  d = vabsf(d);
+  // Polynomial in d giving the starting value x.
+  x = vcast_vf_f(-0.601564466953277587890625f);
+  x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
+  x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
+  x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
+  x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
+  x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
+  // y = d * x * x, followed by one correction of y, finally scaled by q.
+  y = vmulf(vmulf(d, x), x);
+  y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
+
+  return y;
+}
+#endif // SLEEFSSEAVX
diff --git a/rtengine/sleefsseavx.h b/rtengine/sleefsseavx.h
new file mode 100644
index 000000000..b2b179dd3
--- /dev/null
+++ b/rtengine/sleefsseavx.h
@@ -0,0 +1,108 @@
+#include <immintrin.h>
+#include <stdint.h>
+
+#if defined(__SSE2__) && !defined(ENABLE_AVX) // SSE2 variant; skipped under ENABLE_AVX, whose section reuses the same names
+#define VECTLENDP 2 // lanes in a vdouble (__m128d)
+#define VECTLENSP 4 // lanes in a vfloat (__m128)
+// Vector types of the 128-bit variant.
+typedef __m128d vdouble;
+typedef __m128i vint;
+
+typedef __m128 vfloat;
+typedef __m128i vint2;
+typedef __m128i vmask;
+// Unaligned load/store of one full vector at p; loads take const pointers since they never write through p.
+static inline vdouble vloadu(const double *p) { return _mm_loadu_pd(p); }
+static inline void vstoreu(double *p, vdouble v) { _mm_storeu_pd(p, v); }
+
+static inline vfloat vloaduf(const float *p) { return _mm_loadu_ps(p); }
+static inline void vstoreuf(float *p, vfloat v) { _mm_storeu_ps(p, v); }
+
+static inline vint2 vloadui2(const int32_t *p) { return (vint2)_mm_loadu_si128((const __m128i *)p); }
+static inline void vstoreui2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, (__m128i)v); }
+#endif
+
+#ifdef ENABLE_AVX
+#define VECTLENDP 4 // lanes in a vdouble (__m256d)
+#define VECTLENSP 8 // lanes in a vfloat (__m256)
+// Vector types of the 256-bit variant.
+typedef __m256d vdouble;
+typedef __m128i vint;
+
+
+typedef __m256 vfloat;
+typedef struct { // eight 32-bit ints kept as two 128-bit halves
+  vint x, y; // x: elements 0-3, y: elements 4-7 (layout used by vloadui2/vstoreui2)
+} vint2;
+// Unaligned load/store of one full vector at p; the stores are void, so they must not `return` a value.
+static inline vdouble vloadu(const double *p) { return _mm256_loadu_pd(p); }
+static inline void vstoreu(double *p, vdouble v) { _mm256_storeu_pd(p, v); }
+
+static inline vfloat vloaduf(const float *p) { return _mm256_loadu_ps(p); }
+static inline void vstoreuf(float *p, vfloat v) { _mm256_storeu_ps(p, v); }
+// Reads p[0..3] into r.x and p[4..7] into r.y with two unaligned 128-bit loads.
+static inline vint2 vloadui2(const int32_t *p) {
+  vint2 r;
+  r.x = _mm_loadu_si128((const __m128i *) p     );
+  r.y = _mm_loadu_si128((const __m128i *)(p + 4));
+  return r;
+}
+// Writes v.x to p[0..3] and v.y to p[4..7] with two unaligned 128-bit stores.
+static inline void vstoreui2(int32_t *p, vint2 v) {
+  _mm_storeu_si128((__m128i *) p     , v.x);
+  _mm_storeu_si128((__m128i *)(p + 4), v.y);
+}
+#endif
+
+typedef struct { // pair of double vectors; return type of xsincos()
+  vdouble x, y; // NOTE(review): presumably x = sine, y = cosine -- confirm in the xsincos implementation
+} vdouble2;
+
+vdouble xldexp(vdouble x, vint q); // NOTE(review): presumably x * 2^q per lane, like ldexp() -- confirm
+vint xilogb(vdouble d); // NOTE(review): presumably the per-lane exponent, like ilogb() -- confirm
+// Double-precision prototypes; bodies are not in this header. NOTE(review): names follow <math.h>, presumably applied per lane.
+vdouble xsin(vdouble d);
+vdouble xcos(vdouble d);
+vdouble2 xsincos(vdouble d); // returns two vectors in one vdouble2
+vdouble xtan(vdouble d);
+vdouble xasin(vdouble s);
+vdouble xacos(vdouble s);
+vdouble xatan(vdouble s);
+vdouble xatan2(vdouble y, vdouble x);
+vdouble xlog(vdouble d);
+vdouble xexp(vdouble d);
+vdouble xpow(vdouble x, vdouble y);
+// NOTE(review): presumably the hyperbolic functions and their inverses.
+vdouble xsinh(vdouble d);
+vdouble xcosh(vdouble d);
+vdouble xtanh(vdouble d);
+vdouble xasinh(vdouble s);
+vdouble xacosh(vdouble s);
+vdouble xatanh(vdouble s);
+
+vdouble xcbrt(vdouble d);
+
+vdouble xexp2(vdouble a);
+vdouble xexp10(vdouble a);
+vdouble xexpm1(vdouble a);
+vdouble xlog10(vdouble a);
+vdouble xlog1p(vdouble a);
+
+//
+
+typedef struct { // pair of float vectors; return type of xsincosf()
+  vfloat x, y; // NOTE(review): presumably x = sine, y = cosine -- confirm in the xsincosf implementation
+} vfloat2;
+
+vfloat xsinf(vfloat d); // single-precision (vfloat) prototypes from here on; bodies are not in this header
+vfloat xcosf(vfloat d);
+vfloat2 xsincosf(vfloat d); // returns two vectors in one vfloat2
+vfloat xtanf(vfloat d);
+vfloat xasinf(vfloat s);
+vfloat xacosf(vfloat s);
+vfloat xatanf(vfloat s);
+vfloat xatan2f(vfloat y, vfloat x);
+vfloat xlogf(vfloat d);
+vfloat xlogf0(vfloat d); // NOTE(review): how this differs from xlogf is not visible here -- check the implementation
+vfloat xexpf(vfloat d);
+vfloat xcbrtf(vfloat s); // NOTE(review): sleefsseavx.c defines this as `static INLINE`; linkage conflicts if this header precedes it in one TU -- confirm