Added modified Sleef-Library to repository

2013-03-05 23:16:41 +01:00
parent 5cab1357f4
commit 2e43b2c213
6 changed files with 3175 additions and 0 deletions
--- a/rtengine/helperavx.h
+++ b/rtengine/helperavx.h
@@ -0,0 +1,271 @@
+#ifndef __AVX__
+#error Please specify -mavx.
+#endif
+
+#ifdef __GNUC__
+#define INLINE __attribute__((always_inline))
+#else
+#define INLINE inline
+#endif
+
+#include <immintrin.h>
+#include <stdint.h>
+
+typedef __m256d vdouble;
+typedef __m128i vint;
+typedef __m256i vmask;
+
+typedef __m256 vfloat;
+typedef struct {
+  vint x, y;
+} vint2;
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
+static INLINE vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); }
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm;  }
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; }
+
+//
+
+static INLINE vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); }
+
+static INLINE vfloat vaddf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
+static INLINE vfloat vsubf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
+static INLINE vfloat vmulf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
+static INLINE vfloat vdivf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
+static INLINE vfloat vrecf(vfloat x) { return vdivf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrtf(vfloat x) { return _mm256_sqrt_ps(x); }
+static INLINE vfloat vmaxf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
+static INLINE vfloat vminf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
+
+//
+
+static INLINE vdouble vadd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
+static INLINE vdouble vsub(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
+static INLINE vdouble vmul(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
+static INLINE vdouble vdiv(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
+static INLINE vdouble vrec(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); }
+static INLINE vdouble vsqrt(vdouble x) { return _mm256_sqrt_pd(x); }
+static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z) { return vadd(vmul(x, y), z); }
+
+static INLINE vdouble vmax(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
+static INLINE vdouble vmin(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
+
+static INLINE vdouble vabs(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); }
+static INLINE vdouble vneg(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); }
+
+//
+
+static INLINE vint vaddi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsubi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+
+static INLINE vint vandi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnoti(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vori(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxori(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vslli(vint x, int c) { return _mm_slli_epi32 (x, c); }
+static INLINE vint vsrli(vint x, int c) { return _mm_srli_epi32 (x, c); }
+static INLINE vint vsrai(vint x, int c) { return _mm_srai_epi32 (x, c); }
+
+//
+
+static INLINE vmask vandm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vandnotm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vorm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); }
+static INLINE vmask vxorm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); }
+
+static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); }
+static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_OQ); }
+static INLINE vmask vmask_lt(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); }
+static INLINE vmask vmask_le(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); }
+static INLINE vmask vmask_gt(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); }
+static INLINE vmask vmask_ge(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); }
+
+static INLINE vmask vmaskf_eq(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); }
+static INLINE vmask vmaskf_neq(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_OQ); }
+static INLINE vmask vmaskf_lt(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); }
+static INLINE vmask vmaskf_le(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); }
+static INLINE vmask vmaskf_gt(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); }
+static INLINE vmask vmaskf_ge(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); }
+
+static INLINE vmask vmaski_eq(vint x, vint y) {
+  __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1)));
+  return vmask_eq(r, _mm256_set_pd(1, 1, 1, 1));
+}
+
+static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y) {
+  return (__m256d)vorm(vandm(mask, (__m256i)x), vandnotm(mask, (__m256i)y));
+}
+
+static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y) {
+  __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)));
+  mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1));
+  return vori(vandi(mask, x), vandnoti(mask, y));
+}
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) {
+  vint2 r;
+  r.x = _mm256_castsi256_si128(vm);
+  r.y = _mm256_extractf128_si256(vm, 1);
+  return r;
+}
+
+static INLINE vmask vcast_vm_vi2(vint2 vi) {
+  vmask m = _mm256_castsi128_si256(vi.x);
+  m = _mm256_insertf128_si256(m, vi.y, 1);
+  return m;
+}
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); }
+static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; }
+
+static INLINE vint2 vaddi2(vint2 x, vint2 y) { vint2 r; r.x = vaddi(x.x, y.x); r.y = vaddi(x.y, y.y); return r; }
+static INLINE vint2 vsubi2(vint2 x, vint2 y) { vint2 r; r.x = vsubi(x.x, y.x); r.y = vsubi(x.y, y.y); return r; }
+
+static INLINE vint2 vandi2(vint2 x, vint2 y) { vint2 r; r.x = vandi(x.x, y.x); r.y = vandi(x.y, y.y); return r; }
+static INLINE vint2 vandnoti2(vint2 x, vint2 y) { vint2 r; r.x = vandnoti(x.x, y.x); r.y = vandnoti(x.y, y.y); return r; }
+static INLINE vint2 vori2(vint2 x, vint2 y) { vint2 r; r.x = vori(x.x, y.x); r.y = vori(x.y, y.y); return r; }
+static INLINE vint2 vxori2(vint2 x, vint2 y) { vint2 r; r.x = vxori(x.x, y.x); r.y = vxori(x.y, y.y); return r; }
+
+static INLINE vint2 vslli2(vint2 x, int c) { vint2 r; r.x = vslli(x.x, c); r.y = vslli(x.y, c); return r; }
+static INLINE vint2 vsrli2(vint2 x, int c) { vint2 r; r.x = vsrli(x.x, c); r.y = vsrli(x.y, c); return r; }
+static INLINE vint2 vsrai2(vint2 x, int c) { vint2 r; r.x = vsrai(x.x, c); r.y = vsrai(x.y, c); return r; }
+
+static INLINE vmask vmaski2_eq(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpeq_epi32(x.x, y.x);
+  r.y = _mm_cmpeq_epi32(x.y, y.y);
+  return vcast_vm_vi2(r);
+}
+
+static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y) {
+  vint2 r, m2 = vcast_vi2_vm(m);
+  r.x = vori(vandi(m2.x, x.x), vandnoti(m2.x, y.x));
+  r.y = vori(vandi(m2.y, x.y), vandnoti(m2.y, y.y));
+  return r;
+}
+
+//
+
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[4];
+  _mm256_storeu_pd(s, v);
+  return s[0];
+}
+
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[8];
+  _mm256_storeu_ps(s, v);
+  return s[0];
+}
+
+static INLINE vmask vsignbit(vdouble d) {
+  return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0));
+}
+
+static INLINE vdouble vsign(vdouble d) {
+  return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit(d));
+}
+
+static INLINE vdouble vmulsign(vdouble x, vdouble y) {
+  return (__m256d)vxorm((__m256i)x, vsignbit(y));
+}
+
+static INLINE vmask vmask_isinf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(vabs(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_ispinf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_isminf(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vmask vmask_isnan(vdouble d) {
+  return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ);
+}
+
+static INLINE vdouble visinf(vdouble d) {
+  return _mm256_and_pd((vdouble)vmask_isinf(d), vsign(d));
+}
+
+static INLINE vdouble visinf2(vdouble d, vdouble m) {
+  return _mm256_and_pd((vdouble)vmask_isinf(d), _mm256_or_pd((vdouble)vsignbit(d), m));
+}
+
+static INLINE vdouble vpow2i(vint q) {
+  vint r;
+  vdouble y;
+  q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q);
+  q = _mm_slli_epi32(q, 20);
+  r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0));
+  y = _mm256_castpd128_pd256((__m128d)r);
+  r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2));
+  y = _mm256_insertf128_pd(y, (__m128d)r, 1);
+  y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0));
+  return y;
+}
+
+static INLINE vdouble vldexp(vdouble x, vint q) {
+  vint m = _mm_srai_epi32(q, 31);
+  m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
+  q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
+  vdouble y = vpow2i(m);
+  return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
+}
+
+static INLINE vint vilogbp1(vdouble d) {
+  vint q, r, c;
+  vmask m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
+  d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
+  c = _mm256_cvtpd_epi32(vsel(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe)));
+  q = (__m128i)_mm256_castpd256_pd128(d);
+  q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1));
+  r = (__m128i)_mm256_extractf128_pd(d, 1);
+  r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0));
+  q = _mm_or_si128(q, r);
+  q = _mm_srli_epi32(q, 20);
+  q = _mm_sub_epi32(q, c);
+  return q;
+}
+
+static INLINE vdouble vupper(vdouble d) {
+  return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
+}
+
+//
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+static INLINE vdouble2 dd(vdouble h, vdouble l) {
+  vdouble2 ret = {h, l};
+  return ret;
+}
+
+static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y) {
+  return dd((__m256d)vorm(vandm(mask, (__m256i)x.x), vandnotm(mask, (__m256i)y.x)),
+	    (__m256d)vorm(vandm(mask, (__m256i)x.y), vandnotm(mask, (__m256i)y.y)));
+}
+
+static INLINE vdouble2 abs_d(vdouble2 x) {
+  return dd((__m256d)_mm256_xor_pd(_mm256_and_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), x.x), x.x),
+	    (__m256d)_mm256_xor_pd(_mm256_and_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), x.x), x.y));
+}
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -0,0 +1,239 @@
+#ifndef __SSE2__
+#error Please specify -msse2.
+#endif
+
+#ifdef __GNUC__
+#define INLINE __inline
+//#define INLINE __attribute__((always_inline))
+#else
+#define INLINE inline
+#endif
+
+#include <intrin.h>
+#include <stdint.h>
+
+typedef __m128d vdouble;
+typedef __m128i vint;
+typedef __m128i vmask;
+
+typedef __m128 vfloat;
+typedef __m128i vint2;
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
+static INLINE vdouble vcast_vd_d(double d) { return _mm_set_pd(d, d); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m128i)vd; }
+static INLINE vdouble vreinterpret_vd_vm(vint vm) { return (__m128d)vm; }
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m128i)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; }
+
+//
+
+static INLINE vfloat vcast_vf_f(float f) { return _mm_set_ps(f, f, f, f); }
+
+static INLINE vfloat vaddf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
+static INLINE vfloat vsubf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
+static INLINE vfloat vmulf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
+static INLINE vfloat vdivf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
+static INLINE vfloat vrecf(vfloat x) { return vdivf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrtf(vfloat x) { return _mm_sqrt_ps(x); }
+static INLINE vfloat vmaxf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
+static INLINE vfloat vminf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
+
+//
+
+static INLINE vdouble vadd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
+static INLINE vdouble vsub(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
+static INLINE vdouble vmul(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
+static INLINE vdouble vdiv(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
+static INLINE vdouble vrec(vdouble x) { return _mm_div_pd(_mm_set_pd(1, 1), x); }
+static INLINE vdouble vsqrt(vdouble x) { return _mm_sqrt_pd(x); }
+static INLINE vdouble vmla(vdouble x, vdouble y, vdouble z) { return vadd(vmul(x, y), z); }
+
+static INLINE vdouble vmax(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
+static INLINE vdouble vmin(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
+
+static INLINE vdouble vabs(vdouble d) { return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0,-0.0), d); }
+static INLINE vdouble vneg(vdouble d) { return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0,-0.0), d); }
+
+//
+
+static INLINE vint vaddi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsubi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+
+static INLINE vint vandi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnoti(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vori(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxori(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vslli(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrli(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsrai(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+//
+
+static INLINE vmask vandm(vmask x, vmask y) { return _mm_and_si128(x, y); }
+static INLINE vmask vandnotm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vmask vorm(vmask x, vmask y) { return _mm_or_si128(x, y); }
+static INLINE vmask vxorm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); }
+static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); }
+static INLINE vmask vmask_lt(vdouble x, vdouble y) { return (__m128i)_mm_cmplt_pd(x, y); }
+static INLINE vmask vmask_le(vdouble x, vdouble y) { return (__m128i)_mm_cmple_pd(x, y); }
+static INLINE vmask vmask_gt(vdouble x, vdouble y) { return (__m128i)_mm_cmpgt_pd(x, y); }
+static INLINE vmask vmask_ge(vdouble x, vdouble y) { return (__m128i)_mm_cmpge_pd(x, y); }
+
+static INLINE vmask vmaskf_eq(vfloat x, vfloat y) { return (__m128i)_mm_cmpeq_ps(x, y); }
+static INLINE vmask vmaskf_neq(vfloat x, vfloat y) { return (__m128i)_mm_cmpneq_ps(x, y); }
+static INLINE vmask vmaskf_lt(vfloat x, vfloat y) { return (__m128i)_mm_cmplt_ps(x, y); }
+static INLINE vmask vmaskf_le(vfloat x, vfloat y) { return (__m128i)_mm_cmple_ps(x, y); }
+static INLINE vmask vmaskf_gt(vfloat x, vfloat y) { return (__m128i)_mm_cmpgt_ps(x, y); }
+static INLINE vmask vmaskf_ge(vfloat x, vfloat y) { return (__m128i)_mm_cmpge_ps(x, y); }
+
+static INLINE vmask vmaski_eq(vint x, vint y) {
+  __m128 s = (__m128)_mm_cmpeq_epi32(x, y);
+  return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));
+}
+
+static INLINE vdouble vsel(vmask mask, vdouble x, vdouble y) {
+  return (__m128d)vorm(vandm(mask, (__m128i)x), vandnotm(mask, (__m128i)y));
+}
+
+static INLINE vint vseli_lt(vdouble d0, vdouble d1, vint x, vint y) {
+  vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vmask_lt(d0, d1)), _mm_set_ps(0, 0, 0, 0));
+  return vori(vandnoti(mask, x), vandi(mask, y));
+}
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm_set_epi32(i, i, i, i); }
+
+static INLINE vint2 vaddi2(vint2 x, vint2 y) { return vaddi(x, y); }
+static INLINE vint2 vsubi2(vint2 x, vint2 y) { return vsubi(x, y); }
+
+static INLINE vint2 vandi2(vint2 x, vint2 y) { return vandi(x, y); }
+static INLINE vint2 vandnoti2(vint2 x, vint2 y) { return vandnoti(x, y); }
+static INLINE vint2 vori2(vint2 x, vint2 y) { return vori(x, y); }
+static INLINE vint2 vxori2(vint2 x, vint2 y) { return vxori(x, y); }
+
+static INLINE vint2 vslli2(vint2 x, int c) { return vslli(x, c); }
+static INLINE vint2 vsrli2(vint2 x, int c) { return vsrli(x, c); }
+static INLINE vint2 vsrai2(vint2 x, int c) { return vsrai(x, c); }
+
+static INLINE vmask vmaski2_eq(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint2 vseli2(vmask m, vint2 x, vint2 y) { return vorm(vandm(m, x), vandnotm(m, y)); }
+
+//
+
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[2];
+  _mm_storeu_pd(s, v);
+  return s[0];
+}
+
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[4];
+  _mm_storeu_ps(s, v);
+  return s[0];
+}
+
+static INLINE vmask vsignbit(vdouble d) {
+  return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0));
+}
+
+static INLINE vdouble vsign(vdouble d) {
+  return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)));
+}
+
+static INLINE vdouble vmulsign(vdouble x, vdouble y) {
+  return (__m128d)vxori((__m128i)x, vsignbit(y));
+}
+
+static INLINE vmask vmask_isinf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(vabs(d), _mm_set_pd(INFINITY, INFINITY));
+}
+
+static INLINE vmask vmask_ispinf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY));
+}
+
+static INLINE vmask vmask_isminf(vdouble d) {
+  return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY));
+}
+
+static INLINE vmask vmask_isnan(vdouble d) {
+  return (vmask)_mm_cmpneq_pd(d, d);
+}
+
+static INLINE vdouble visinf(vdouble d) {
+  return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)_mm_set_pd(1, 1)));
+}
+
+static INLINE vdouble visinf2(vdouble d, vdouble m) {
+  return (__m128d)_mm_and_si128(vmask_isinf(d), _mm_or_si128(vsignbit(d), (__m128i)m));
+}
+
+//
+
+static INLINE vdouble vpow2i(vint q) {
+  q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q);
+  q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,3,0,3));
+  return (__m128d)_mm_slli_epi32(q, 20);
+}
+
+static INLINE vdouble vldexp(vdouble x, vint q) {
+  vint m = _mm_srai_epi32(q, 31);
+  m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7);
+  q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2));
+  vdouble y = vpow2i(m);
+  return vmul(vmul(vmul(vmul(vmul(x, y), y), y), y), vpow2i(q));
+}
+
+static INLINE vint vilogbp1(vdouble d) {
+  vint m = vmask_lt(d, vcast_vd_d(4.9090934652977266E-91));
+  d = vsel(m, vmul(vcast_vd_d(2.037035976334486E90), d), d);
+  __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12)-1) << 20, 0, ((1 << 12)-1) << 20, 0));
+  q = _mm_srli_epi32(q, 20);
+  q = vorm(vandm   (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))),
+	   vandnotm(m, _mm_sub_epi32(q, _mm_set_epi32(      0x3fe, 0,       0x3fe, 0))));
+  q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0,0,3,1));
+  return q;
+}
+
+static INLINE vdouble vupper(vdouble d) {
+  return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000));
+}
+
+//
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+static INLINE vdouble2 dd(vdouble h, vdouble l) {
+  vdouble2 ret = {h, l};
+  return ret;
+}
+
+static INLINE vdouble2 vsel2(vmask mask, vdouble2 x, vdouble2 y) {
+  return dd((__m128d)vorm(vandm(mask, (__m128i)x.x), vandnotm(mask, (__m128i)y.x)),
+	    (__m128d)vorm(vandm(mask, (__m128i)x.y), vandnotm(mask, (__m128i)y.y)));
+}
+
+static INLINE vdouble2 abs_d(vdouble2 x) {
+  return dd((__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0,-0.0), x.x), x.x),
+	    (__m128d)_mm_xor_pd(_mm_and_pd(_mm_set_pd(-0.0,-0.0), x.x), x.y));
+}
--- a/rtengine/sleef.c
+++ b/rtengine/sleef.c
--- a/rtengine/sleef.h
+++ b/rtengine/sleef.h
@@ -0,0 +1,51 @@
+typedef struct {
+  double x, y;
+} double2;
+
+typedef struct {
+  float x, y;
+} float2;
+
+double xsin(double d);
+double xcos(double d);
+double2 xsincos(double d);
+double xtan(double d);
+double xasin(double s);
+double xacos(double s);
+double xatan(double s);
+double xatan2(double y, double x);
+double xlog(double d);
+double xexp(double d);
+double xpow(double x, double y);
+
+double xsinh(double x);
+double xcosh(double x);
+double xtanh(double x);
+double xasinh(double x);
+double xacosh(double x);
+double xatanh(double x);
+double xldexp(double x, int q);
+int xilogb(double d);
+
+double xfma(double x, double y, double z);
+double xsqrt(double d);
+double xcbrt(double d);
+
+double xexp2(double a);
+double xexp10(double a);
+double xexpm1(double a);
+double xlog10(double a);
+double xlog1p(double a);
+
+float xsinf(float d);
+float xcosf(float d);
+float2 xsincosf(float d);
+float xtanf(float d);
+float xasinf(float s);
+float xacosf(float s);
+float xatanf(float s);
+float xatan2f(float y, float x);
+float xlogf(float d);
+float xexpf(float d);
+float xpowf(float x, float y);
+float xcbrtf(float d);
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
--- a/rtengine/sleefsseavx.h
+++ b/rtengine/sleefsseavx.h
@@ -0,0 +1,108 @@
+#include <immintrin.h>
+#include <stdint.h>
+
+#ifdef __SSE2__
+#define VECTLENDP 2
+#define VECTLENSP 4
+
+typedef __m128d vdouble;
+typedef __m128i vint;
+
+typedef __m128 vfloat;
+typedef __m128i vint2;
+typedef __m128i vmask;
+
+static vdouble vloadu(double *p) { return _mm_loadu_pd(p); }
+static void vstoreu(double *p, vdouble v) { _mm_storeu_pd(p, v); }
+
+static vfloat vloaduf(float *p) { return _mm_loadu_ps(p); }
+static void vstoreuf(float *p, vfloat v) { _mm_storeu_ps(p, v); }
+
+static vint2 vloadui2(int32_t *p) { return (vint2)_mm_loadu_si128((__m128i *)p); }
+static void vstoreui2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, (__m128i)v); }
+#endif
+
+#ifdef ENABLE_AVX
+#define VECTLENDP 4
+#define VECTLENSP 8
+
+typedef __m256d vdouble;
+typedef __m128i vint;
+
+
+typedef __m256 vfloat;
+typedef struct {
+  vint x, y;
+} vint2;
+
+static vdouble vloadu(double *p) { return _mm256_loadu_pd(p); }
+static void vstoreu(double *p, vdouble v) { return _mm256_storeu_pd(p, v); }
+
+static vfloat vloaduf(float *p) { return _mm256_loadu_ps(p); }
+static void vstoreuf(float *p, vfloat v) { return _mm256_storeu_ps(p, v); }
+
+static vint2 vloadui2(int32_t *p) {
+  vint2 r;
+  r.x = _mm_loadu_si128((__m128i *) p     );
+  r.y = _mm_loadu_si128((__m128i *)(p + 4));
+  return r;
+}
+
+static void vstoreui2(int32_t *p, vint2 v) {
+  _mm_storeu_si128((__m128i *) p     , v.x);
+  _mm_storeu_si128((__m128i *)(p + 4), v.y);
+}
+#endif
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+vdouble xldexp(vdouble x, vint q);
+vint xilogb(vdouble d);
+
+vdouble xsin(vdouble d);
+vdouble xcos(vdouble d);
+vdouble2 xsincos(vdouble d);
+vdouble xtan(vdouble d);
+vdouble xasin(vdouble s);
+vdouble xacos(vdouble s);
+vdouble xatan(vdouble s);
+vdouble xatan2(vdouble y, vdouble x);
+vdouble xlog(vdouble d);
+vdouble xexp(vdouble d);
+vdouble xpow(vdouble x, vdouble y);
+
+vdouble xsinh(vdouble d);
+vdouble xcosh(vdouble d);
+vdouble xtanh(vdouble d);
+vdouble xasinh(vdouble s);
+vdouble xacosh(vdouble s);
+vdouble xatanh(vdouble s);
+
+vdouble xcbrt(vdouble d);
+
+vdouble xexp2(vdouble a);
+vdouble xexp10(vdouble a);
+vdouble xexpm1(vdouble a);
+vdouble xlog10(vdouble a);
+vdouble xlog1p(vdouble a);
+
+//
+
+typedef struct {
+  vfloat x, y;
+} vfloat2;
+
+vfloat xsinf(vfloat d);
+vfloat xcosf(vfloat d);
+vfloat2 xsincosf(vfloat d);
+vfloat xtanf(vfloat d);
+vfloat xasinf(vfloat s);
+vfloat xacosf(vfloat s);
+vfloat xatanf(vfloat s);
+vfloat xatan2f(vfloat y, vfloat x);
+vfloat xlogf(vfloat d);
+vfloat xlogf0(vfloat d);
+vfloat xexpf(vfloat d);
+vfloat xcbrtf(vfloat s);