diff --git a/rtengine/sleef.c b/rtengine/sleef.c index f03c9f1b3..901d04f7d 100644 --- a/rtengine/sleef.c +++ b/rtengine/sleef.c @@ -12,10 +12,8 @@ #include #include -//#include #include "rt_math.h" -//#include -//#include +#include "opthelper.h" #define PI4_A .7853981554508209228515625 #define PI4_B .794662735614792836713604629039764404296875e-8 @@ -1005,6 +1003,10 @@ __inline float xsinf(float d) { } __inline float xcosf(float d) { +#ifdef __SSE2__ + // faster than scalar version + return xcosf(_mm_set_ss(d))[0]; +#else int q; float u, s; @@ -1027,9 +1029,15 @@ __inline float xcosf(float d) { u = mlaf(s, u * d, d); return u; +#endif } __inline float2 xsincosf(float d) { +#ifdef __SSE2__ + // faster than scalar version + vfloat2 res = xsincosf(_mm_set_ss(d)); + return {res.x[0], res.y[0]}; +#else int q; float u, s, t; float2 r; @@ -1069,6 +1077,7 @@ __inline float2 xsincosf(float d) { if (xisinff(d)) { r.x = r.y = rtengine::RT_NAN_F; } return r; +#endif } __inline float xtanf(float d) { diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index e4f587464..dcea09e2b 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -12,10 +12,6 @@ #define SLEEFSSEAVX #include -//#include -//#include -//#include -//#include "sleefsseavx.h" #include "rt_math.h" #ifdef __SSE2__ #include "helpersse2.h" @@ -30,8 +26,6 @@ #define INLINE inline #endif -// - #define PI4_A .7853981554508209228515625 #define PI4_B .794662735614792836713604629039764404296875e-8 #define PI4_C .306161699786838294306516483068750264552437361480769e-16 @@ -41,8 +35,6 @@ #define L2L .28235290563031577122588448175013436025525412068e-12 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 -// - #define PI4_Af 0.78515625f #define PI4_Bf 0.00024127960205078125f #define PI4_Cf 6.3329935073852539062e-07f @@ -55,8 +47,6 @@ #define INFINITYf ((float)rtengine::RT_INFINITY) #define NANf ((float)rtengine::RT_NAN) -// - static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) { return vadd(vadd(v0, v1), v2); } @@ -1323,10 +1313,8 @@ static INLINE vfloat xexpf(vfloat d) { u = vldexpf(u, q); - u = vself(vmaskf_isminf(d), vcast_vf_f(0.0f), u); -// -104.0 - u = vself(vmaskf_gt(vcast_vf_f(-104), d), vcast_vf_f(0), u); - return u; + // -104.0 + return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u); } static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table