Cleaned gauss code and included some speedups

2016-01-18 23:56:02 +01:00
parent 33ea7156b8
commit a3c20daa46
12 changed files with 2117 additions and 2177 deletions
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -115,21 +115,30 @@ static INLINE vfloat vcast_vf_f(float f)
    return _mm_set_ps(f, f, f, f);
 }

+// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also before 4.9) generate better code when not using intrinsics
+// Example: when built for chips with FMA support, vaddf(vmulf(a,b),c) generates an FMA instruction only if vaddf and vmulf don't use intrinsics
 static INLINE vfloat vaddf(vfloat x, vfloat y)
 {
-    return _mm_add_ps(x, y);
+    return x + y;
 }
 static INLINE vfloat vsubf(vfloat x, vfloat y)
 {
-    return _mm_sub_ps(x, y);
+    return x - y;
 }
 static INLINE vfloat vmulf(vfloat x, vfloat y)
 {
-    return _mm_mul_ps(x, y);
+    return x * y;
 }
 static INLINE vfloat vdivf(vfloat x, vfloat y)
 {
-    return _mm_div_ps(x, y);
+    return x / y;
+}
+// Also don't use an intrinsic here: some chips support FMA instructions in 3-operand and 4-operand forms
+// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // destination has to be one of a,b,c
+// 4 operands: d = a*b+c // destination does not have to be one of a,b,c
+// gcc will pick whichever form fits best when intrinsics are not used; with intrinsics that is not possible
+static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) {
+    return x * y + z;
 }
 static INLINE vfloat vrecf(vfloat x)
 {