Cleaned gauss code and included some speedups

This commit is contained in:
heckflosse
2016-01-18 23:56:02 +01:00
parent 33ea7156b8
commit a3c20daa46
12 changed files with 2117 additions and 2177 deletions

View File

@@ -115,21 +115,30 @@ static INLINE vfloat vcast_vf_f(float f)
return _mm_set_ps(f, f, f, f);
}
// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also earlier ones) generate better code when intrinsics are not used.
// Example: vaddf(vmulf(a,b),c) will generate an FMA instruction when built for chips with that feature, but only if vaddf and vmulf don't use intrinsics.
// Lane-wise addition of two float vectors: returns x + y.
// Written with the plain '+' operator instead of _mm_add_ps so the compiler
// is free to fuse it with a neighbouring multiply into a single FMA
// instruction on targets that provide one.
static INLINE vfloat vaddf(vfloat x, vfloat y)
{
    return x + y;
}
// Lane-wise subtraction of two float vectors: returns x - y.
// Written with the plain '-' operator instead of _mm_sub_ps so the compiler
// can combine it with neighbouring arithmetic (e.g. into a fused
// multiply-subtract) on targets that provide such instructions.
static INLINE vfloat vsubf(vfloat x, vfloat y)
{
    return x - y;
}
// Lane-wise multiplication of two float vectors: returns x * y.
// Written with the plain '*' operator instead of _mm_mul_ps so the compiler
// is free to fuse it with a following add/subtract into a single FMA
// instruction on targets that provide one.
static INLINE vfloat vmulf(vfloat x, vfloat y)
{
    return x * y;
}
// Lane-wise division of two float vectors: returns x / y.
// Written with the plain '/' operator instead of _mm_div_ps, consistent with
// the other arithmetic helpers, leaving instruction selection to the compiler.
static INLINE vfloat vdivf(vfloat x, vfloat y)
{
    return x / y;
}
// Also don't use intrinsics here: Some chips support FMA instructions with 3 and 4 operands
// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // destination has to be one of a,b,c
// 4 operands: d = a*b+c // destination does not have to be one of a,b,c
// gcc will use the form which fits best when intrinsics are not used. With intrinsics that's not possible
// Lane-wise multiply-add of three float vectors: returns x * y + z.
// The product and the sum are deliberately kept in one expression, written
// with plain operators, so the compiler may emit a fused multiply-add.
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z)
{
    return (x * y) + z;
}
static INLINE vfloat vrecf(vfloat x)
{