Amaze Demosaic: Speedup, cleaned code, changed Nyquist code
This commit is contained in: parent 2017a0e592 · commit ee665d6790
(The diff of one file is suppressed because it is too large; the visible hunks below are from the SSE helper header.)
@@ -39,8 +39,15 @@ typedef __m128i vint2;
 #define STVFU(x,y) _mm_storeu_ps(&x,y)
 #endif
 
-// Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
-#define LC2VFU(a) _mm_shuffle_ps( LVFU(a), _mm_loadu_ps( (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) )
+static INLINE vfloat LC2VFU(float &a)
+{
+    // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 ));
+}
 
 // Store a vector of 4 floats in a[0],a[2],a[4] and a[6]
 #if defined(__x86_64__) && defined(__SSE4_1__)
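The old macro and its replacement compute the same thing: gather the even-indexed elements of 8 consecutive floats into one vector, the access pattern for reading a Bayer color channel that occupies every other column. One plausible motivation for the change is that the macro expands its argument twice (once via LVFU(a), once via (&a) + 4), while the inline function evaluates it once and gives the compiler a typed interface. A minimal standalone sketch of the operation (the name lc2vfu_sketch and the pointer interface are illustrative, not from the diff):

    #include <xmmintrin.h>
    #include <cstdio>

    static inline __m128 lc2vfu_sketch(const float *a)
    {
        __m128 a1 = _mm_loadu_ps(a);     // a[0..3]
        __m128 a2 = _mm_loadu_ps(a + 4); // a[4..7]
        // take lanes 0 and 2 of each half -> a[0], a[2], a[4], a[6]
        return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
    }

    int main()
    {
        float in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        float out[4];
        _mm_storeu_ps(out, lc2vfu_sketch(in));
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 2 4 6
    }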
@@ -83,8 +83,8 @@ template<typename _Tp>
 inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
     // calculate a * b + (1 - a) * c
     // following is valid:
-    // intp(a, b+x, c+x) = vintpf(a, b, c) + x
-    // intp(a, b*x, c*x) = vintpf(a, b, c) * x
+    // intp(a, b+x, c+x) = intp(a, b, c) + x
+    // intp(a, b*x, c*x) = intp(a, b, c) * x
     return a * (b-c) + c;
 }
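The comment fix matters: the identities are properties of intp itself, not of some separate vintpf. Both follow from one line of algebra, e.g. intp(a, b+x, c+x) = a*((b+x)-(c+x)) + (c+x) = a*(b-c) + c + x = intp(a, b, c) + x. A compile-and-run check (values chosen to be exact in binary floating point, so the asserts compare exactly):

    #include <cassert>

    template <typename T>
    static inline T intp(T a, T b, T c) { return a * (b - c) + c; }

    int main()
    {
        float a = 0.25f, b = 8.f, c = 4.f, x = 3.f;
        assert(intp(a, b + x, c + x) == intp(a, b, c) + x); // shift both endpoints
        assert(intp(a, b * x, c * x) == intp(a, b, c) * x); // scale both endpoints
    }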
@@ -910,11 +910,20 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca
 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
     return _mm_blendv_ps(y,x,(vfloat)mask);
 }
 
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return _mm_blendv_epi8(y,x,mask);
+}
+
 #else
 // three instructions when using SSE2
 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
     return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
 }
 
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+
 #endif
 
 static INLINE vfloat vselfzero(vmask mask, vfloat x) {
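vself and the new vselc are element-wise selects: where a mask lane is all-ones, take x; where it is all-zero, take y. SSE4.1 does this in a single blend instruction, while the SSE2 fallback spells it out as (mask & x) | (~mask & y), hence the "three instructions" comment. A sketch showing the two paths agree on compare-generated masks (assumes compilation with SSE4.1 enabled, e.g. -msse4.1; variable names are illustrative):

    #include <smmintrin.h> // SSE4.1
    #include <cstdio>

    int main()
    {
        __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);        // lanes: 1 2 3 4
        __m128 y = _mm_set_ps(40.f, 30.f, 20.f, 10.f);    // lanes: 10 20 30 40
        __m128 mask = _mm_cmplt_ps(x, _mm_set1_ps(2.5f)); // lanes 0,1 all-ones

        // SSE2-style select: (mask & x) | (~mask & y)
        __m128 sel2 = _mm_or_ps(_mm_and_ps(mask, x), _mm_andnot_ps(mask, y));
        // SSE4.1 select: a single blend on the mask
        __m128 sel41 = _mm_blendv_ps(y, x, mask);

        float a[4], b[4];
        _mm_storeu_ps(a, sel2);
        _mm_storeu_ps(b, sel41);
        std::printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]); // 1 2 30 40
        std::printf("%g %g %g %g\n", b[0], b[1], b[2], b[3]); // 1 2 30 40
    }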
@@ -928,6 +937,16 @@ static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
     return _mm_andnot_ps((vfloat)mask, x);
 }
 
+static INLINE vint vselizero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vselc(mask, x, ZEROV)
+    return _mm_and_si128(mask, x);
+}
+
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vselc(mask, ZEROV, x)
+    return _mm_andnot_si128(mask, x);
+}
+
 static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
     vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
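These helpers exploit the special case where one select arm is zero: (mask & x) | (~mask & 0) reduces to mask & x, and (mask & 0) | (~mask & x) reduces to ~mask & x, so a single AND or ANDNOT replaces the three-instruction select. A quick standalone check of that equivalence (vselc_sse2 mirrors the SSE2 vselc above; ZEROV is the library's zero-vector constant, modeled here with _mm_setzero_si128):

    #include <emmintrin.h>
    #include <cassert>

    static __m128i vselc_sse2(__m128i m, __m128i x, __m128i y)
    {
        return _mm_or_si128(_mm_and_si128(m, x), _mm_andnot_si128(m, y));
    }

    int main()
    {
        __m128i x    = _mm_set_epi32(4, 3, 2, 1);
        __m128i mask = _mm_set_epi32(-1, 0, -1, 0); // all-ones / all-zero lanes
        __m128i zero = _mm_setzero_si128();

        __m128i a = _mm_and_si128(mask, x);    // vselizero(mask, x)
        __m128i b = vselc_sse2(mask, x, zero); // vselc(mask, x, ZEROV)
        assert(_mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF);

        __m128i c = _mm_andnot_si128(mask, x); // vselinotzero(mask, x)
        __m128i d = vselc_sse2(mask, zero, x); // vselc(mask, ZEROV, x)
        assert(_mm_movemask_epi8(_mm_cmpeq_epi32(c, d)) == 0xFFFF);
    }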
@@ -1365,6 +1384,9 @@ static INLINE vfloat vaddc2vfu(float &a)
     return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
 }
 
+static INLINE vfloat vadivapb (vfloat a, vfloat b) {
+    return a / (a+b);
+}
+
 #endif // __SSE2__
 #endif // SLEEFSSEAVX
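The added vadivapb(a, b) = a/(a+b) turns two non-negative weights into a blend factor in [0,1], which composes directly with intp: intp(vadivapb(a, b), v1, v2) = (a*v1 + b*v2)/(a+b), i.e. v1 weighted by a and v2 by b. A tiny sketch (the name vadivapb_sketch is illustrative):

    #include <xmmintrin.h>
    #include <cstdio>

    static inline __m128 vadivapb_sketch(__m128 a, __m128 b)
    {
        // caller must guarantee a+b != 0 in every lane
        return _mm_div_ps(a, _mm_add_ps(a, b));
    }

    int main()
    {
        __m128 a = _mm_set1_ps(1.f);
        __m128 b = _mm_set1_ps(3.f);
        float out[4];
        _mm_storeu_ps(out, vadivapb_sketch(a, b));
        std::printf("%g\n", out[0]); // 0.25
    }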