Code review and speedup for Amaze Demosaic
parent e5b1abdc3b
commit 2017a0e592
File diff suppressed because it is too large.
@@ -4,7 +4,6 @@
 #ifdef __GNUC__
 #define INLINE __inline
-//#define INLINE __attribute__((always_inline))
 #else
 #define INLINE inline
 #endif
@@ -48,20 +47,20 @@ typedef __m128i vint2;
 // SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself
 #define STC2VFU(a,v) {\
     __m128 TST1V = _mm_loadu_ps(&a);\
-    __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\
+    __m128 TST2V = _mm_unpacklo_ps(v,v);\
     _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\
     TST1V = _mm_loadu_ps((&a)+4);\
-    TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\
+    TST2V = _mm_unpackhi_ps(v,v);\
     _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\
 }
 #else
 #define STC2VFU(a,v) {\
     __m128 TST1V = _mm_loadu_ps(&a);\
-    __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\
+    __m128 TST2V = _mm_unpacklo_ps(v,v);\
     vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\
     _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\
     TST1V = _mm_loadu_ps((&a)+4);\
-    TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\
+    TST2V = _mm_unpackhi_ps(v,v);\
     _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\
 }
 #endif
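Note on the STC2VFU change: _mm_unpacklo_ps(v,v) produces { v0,v0,v1,v1 } and _mm_unpackhi_ps(v,v) produces { v2,v2,v3,v3 }, exactly the lane patterns the replaced _mm_shuffle_ps calls built, so the surrounding store/blend logic is unchanged. A minimal standalone sketch (hypothetical test program, not part of the patch; assumes an SSE-capable compiler) that prints both forms side by side:

    // Hypothetical standalone check; build e.g. with g++ -msse2.
    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);                     // v = { 1, 2, 3, 4 }
        __m128 lo_old = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 0, 0)); // { v0, v0, v1, v1 }
        __m128 lo_new = _mm_unpacklo_ps(v, v);                         // { v0, v0, v1, v1 }
        __m128 hi_old = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 2, 2)); // { v2, v2, v3, v3 }
        __m128 hi_new = _mm_unpackhi_ps(v, v);                         // { v2, v2, v3, v3 }

        float o[4], n[4];
        _mm_storeu_ps(o, lo_old); _mm_storeu_ps(n, lo_new);
        std::printf("lo: old %g %g %g %g  new %g %g %g %g\n", o[0], o[1], o[2], o[3], n[0], n[1], n[2], n[3]);
        _mm_storeu_ps(o, hi_old); _mm_storeu_ps(n, hi_new);
        std::printf("hi: old %g %g %g %g  new %g %g %g %g\n", o[0], o[1], o[2], o[3], n[0], n[1], n[2], n[3]);
        return 0;
    }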
@@ -78,5 +78,15 @@ inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d)
 {
     return std::max(d, std::max(c, std::max(a, b)));
 }
+
+template<typename _Tp>
+inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
+    // calculate a * b + (1 - a) * c
+    // the following identities hold:
+    // intp(a, b+x, c+x) = intp(a, b, c) + x
+    // intp(a, b*x, c*x) = intp(a, b, c) * x
+    return a * (b-c) + c;
+}
+
 }
 #endif
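The new intp() is the usual linear interpolation a*b + (1-a)*c rewritten as a*(b-c) + c, which costs one multiplication instead of two: a*b + (1-a)*c = a*b + c - a*c = a*(b-c) + c. A short scalar usage sketch (hypothetical values, not part of the patch):

    // Hypothetical usage: blend two pixel values with weight 0.25.
    #include <cstdio>

    template<typename _Tp>
    inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
        return a * (b - c) + c;   // == a*b + (1-a)*c
    }

    int main()
    {
        float blended = intp(0.25f, 100.f, 200.f); // 0.25*100 + 0.75*200 = 175
        std::printf("%g\n", blended);
        return 0;
    }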
@@ -1316,7 +1316,9 @@ return vmaxf( b, vminf(a,c));
 }

 static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){
-    return vself( vmaskf_lt(b,c), LIMV(a,b,c), LIMV(a,c,b));
+    // written to clamp a to the range [b,c], but it is in fact the median of a, b and c, so the result is independent of the argument order:
+    // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
+    return vmaxf(vminf(a,b), vminf(vmaxf(a,b),c));
 }

 static INLINE vfloat SQRV(vfloat a){
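The new ULIMV body is the branch-free median-of-three, max(min(a,b), min(max(a,b),c)). A standalone scalar sketch (hypothetical test values, not part of the patch) that checks the order-independence claimed in the comment over all six permutations:

    #include <algorithm>
    #include <cstdio>

    static float median3(float a, float b, float c)
    {
        // same formula as the new ULIMV, in scalar form
        return std::max(std::min(a, b), std::min(std::max(a, b), c));
    }

    int main()
    {
        float v[3] = { 1.f, 2.f, 3.f };   // hypothetical test values
        std::sort(v, v + 3);
        do {                              // every argument order yields the middle value, 2
            std::printf("median3(%g, %g, %g) = %g\n", v[0], v[1], v[2], median3(v[0], v[1], v[2]));
        } while (std::next_permutation(v, v + 3));
        return 0;
    }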
@@ -1324,17 +1326,45 @@ static INLINE vfloat SQRV(vfloat a){
 }

 static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
+    // conditionally swap the elements of two vfloats
     vfloat temp = vself(condition, a, b); // the values which fit the condition
     condition = vnotm(condition); // invert the condition
     a = vself(condition, a, b); // the values which fit the inverted condition
     b = temp;
 }

-static inline float vhadd( vfloat a )
-{
+static inline float vhadd( vfloat a ) {
+    // returns a[0] + a[1] + a[2] + a[3]
     a += _mm_movehl_ps(a, a);
     return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
 }

+static INLINE vfloat vmul2f(vfloat a){
+    // fastest way to multiply by 2
+    return a + a;
+}
+
+static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
+    // calculate a * b + (1 - a) * c (interpolate two values)
+    // the following identities hold:
+    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
+    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c;
+}
+
+static INLINE vfloat vdup(vfloat a){
+    // returns { a[0],a[0],a[1],a[1] }
+    return _mm_unpacklo_ps( a, a );
+}
+
+static INLINE vfloat vaddc2vfu(float &a)
+{
+    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
+}
+
+
 #endif // __SSE2__
 #endif // SLEEFSSEAVX
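Of the new helpers, vaddc2vfu() is the least obvious: the two shuffles de-interleave eight consecutive floats into their even-index and odd-index lanes, which are then added pairwise. A standalone sketch (hypothetical input values, not part of the patch; _mm_add_ps is used in place of the GCC vector '+' extension):

    // Hypothetical standalone check; build e.g. with g++ -msse2.
    #include <xmmintrin.h>
    #include <cstdio>

    typedef __m128 vfloat;

    static inline vfloat vaddc2vfu(float &a)
    {
        // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
        vfloat a1 = _mm_loadu_ps(&a);
        vfloat a2 = _mm_loadu_ps((&a) + 4);
        return _mm_add_ps(_mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)),   // even lanes { a0,a2,a4,a6 }
                          _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1)));  // odd lanes  { a1,a3,a5,a7 }
    }

    int main()
    {
        float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        float out[4];
        _mm_storeu_ps(out, vaddc2vfu(in[0]));
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // expect 3 7 11 15
        return 0;
    }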