Amaze Demosaic: Speedup, cleaned code, changed Nyquist code

heckflosse 2016-01-26 13:10:38 +01:00
parent 2017a0e592
commit ee665d6790
4 changed files with 418 additions and 420 deletions

File diff suppressed because it is too large


@@ -39,8 +39,15 @@ typedef __m128i vint2;
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#endif
-// Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
-#define LC2VFU(a) _mm_shuffle_ps( LVFU(a), _mm_loadu_ps( (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) )
+static INLINE vfloat LC2VFU(float &a)
+{
+    // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 ));
+}
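
For reference, a minimal standalone sketch (not part of the commit; plain __m128 stands in for vfloat, and the names data/out are only illustrative) of what the new LC2VFU helper computes:

#include <emmintrin.h>
#include <cstdio>

int main()
{
    float data[8] = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f };
    __m128 a1 = _mm_loadu_ps(data);     // data[0..3]
    __m128 a2 = _mm_loadu_ps(data + 4); // data[4..7]
    // keep the even-indexed floats: { data[0], data[2], data[4], data[6] }
    __m128 v = _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
    float out[4];
    _mm_storeu_ps(out, v);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 0 2 4 6
    return 0;
}

Turning the macro into an inline function also means the argument is evaluated only once, where the old macro expanded `a` into both LVFU(a) and (&a) + 4.
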
// Store a vector of 4 floats in a[0],a[2],a[4] and a[6]
#if defined(__x86_64__) && defined(__SSE4_1__)


@@ -83,8 +83,8 @@ template<typename _Tp>
inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
    // calculate a * b + (1 - a) * c
    // following is valid:
-    // intp(a, b+x, c+x) = vintpf(a, b, c) + x
-    // intp(a, b*x, c*x) = vintpf(a, b, c) * x
+    // intp(a, b+x, c+x) = intp(a, b, c) + x
+    // intp(a, b*x, c*x) = intp(a, b, c) * x
    return a * (b-c) + c;
}
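
The corrected identities follow directly from intp(a, b, c) = a*b + (1 - a)*c = a*(b - c) + c. A standalone sketch (not from the commit) that checks both:

#include <cstdio>

template<typename T>
inline T intp(T a, T b, T c)
{
    // a * b + (1 - a) * c, with one multiplication instead of two
    return a * (b - c) + c;
}

int main()
{
    float a = 0.25f, b = 8.f, c = 4.f, x = 3.f;
    // intp(a, b+x, c+x) == intp(a, b, c) + x
    printf("%g == %g\n", intp(a, b + x, c + x), intp(a, b, c) + x); // 8 == 8
    // intp(a, b*x, c*x) == intp(a, b, c) * x
    printf("%g == %g\n", intp(a, b * x, c * x), intp(a, b, c) * x); // 15 == 15
    return 0;
}
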


@@ -910,11 +910,20 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return _mm_blendv_ps(y,x,(vfloat)mask);
}
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return _mm_blendv_epi8(y,x,mask);
+}
#else
// three instructions when using SSE2
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
#endif
static INLINE vfloat vselfzero(vmask mask, vfloat x) {
@@ -928,6 +937,16 @@ static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
    return _mm_andnot_ps((vfloat)mask, x);
}
+static INLINE vint vselizero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vselc(mask, x, ZEROV)
+    return _mm_and_si128(mask, x);
+}
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vselc(mask, ZEROV, x)
+    return _mm_andnot_si128(mask, x);
+}
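
The vselc variants and the new zero shortcuts are branchless per-lane selects. A standalone sketch (not from the commit; __m128/__m128i stand in for vfloat/vmask) of the three-instruction SSE2 form:

#include <emmintrin.h>
#include <cstdio>

// mask lanes must be all-ones or all-zeros, as produced by SSE compares
static inline __m128 sse2_select(__m128i mask, __m128 x, __m128 y)
{
    __m128 m = _mm_castsi128_ps(mask);
    return _mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, y)); // (m & x) | (~m & y)
}

int main()
{
    __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // lanes: 1 2 3 4
    __m128 y = _mm_set_ps(8.f, 7.f, 6.f, 5.f);  // lanes: 5 6 7 8
    __m128i mask = _mm_set_epi32(0, -1, 0, -1); // pick x in lanes 0 and 2
    float out[4];
    _mm_storeu_ps(out, sse2_select(mask, x, y));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 1 6 3 8
    return 0;
}

When one select operand is known to be all zeros, the or and one and fall away, which is why vselizero and vselinotzero above need only a single and/andnot instruction.
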
static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
@@ -1362,9 +1381,12 @@ static INLINE vfloat vaddc2vfu(float &a)
    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
    vfloat a1 = _mm_loadu_ps( &a );
    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
}
+static INLINE vfloat vadivapb (vfloat a, vfloat b) {
+    return a / (a+b);
+}
#endif // __SSE2__
#endif // SLEEFSSEAVX
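
The new vadivapb helper folds the recurring a / (a + b) normalization (for instance, turning one of two interpolation weights into a blend factor) into one call. A standalone sketch with explicit intrinsics (__m128 for vfloat; the committed version can write a / (a+b) directly because vfloat arithmetic compiles under GCC/Clang vector extensions):

#include <xmmintrin.h>
#include <cstdio>

static inline __m128 vadivapb(__m128 a, __m128 b)
{
    return _mm_div_ps(a, _mm_add_ps(a, b)); // a / (a + b), per lane
}

int main()
{
    __m128 a = _mm_set1_ps(1.f);
    __m128 b = _mm_set1_ps(3.f);
    float w[4];
    _mm_storeu_ps(w, vadivapb(a, b));
    printf("%g\n", w[0]); // prints: 0.25, i.e. 1 / (1 + 3)
    return 0;
}
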