From 7655fce8f998dafbbf5149ddf56441ba98b8b538 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 11 Feb 2016 00:44:26 +0100
Subject: [PATCH] Shadows/Highlights Sharp mask, speedup using SSE4 for native
 builds

---
 rtengine/LUT.h        | 107 ++++++++++++++++++++++++++++--------------
 rtengine/helpersse2.h |   5 ++
 2 files changed, 76 insertions(+), 36 deletions(-)
diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index b6eb1a05c..965180c92 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -71,9 +71,7 @@
 #include <glibmm.h>
 #include <fstream>
 #endif
-#ifdef __SSE2__
-#include "sleefsseavx.c"
-#endif
+#include "opthelper.h"
 #include <assert.h>
 #include "rt_math.h"
 
@@ -91,10 +89,9 @@ protected:
 private:
     unsigned int owner;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-    __m128 maxsv __attribute__ ((aligned (16)));
-    __m128 sizev __attribute__ ((aligned (16)));
-    __m128i maxsiv __attribute__ ((aligned (16)));
-    __m128i sizeiv __attribute__ ((aligned (16)));
+    vfloat maxsv __attribute__ ((aligned (16)));
+    vfloat sizev __attribute__ ((aligned (16)));
+    vint sizeiv __attribute__ ((aligned (16)));
 #endif
 public:
     /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
@@ -120,10 +117,9 @@ public:
         maxs = size - 2;
         maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-        maxsv =  _mm_set1_ps( maxs );
-        maxsiv = _mm_cvttps_epi32( maxsv );
+        maxsv =  F2V( maxs );
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
-        sizev = _mm_set1_ps( size - 1 );
+        sizev = F2V( size - 1 );
 #endif
     }
     void operator ()(int s, int flags = 0xfffffff)
@@ -150,10 +146,9 @@ public:
         maxs = size - 2;
         maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-        maxsv =  _mm_set1_ps( maxs );
-        maxsiv = _mm_cvttps_epi32( maxsv );
+        maxsv =  F2V( maxs );
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
-        sizev = _mm_set1_ps( size - 1 );
+        sizev = F2V( size - 1 );
 #endif
     }
 
@@ -167,11 +162,11 @@ public:
 
         assert (s > 0);
 
-        if (source == NULL) {
+        if (!source) {
             printf("source is NULL!\n");
         }
 
-        assert (source != NULL);
+        assert (source != nullptr);
 #endif
         dirty = false;  // Assumption
         clip = flags;
@@ -182,10 +177,9 @@ public:
         maxs = size - 2;
         maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-        maxsv =  _mm_set1_ps( size - 2);
-        maxsiv = _mm_cvttps_epi32( maxsv );
+        maxsv =  F2V( size - 2);
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
-        sizev = _mm_set1_ps( size - 1 );
+        sizev = F2V( size - 1 );
 #endif
 
         for (int i = 0; i < s; i++) {
@@ -195,7 +189,7 @@ public:
 
     LUT()
     {
-        data = NULL;
+        data = nullptr;
         reset();
     }
 
@@ -237,10 +231,10 @@ public:
         if (this != &rhs) {
             if (rhs.size > this->size) {
                 delete [] this->data;
-                this->data = NULL;
+                this->data = nullptr;
             }
 
-            if (this->data == NULL) {
+            if (this->data == nullptr) {
                 this->data = new T[rhs.size];
             }
 
@@ -252,10 +246,9 @@ public:
             this->maxs = this->size - 2;
             this->maxsf = (float)this->maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-            this->maxsv =  _mm_set1_ps( this->size - 2);
-            this->maxsiv = _mm_cvttps_epi32( this->maxsv );
+            this->maxsv =  F2V( this->size - 2);
             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
-            this->sizev = _mm_set1_ps( this->size - 1 );
+            this->sizev = F2V( this->size - 1 );
 #endif
         }
 
@@ -268,14 +261,15 @@ public:
     }
 
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-    __m128 operator[](__m128 indexv ) const
+/*
+    vfloat operator[](vfloat indexv ) const
     {
 //      printf("don't use this operator. It's not ready for production");
         return _mm_setzero_ps();
 
         // convert floats to ints
-        __m128i idxv =  _mm_cvttps_epi32( indexv );
-        __m128 tempv, resultv, p1v, p2v;
+        vint idxv =  _mm_cvttps_epi32( indexv );
+        vfloat tempv, resultv, p1v, p2v;
         vmask maxmask = vmaskf_gt(indexv, maxsv);
         idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
         vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
@@ -327,15 +321,55 @@ public:
         p2v = _mm_move_ss( p2v, tempv);
         // now p2v is 3 2 1 0
 
-        __m128 diffv = indexv - _mm_cvtepi32_ps ( idxv );
+        vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
         diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
         resultv = p1v + p2v * diffv;
         return resultv  ;
     }
-
-    __m128 operator[](__m128i idxv ) const
+*/
+#ifdef __SSE4_1__
+    vfloat operator[](vint idxv ) const // SSE4.1 path: returns the four LUT entries addressed by the four integer lanes of idxv
     {
-        __m128 tempv, p1v;
+        vfloat tempv, p1v;
+        idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); // clamp every lane to [0, sizeiv] with integer min/max; sizeiv is set to size - 1 wherever the LUT is (re)sized
+        // access the LUT 4 times and shuffle the values into p1v (the lane listings below read highest lane first, lowest lane last)
+
+        int idx;
+
+        // get 4th value
+        idx = _mm_extract_epi32(idxv, 3); // read lane 3 of the clamped index vector
+        tempv = _mm_load_ss(&data[idx]);
+        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0)); // replicate the loaded value into all four lanes
+        // now p1v is 3 3 3 3
+
+        // get 3rd value
+        idx = _mm_extract_epi32(idxv, 2);
+        tempv = _mm_load_ss(&data[idx]);
+        p1v = _mm_move_ss( p1v, tempv); // overwrite only the lowest lane with the new value
+        // now p1v is 3 3 3 2
+
+        // get 2nd value
+        idx = _mm_extract_epi32(idxv, 1);
+        tempv = _mm_load_ss(&data[idx]);
+        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0)); // copy the low lane pair into the high pair to free the lowest lane
+        // now p1v is 3 2 3 2
+        p1v = _mm_move_ss( p1v, tempv );
+        // now p1v is 3 2 3 1
+
+        // get 1st value
+        idx = _mm_cvtsi128_si32(idxv); // lane 0 is read directly, no extract needed
+        tempv = _mm_load_ss(&data[idx]);
+        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0)); // keep lanes 3 and 2, move the old lowest lane up into lane 1
+        // now p1v is 3 2 1 1
+        p1v = _mm_move_ss( p1v, tempv );
+        // now p1v is 3 2 1 0
+
+        return p1v;
+    }
+#else
+    vfloat operator[](vint idxv ) const
+    {
+        vfloat tempv, p1v;
         tempv = _mm_cvtepi32_ps(idxv);
         tempv = _mm_min_ps( tempv, sizev );
         idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( )  ));
@@ -346,7 +380,7 @@ public:
         // get 4th value
         idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
         tempv = _mm_load_ss(&data[idx]);
-        p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0));
+        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
         // now p1v is 3 3 3 3
 
         // get 3rd value
@@ -358,7 +392,7 @@ public:
         // get 2nd value
         idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
         tempv = _mm_load_ss(&data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0));
+        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
         // now p1v is 3 2 3 2
         p1v = _mm_move_ss( p1v, tempv );
         // now p1v is 3 2 3 1
@@ -366,13 +400,14 @@ public:
         // get 1st value
         idx = _mm_cvtsi128_si32 (idxv);
         tempv = _mm_load_ss(&data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0));
+        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
         // now p1v is 3 2 1 1
         p1v = _mm_move_ss( p1v, tempv );
         // now p1v is 3 2 1 0
 
         return p1v;
     }
+#endif
 #endif
 
     // use with float indices
@@ -465,7 +500,7 @@ public:
         }
 
         dirty = true;
-        data = NULL;
+        data = nullptr;
         owner = 1;
         size = 0;
         upperBound = 0;
@@ -484,7 +519,7 @@ class HueLUT : public LUTf
 {
 public:
     HueLUT() : LUTf() {}
-    HueLUT(bool createArray) : LUTf()
+    explicit HueLUT(bool createArray) : LUTf()
     {
         if (createArray) {
             this->operator () (501, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 3f2bf6299..3e4365e99 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -39,6 +39,11 @@ typedef __m128i vint2;
 #define STVFU(x,y) _mm_storeu_ps(&x,y)
 #endif
 
+#if defined(__x86_64__) && defined(__AVX__) // PERMUTEPS(a,mask): rearrange the lanes of the single vector a as selected by mask
+#define PERMUTEPS(a,mask) _mm_permute_ps(a,mask)
+#else // no AVX: shuffle with a as both sources; a is expanded twice here, so pass an expression without side effects
+#define PERMUTEPS(a,mask) _mm_shuffle_ps(a,a,mask)
+#endif
 
 static INLINE vfloat LC2VFU(float &a)
 {