New SSE interpolating routine for LUT<float>.

2017-12-25 14:55:14 -05:00
parent 3ccfb9b203
commit ebc92e1c35
2 changed files with 38 additions and 63 deletions
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -95,6 +95,8 @@ protected:
    // list of variables ordered to improve cache speed
    unsigned int maxs;
    float maxsf;
+    // possibly-more-correct value for sse routine (see unit test for details)
+    float maxIndexFloat;
    T * data;
    unsigned int clip;
    unsigned int size;
@@ -129,6 +131,7 @@ public:
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( maxs );
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
@@ -158,6 +161,7 @@ public:
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( maxs );
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
@@ -228,6 +232,7 @@ public:
            this->upperBound = rhs.upperBound;
            this->maxs = this->size - 2;
            this->maxsf = (float)this->maxs;
+            this->maxIndexFloat = ((float)this->upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
            this->maxsv =  F2V( this->size - 2);
            this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
@@ -293,72 +298,37 @@ public:
    }

 #if defined( __SSE2__ ) && defined( __x86_64__ )
-/*
-    vfloat operator[](vfloat indexv ) const
+    vfloat operator[](vfloat indexv) const
    {
-//      printf("don't use this operator. It's not ready for production");
-        return _mm_setzero_ps();
+        static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");

-        // convert floats to ints
-        vint idxv =  _mm_cvttps_epi32( indexv );
-        vfloat tempv, resultv, p1v, p2v;
-        vmask maxmask = vmaskf_gt(indexv, maxsv);
-        idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
-        vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
-        idxv = _mm_castps_si128(vself(minmask, _mm_setzero_ps(), _mm_castsi128_ps(idxv)));
-        // access the LUT 4 times and shuffle the values into p1v and p2v
+        // Clamp and convert to integer values. Extract out of SSE register because all
+        // lookup operations use regular addresses.
+        vfloat clampedIndexes = _mm_max_ps(
+                _mm_setzero_ps(),
+                _mm_min_ps(_mm_set1_ps(maxIndexFloat), indexv));
+        vint indexes = _mm_cvttps_epi32(clampedIndexes);
+        int indexArray[4];
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);

-        int idx;
+        // Load data from the table. This reads more than necessary, but there don't seem
+        // to exist more granular operations (though we could try non-SSE).
+        // Cast to int for convenience in the next operation (partial transpose).
+        vint values[4];
+        for (int i = 0; i < 4; ++i) {
+            values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
+        }

-        // get 4th value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0));
-        p2v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        // now p1v is 3 3 3 3
-        //     p2v is 3 3 3 3
+        // Partial 4x4 transpose operation. We want two new vectors, the first consisting
+        // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
+        __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
+        __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
+        vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
+        vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));

-        // get 3rd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_move_ss( p1v, tempv);
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p1v is 3 3 3 2
-        //     p2v is 3 3 3 2
-
-        // get 2nd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0));
-        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(1, 0, 1, 0));
-        // now p1v is 3 2 3 2
-        // now p2v is 3 2 3 2
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 3 1
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p1v is 3 2 3 1
-
-        // get 1st value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(0, 0, 0, 0)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p1v is 3 2 1 1
-        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p2v is 3 2 1 1
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 1 0
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p2v is 3 2 1 0
-
-        vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
-        diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
-        resultv = p1v + p2v * diffv;
-        return resultv  ;
+        vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
+        return (_mm_set1_ps(1.0f) - diff) * lower + (diff * upper);
    }
-*/
 #ifdef __SSE4_1__
    template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
    vfloat operator[](vint idxv ) const
@@ -456,6 +426,8 @@ public:
            }

            idx = 0;
+            // Note: Maybe this should be 'idx > maxsf'? See unit test where a LUT with
+            // values [10, 11, 12, 13] gets looked up at 2.5 and returns 12.5.
        } else if (index > maxsf) {
            if (clip & LUT_CLIP_ABOVE) {
                return data[upperBound];
@@ -543,6 +515,7 @@ public:
        maxs = 0;
        maxsf = 0.f;
        clip = 0;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
    }

    // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
@@ -652,6 +625,7 @@ public:
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( size - 2);
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -881,6 +881,7 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
    g = lutToneCurve[g];
    b = lutToneCurve[b];
 }
+
 inline void StandardToneCurve::BatchApply(
        const size_t start, const size_t end,
        float *r, float *g, float *b) const {
@@ -911,9 +912,9 @@ inline void StandardToneCurve::BatchApply(

 #if defined( __SSE2__ ) && defined( __x86_64__ )
    for (; i + 3 < end; i += 4) {
-        __m128i r_val = _mm_cvtps_epi32(LVF(r[i]));
-        __m128i g_val = _mm_cvtps_epi32(LVF(g[i]));
-        __m128i b_val = _mm_cvtps_epi32(LVF(b[i]));
+        __m128 r_val = LVF(r[i]);
+        __m128 g_val = LVF(g[i]);
+        __m128 b_val = LVF(b[i]);
        STVF(r[i], lutToneCurve[r_val]);
        STVF(g[i], lutToneCurve[g_val]);
        STVF(b[i], lutToneCurve[b_val]);