From 3ccfb9b2031111ee72666b81c3579069c17e0949 Mon Sep 17 00:00:00 2001
From: gatoatigrado <gatoatigrado@gmail.com>
Date: Mon, 25 Dec 2017 00:41:43 -0500
Subject: [PATCH 1/5] Use AlignedBuffer helper class in rgbProc, use SSE in
 standard tone curve application.

---
 rtengine/alignedbuffer.h |  4 +++
 rtengine/curves.h        | 53 ++++++++++++++++++++++++++++++++++++++++
 rtengine/improcfun.cc    | 30 ++++++++++-------------
 3 files changed, 70 insertions(+), 17 deletions(-)
diff --git a/rtengine/alignedbuffer.h b/rtengine/alignedbuffer.h
index dd9d7b278..560f0884f 100644
--- a/rtengine/alignedbuffer.h
+++ b/rtengine/alignedbuffer.h
@@ -21,6 +21,10 @@
 #include <cstdlib>
 #include <utility>
 
+inline size_t padToAlignment(size_t size, size_t align = 16) {
+    return align * ((size + align - 1) / align);
+}
+
 // Aligned buffer that should be faster
 template <class T> class AlignedBuffer
 {
diff --git a/rtengine/curves.h b/rtengine/curves.h
index c616c94da..e443d430c 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -800,6 +800,13 @@ class StandardToneCurve : public ToneCurve
 {
 public:
     void Apply(float& r, float& g, float& b) const;
+
+    // Applies the tone curve to `r`, `g`, `b` arrays, starting at `r[start]`
+    // and ending at `r[end]` (and respectively for `b` and `g`). Uses SSE
+    // and requires that `r`, `g`, and `b` pointers have the same alignment.
+    void BatchApply(
+            const size_t start, const size_t end,
+            float *r, float *g, float *b) const;
 };
 
 class AdobeToneCurve : public ToneCurve
@@ -874,6 +881,52 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
     g = lutToneCurve[g];
     b = lutToneCurve[b];
 }
+inline void StandardToneCurve::BatchApply(
+        const size_t start, const size_t end,
+        float *r, float *g, float *b) const {
+    assert (lutToneCurve);
+
+    // All pointers must have the same alignment for SSE usage. In the loop body below,
+    // we will only check `r`, assuming that the same result would hold for `g` and `b`.
+    assert (reinterpret_cast<uintptr_t>(r) % 16 == reinterpret_cast<uintptr_t>(g) % 16);
+    assert (reinterpret_cast<uintptr_t>(g) % 16 == reinterpret_cast<uintptr_t>(b) % 16);
+
+    size_t i = start;
+    while (true) {
+        if (i >= end) {
+            // If we get to the end before getting to an aligned address, just return.
+            // (Or, for non-SSE mode, if we get to the end.)
+            return;
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+        } else if (reinterpret_cast<uintptr_t>(&r[i]) % 16 == 0) {
+            // Otherwise, we get to the first aligned address; go to the SSE part.
+            break;
+#endif
+        }
+        r[i] = lutToneCurve[r[i]];
+        g[i] = lutToneCurve[g[i]];
+        b[i] = lutToneCurve[b[i]];
+        i++;
+    }
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    for (; i + 3 < end; i += 4) {
+        __m128i r_val = _mm_cvtps_epi32(LVF(r[i]));
+        __m128i g_val = _mm_cvtps_epi32(LVF(g[i]));
+        __m128i b_val = _mm_cvtps_epi32(LVF(b[i]));
+        STVF(r[i], lutToneCurve[r_val]);
+        STVF(g[i], lutToneCurve[g_val]);
+        STVF(b[i], lutToneCurve[b_val]);
+    }
+
+    // Remainder in non-SSE.
+    for (; i < end; ++i) {
+        r[i] = lutToneCurve[r[i]];
+        g[i] = lutToneCurve[g[i]];
+        b[i] = lutToneCurve[b[i]];
+    }
+#endif
+}
 
 // Tone curve according to Adobe's reference implementation
 // values in 0xffff space
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 1d38f6be1..082799e62 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -23,6 +23,7 @@
 #include <omp.h>
 #endif
 
+#include "alignedbuffer.h"
 #include "rtengine.h"
 #include "improcfun.h"
 #include "curves.h"
@@ -3409,31 +3410,28 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
     #pragma omp parallel if (multiThread)
 #endif
     {
-        char *buffer;
+        size_t perChannelSizeBytes = padToAlignment(sizeof (float) * TS * TS + 4 * 64);
+        AlignedBuffer<float> buffer(3 * perChannelSizeBytes);
         char *editIFloatBuffer = nullptr;
         char *editWhateverBuffer = nullptr;
 
-        buffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
-        char *data;
-        data = (char*) ( ( uintptr_t (buffer) + uintptr_t (63)) / 64 * 64);
-
-        float *rtemp = (float (*))data;
-        float *gtemp = (float (*))         ((char*)rtemp + sizeof (float) * TS * TS + 4 * 64);
-        float *btemp = (float (*))         ((char*)gtemp + sizeof (float) * TS * TS + 8 * 64);
+        float *rtemp = buffer.data;
+        float *gtemp = &rtemp[perChannelSizeBytes / sizeof(float)];
+        float *btemp = &gtemp[perChannelSizeBytes / sizeof(float)];
         int istart;
         int jstart;
         int tW;
         int tH;
 
         // zero out the buffers
-        memset(buffer, 0, 3 * sizeof (float) * TS * TS + 20 * 64 + 63);
+        memset(rtemp, 0, 3 * perChannelSizeBytes);
 
         // Allocating buffer for the PipetteBuffer
         float *editIFloatTmpR = nullptr, *editIFloatTmpG = nullptr, *editIFloatTmpB = nullptr, *editWhateverTmp = nullptr;
 
         if (editImgFloat) {
             editIFloatBuffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
-            data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
+            char *data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
 
             editIFloatTmpR = (float (*))data;
             editIFloatTmpG = (float (*))         ((char*)editIFloatTmpR + sizeof (float) * TS * TS + 4 * 64);
@@ -3442,7 +3440,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
 
         if (editWhatever) {
             editWhateverBuffer = (char *) malloc (sizeof (float) * TS * TS + 20 * 64 + 63);
-            data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
+            char *data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
 
             editWhateverTmp = (float (*))data;
         }
@@ -3618,10 +3616,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 if (hasToneCurve1) {
                     if (curveMode == ToneCurveParams::TcMode::STD) { // Standard
                         for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
+                            const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
+                            userToneCurve.BatchApply (
+                                    0, tW - jstart,
+                                    &rtemp[ti * TS], &gtemp[ti * TS], &btemp[ti * TS]);
                         }
                     } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
                         for (int i = istart, ti = 0; i < tH; i++, ti++) {
@@ -4529,8 +4527,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 }
             }
 
-        free (buffer);
-
         if (editIFloatBuffer) {
             free (editIFloatBuffer);
         }

From ebc92e1c350b028ae16a113051b8c3e46d4da0f1 Mon Sep 17 00:00:00 2001
From: gatoatigrado <gatoatigrado@gmail.com>
Date: Mon, 25 Dec 2017 14:55:14 -0500
Subject: [PATCH 2/5] New SSE interpolating routine for LUT<float>.

---
 rtengine/LUT.h    | 94 +++++++++++++++++------------------------------
 rtengine/curves.h |  7 ++--
 2 files changed, 38 insertions(+), 63 deletions(-)

diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index 2701c4ffc..4f245634e 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -95,6 +95,8 @@ protected:
     // list of variables ordered to improve cache speed
     unsigned int maxs;
     float maxsf;
+    // possibly-more-correct value for sse routine (see unit test for details)
+    float maxIndexFloat;
     T * data;
     unsigned int clip;
     unsigned int size;
@@ -129,6 +131,7 @@ public:
         upperBound = size - 1;
         maxs = size - 2;
         maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
         maxsv =  F2V( maxs );
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
@@ -158,6 +161,7 @@ public:
         upperBound = size - 1;
         maxs = size - 2;
         maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
         maxsv =  F2V( maxs );
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
@@ -228,6 +232,7 @@ public:
             this->upperBound = rhs.upperBound;
             this->maxs = this->size - 2;
             this->maxsf = (float)this->maxs;
+            this->maxIndexFloat = ((float)this->upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
             this->maxsv =  F2V( this->size - 2);
             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
@@ -293,72 +298,37 @@ public:
     }
 
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-/*
-    vfloat operator[](vfloat indexv ) const
+    vfloat operator[](vfloat indexv) const
     {
-//      printf("don't use this operator. It's not ready for production");
-        return _mm_setzero_ps();
+        static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
 
-        // convert floats to ints
-        vint idxv =  _mm_cvttps_epi32( indexv );
-        vfloat tempv, resultv, p1v, p2v;
-        vmask maxmask = vmaskf_gt(indexv, maxsv);
-        idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
-        vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
-        idxv = _mm_castps_si128(vself(minmask, _mm_setzero_ps(), _mm_castsi128_ps(idxv)));
-        // access the LUT 4 times and shuffle the values into p1v and p2v
+        // Clamp and convert to integer values. Extract out of SSE register because all
+        // lookup operations use regular addresses.
+        vfloat clampedIndexes = _mm_max_ps(
+                _mm_setzero_ps(),
+                _mm_min_ps(_mm_set1_ps(maxIndexFloat), indexv));
+        vint indexes = _mm_cvttps_epi32(clampedIndexes);
+        int indexArray[4];
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
 
-        int idx;
+        // Load data from the table. This reads more than necessary, but there don't seem
+        // to exist more granular operations (though we could try non-SSE).
+        // Cast to int for convenience in the next operation (partial transpose).
+        vint values[4];
+        for (int i = 0; i < 4; ++i) {
+            values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
+        }
 
-        // get 4th value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0));
-        p2v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        // now p1v is 3 3 3 3
-        //     p2v is 3 3 3 3
+        // Partial 4x4 transpose operation. We want two new vectors, the first consisting
+        // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
+        __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
+        __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
+        vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
+        vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
 
-        // get 3rd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_move_ss( p1v, tempv);
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p1v is 3 3 3 2
-        //     p2v is 3 3 3 2
-
-        // get 2nd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0));
-        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(1, 0, 1, 0));
-        // now p1v is 3 2 3 2
-        // now p2v is 3 2 3 2
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 3 1
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p1v is 3 2 3 1
-
-        // get 1st value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(0, 0, 0, 0)));
-        tempv = LVFU(data[idx]);
-        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p1v is 3 2 1 1
-        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p2v is 3 2 1 1
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 1 0
-        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
-        p2v = _mm_move_ss( p2v, tempv);
-        // now p2v is 3 2 1 0
-
-        vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
-        diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
-        resultv = p1v + p2v * diffv;
-        return resultv  ;
+        vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
+        return (_mm_set1_ps(1.0f) - diff) * lower + (diff * upper);
     }
-*/
 #ifdef __SSE4_1__
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
     vfloat operator[](vint idxv ) const
@@ -456,6 +426,8 @@ public:
             }
 
             idx = 0;
+            // Note: Maybe this should be 'idx > maxsf'? See unit test where a LUT with
+            // values [10, 11, 12, 13] gets looked up at 2.5 and returns 12.5.
         } else if (index > maxsf) {
             if (clip & LUT_CLIP_ABOVE) {
                 return data[upperBound];
@@ -543,6 +515,7 @@ public:
         maxs = 0;
         maxsf = 0.f;
         clip = 0;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
     }
 
     // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
@@ -652,6 +625,7 @@ public:
         upperBound = size - 1;
         maxs = size - 2;
         maxsf = (float)maxs;
+        maxIndexFloat = ((float)upperBound) - 1e-5;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
         maxsv =  F2V( size - 2);
         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
diff --git a/rtengine/curves.h b/rtengine/curves.h
index e443d430c..15ab96624 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -881,6 +881,7 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
     g = lutToneCurve[g];
     b = lutToneCurve[b];
 }
+
 inline void StandardToneCurve::BatchApply(
         const size_t start, const size_t end,
         float *r, float *g, float *b) const {
@@ -911,9 +912,9 @@ inline void StandardToneCurve::BatchApply(
 
 #if defined( __SSE2__ ) && defined( __x86_64__ )
     for (; i + 3 < end; i += 4) {
-        __m128i r_val = _mm_cvtps_epi32(LVF(r[i]));
-        __m128i g_val = _mm_cvtps_epi32(LVF(g[i]));
-        __m128i b_val = _mm_cvtps_epi32(LVF(b[i]));
+        __m128 r_val = LVF(r[i]);
+        __m128 g_val = LVF(g[i]);
+        __m128 b_val = LVF(b[i]);
         STVF(r[i], lutToneCurve[r_val]);
         STVF(g[i], lutToneCurve[g_val]);
         STVF(b[i], lutToneCurve[b_val]);

From 88ebaf618a223f3bb5ccddf15bbb252a26851046 Mon Sep 17 00:00:00 2001
From: gatoatigrado <gatoatigrado@gmail.com>
Date: Mon, 25 Dec 2017 18:13:05 -0500
Subject: [PATCH 3/5] Code review changes

---
 rtengine/LUT.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index 4f245634e..9a700bc17 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -95,7 +95,8 @@ protected:
     // list of variables ordered to improve cache speed
     unsigned int maxs;
     float maxsf;
-    // possibly-more-correct value for sse routine (see unit test for details)
+    // For the SSE routine operator[](vfloat), we just clip float lookup values
+    // to just below the max value.
     float maxIndexFloat;
     T * data;
     unsigned int clip;
@@ -125,7 +126,10 @@ public:
 #endif
         dirty = true;
         clip = flags;
-        data = new T[s];
+        // Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
+        // The routine would still produce the right answer, but might cause issues
+        // with address/heap checking programs.
+        data = new T[s + 3];
         owner = 1;
         size = s;
         upperBound = size - 1;
@@ -155,7 +159,8 @@ public:
 
         dirty = true; // Assumption!
         clip = flags;
-        data = new T[s];
+        // See comment in constructor.
+        data = new T[s + 3];
         owner = 1;
         size = s;
         upperBound = size - 1;
@@ -222,7 +227,8 @@ public:
             }
 
             if (this->data == nullptr) {
-                this->data = new T[rhs.size];
+                // See comment in constructor.
+                this->data = new T[rhs.size + 3];
             }
 
             this->clip = rhs.clip;
@@ -327,7 +333,7 @@ public:
         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
 
         vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
-        return (_mm_set1_ps(1.0f) - diff) * lower + (diff * upper);
+        return vintpf(diff, upper, lower);
     }
 #ifdef __SSE4_1__
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
@@ -426,9 +432,7 @@ public:
             }
 
             idx = 0;
-            // Note: Maybe this should be 'idx > maxsf'? See unit test where a LUT with
-            // values [10, 11, 12, 13] gets looked up at 2.5 and returns 12.5.
-        } else if (index > maxsf) {
+        } else if (idx > maxs) {
             if (clip & LUT_CLIP_ABOVE) {
                 return data[upperBound];
             }

From 59e31ed36f179dbb3faff72181e920d892c8f513 Mon Sep 17 00:00:00 2001
From: gatoatigrado <gatoatigrado@gmail.com>
Date: Mon, 25 Dec 2017 22:46:05 -0500
Subject: [PATCH 4/5] Code review changes

---
 rtengine/LUT.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index 9a700bc17..b58144a6f 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -310,9 +310,7 @@ public:
 
         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = _mm_max_ps(
-                _mm_setzero_ps(),
-                _mm_min_ps(_mm_set1_ps(maxIndexFloat), indexv));
+        vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv));
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);

From 6dab5742dd6541b5def11589e826c598791e2d14 Mon Sep 17 00:00:00 2001
From: gatoatigrado <gatoatigrado@gmail.com>
Date: Mon, 25 Dec 2017 22:50:40 -0500
Subject: [PATCH 5/5] Add comment and assertions

---
 rtengine/LUT.h    | 7 +++++++
 rtengine/curves.h | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index b58144a6f..29147df8b 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -200,6 +200,10 @@ public:
         clip = flags;
     }
 
+    int getClip() const {
+        return clip;
+    }
+
     /** @brief Get the number of element in the LUT (i.e. dimension of the array)
      *  For a LUT(500), it will return 500
      *  @return number of element in the array
@@ -304,6 +308,9 @@ public:
     }
 
 #if defined( __SSE2__ ) && defined( __x86_64__ )
+
+    // NOTE: This version requires LUTs which clip at upper and lower bounds
+    // (which is the default).
     vfloat operator[](vfloat indexv) const
     {
         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
diff --git a/rtengine/curves.h b/rtengine/curves.h
index 15ab96624..e8b65c33d 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -886,6 +886,8 @@ inline void StandardToneCurve::BatchApply(
         const size_t start, const size_t end,
         float *r, float *g, float *b) const {
     assert (lutToneCurve);
+    assert (lutToneCurve.getClip() & LUT_CLIP_BELOW);
+    assert (lutToneCurve.getClip() & LUT_CLIP_ABOVE);
 
     // All pointers must have the same alignment for SSE usage. In the loop body below,
     // we will only check `r`, assuming that the same result would hold for `g` and `b`.