From 1edfb0c6f79d72acb6f759e2df30a11503404b59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Tue, 10 May 2016 20:39:20 +0200
Subject: [PATCH 1/4] Switch `getClutValue` to vfloat2 and load/store
 source[RGB] unaligned

Ingo had some cleanup suggestions in #3154 which I tried to realize with
this commit. Although switching to `vfloat2` is a clever idea, I can see
no further speedup.
---
 rtengine/clutstore.cc | 27 +++++++++++++++++++--------
 rtengine/improcfun.cc | 34 ++++++++++++++++------------------
 2 files changed, 35 insertions(+), 26 deletions(-)
diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 5d859f90c..fc167734c 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -56,7 +56,7 @@ bool loadFile(
             img_src.convertColorSpace(img_float.get(), icm, curr_wb);
         }
 
-        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValue
+        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValues
 
         std::size_t index = 0;
 
@@ -78,12 +78,19 @@ bool loadFile(
 }
 
 #ifdef __SSE2__
-vfloat getClutValue(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
+vfloat2 getClutValues(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
 {
+    const __m128i v_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index));
 #ifdef __SSE4_1__
-    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index))));
+    return {
+        _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_values)),
+        _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(v_values, 8)))
+    };
 #else
-    return _mm_set_ps(clut_image.data[index + 3], clut_image.data[index + 2], clut_image.data[index + 1], clut_image.data[index]);
+    return {
+        _mm_cvtpu16_ps(_mm_movepi64_pi64(v_values)),
+        _mm_cvtpu16_ps(_mm_movepi64_pi64(_mm_srli_si128(v_values, 8)))
+    };
 #endif
 }
 #endif
@@ -212,11 +219,13 @@ void rtengine::HaldCLUT::getRGB(
 
         const vfloat v_r = PERMUTEPS(v_rgb, _MM_SHUFFLE(0, 0, 0, 0));
 
-        vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        vfloat2 v_clut_values = getClutValues(clut_image, index);
+        vfloat v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         index = (color + level) * 4;
 
-        vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        vfloat v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         const vfloat v_g = PERMUTEPS(v_rgb, _MM_SHUFFLE(1, 1, 1, 1));
 
@@ -224,11 +233,13 @@ void rtengine::HaldCLUT::getRGB(
 
         index = (color + level_square) * 4;
 
-        v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         index = (color + level + level_square) * 4;
 
-        v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1);
 
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index e9ae98e2a..bc70bf5e5 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3226,8 +3226,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() );
                 xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working );
                 clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() );
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                 for (int i = 0; i < 3; ++i) {
                     for (int j = 0; j < 3; ++j) {
                         v_work2xyz[i][j] = F2V(work2xyz[i][j]);
@@ -3236,8 +3236,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                         v_clut2xyz[i][j] = F2V(clut2xyz[i][j]);
                     }
                 }
-
 #endif
+
             }
         }
     }
@@ -4361,12 +4361,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             // Convert from working to clut profile
                             int j = jstart;
                             int tj = 0;
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                             for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
 
                                 vfloat x;
                                 vfloat y;
@@ -4374,11 +4374,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
                                 Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);
 
-                                STVFU(rtemp[ti * TS + tj], sourceR);
-                                STVFU(gtemp[ti * TS + tj], sourceG);
-                                STVFU(btemp[ti * TS + tj], sourceB);
+                                STVF(rtemp[ti * TS + tj], sourceR);
+                                STVF(gtemp[ti * TS + tj], sourceG);
+                                STVF(btemp[ti * TS + tj], sourceB);
                             }
-
 #endif
 
                             for (; j < tW; j++, tj++) {
@@ -4428,12 +4427,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             // Convert from clut to working profile
                             int j = jstart;
                             int tj = 0;
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                             for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
 
                                 vfloat x;
                                 vfloat y;
@@ -4441,11 +4440,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
                                 Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work);
 
-                                STVFU(rtemp[ti * TS + tj], sourceR);
-                                STVFU(gtemp[ti * TS + tj], sourceG);
-                                STVFU(btemp[ti * TS + tj], sourceB);
+                                STVF(rtemp[ti * TS + tj], sourceR);
+                                STVF(gtemp[ti * TS + tj], sourceG);
+                                STVF(btemp[ti * TS + tj], sourceB);
                             }
-
 #endif
 
                             for (; j < tW; j++, tj++) {

From f4d5c645de112bf9e5423707513115eaf82653e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Wed, 11 May 2016 20:01:17 +0200
Subject: [PATCH 2/4] Add Ingo's `clutstore_no_mmx.patch`

Ingo has provided a solution for the strange Windows crash with
`_mm_cvtpu16_ps()`: It was not an alignment problem, but the use of
MMX instructions which led to the SEGV.

Now Ingo's solutions omits MMX instructions altogether and is
nevertheless faster than the `_mm_set_ps()` workaround.

Many thanks to @heckflosse!
---
 rtengine/clutstore.cc | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index fc167734c..2724ed34e 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -87,9 +87,18 @@ vfloat2 getClutValues(const AlignedBuffer<std::uint16_t>& clut_image, size_t ind
         _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(v_values, 8)))
     };
 #else
+    vint lowval = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(1, 0, 1, 0));
+    vint highval = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(3, 2, 3, 2));
+    lowval = _mm_shufflelo_epi16(lowval, _MM_SHUFFLE(1, 1, 0, 0));
+    highval = _mm_shufflelo_epi16(highval, _MM_SHUFFLE(1, 1, 0, 0));
+    lowval = _mm_shufflehi_epi16(lowval, _MM_SHUFFLE(3, 3, 2, 2));
+    highval = _mm_shufflehi_epi16(highval, _MM_SHUFFLE(3, 3, 2, 2));
+    lowval = vandm(lowval, _mm_set1_epi32(0x0000ffff));
+    highval = vandm(highval, _mm_set1_epi32(0x0000ffff));
+
     return {
-        _mm_cvtpu16_ps(_mm_movepi64_pi64(v_values)),
-        _mm_cvtpu16_ps(_mm_movepi64_pi64(_mm_srli_si128(v_values, 8)))
+        _mm_cvtepi32_ps(lowval),
+        _mm_cvtepi32_ps(highval)
     };
 #endif
 }
@@ -261,12 +270,6 @@ void rtengine::HaldCLUT::splitClutFilename(
 {
     Glib::ustring basename = Glib::path_get_basename(filename);
 
-    Glib::ustring::size_type last_slash_pos = basename.rfind('/');
-
-    if (last_slash_pos == Glib::ustring::npos) {
-        last_slash_pos = basename.rfind('\\');
-    }
-
     const Glib::ustring::size_type last_dot_pos = basename.rfind('.');
 
     if (last_dot_pos != Glib::ustring::npos) {

From e8595890c5bf2dffa1d2cbd6e95c35b502e8fc68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Thu, 12 May 2016 20:21:17 +0200
Subject: [PATCH 3/4] Final cleanups

---
 rtengine/clutstore.cc | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 2724ed34e..f738c3da9 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -80,21 +80,23 @@ bool loadFile(
 #ifdef __SSE2__
 vfloat2 getClutValues(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
 {
-    const __m128i v_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index));
+    const vint v_values = _mm_loadu_si128(reinterpret_cast<const vint*>(clut_image.data + index));
 #ifdef __SSE4_1__
     return {
         _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_values)),
         _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(v_values, 8)))
     };
 #else
-    vint lowval = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(1, 0, 1, 0));
-    vint highval = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(3, 2, 3, 2));
-    lowval = _mm_shufflelo_epi16(lowval, _MM_SHUFFLE(1, 1, 0, 0));
-    highval = _mm_shufflelo_epi16(highval, _MM_SHUFFLE(1, 1, 0, 0));
-    lowval = _mm_shufflehi_epi16(lowval, _MM_SHUFFLE(3, 3, 2, 2));
-    highval = _mm_shufflehi_epi16(highval, _MM_SHUFFLE(3, 3, 2, 2));
-    lowval = vandm(lowval, _mm_set1_epi32(0x0000ffff));
-    highval = vandm(highval, _mm_set1_epi32(0x0000ffff));
+    const vint v_mask = _mm_set1_epi32(0x0000FFFF);
+
+    vint v_low = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(1, 0, 1, 0));
+    vint v_high = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(3, 2, 3, 2));
+    v_low = _mm_shufflelo_epi16(v_low, _MM_SHUFFLE(1, 1, 0, 0));
+    v_high = _mm_shufflelo_epi16(v_high, _MM_SHUFFLE(1, 1, 0, 0));
+    v_low = _mm_shufflehi_epi16(v_low, _MM_SHUFFLE(3, 3, 2, 2));
+    v_high = _mm_shufflehi_epi16(v_high, _MM_SHUFFLE(3, 3, 2, 2));
+    v_low = vandm(v_low, m_mask);
+    v_high = vandm(v_high, v_mask);
 
     return {
         _mm_cvtepi32_ps(lowval),

From aa2912531644cf66c2565c73776f13de67ef3e4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Thu, 12 May 2016 21:44:20 +0200
Subject: [PATCH 4/4] Read-ahead of `getClutValues()` is only one pixel now

---
 rtengine/clutstore.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index f738c3da9..5fd2cfae3 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -56,7 +56,7 @@ bool loadFile(
             img_src.convertColorSpace(img_float.get(), icm, curr_wb);
         }
 
-        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValues
+        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 4); // getClutValues() loads one pixel in advance
 
         std::size_t index = 0;