diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 9d0cea35a..5d859f90c 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -56,7 +56,7 @@ bool loadFile(
             img_src.convertColorSpace(img_float.get(), icm, curr_wb);
         }
 
-        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 4);
+        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValue
 
         std::size_t index = 0;
 
@@ -81,9 +81,9 @@ bool loadFile(
 vfloat getClutValue(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
 {
 #ifdef __SSE4_1__
-    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(*reinterpret_cast<const __m128i*>(clut_image.data + index)));
+    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index)))); // explicit unaligned 128-bit load (touches 8 uint16 values starting at index); the low 4 are zero-extended to int32 and converted to float
 #else
-    return _mm_cvtpu16_ps(*reinterpret_cast<const __m64*>(clut_image.data + index));
+    return _mm_set_ps(clut_image.data[index + 3], clut_image.data[index + 2], clut_image.data[index + 1], clut_image.data[index]); // no SSE4.1: build the vector from 4 scalar reads; _mm_set_ps takes lanes high-to-low, so data[index] lands in lane 0
 #endif
 }
 #endif
@@ -205,8 +205,8 @@ void rtengine::HaldCLUT::getRGB(
         out_rgbx[2] = intp<float>(strength, out_rgbx[2], *b);
 #else
         const vfloat v_in = _mm_set_ps(0.0f, *b, *g, *r);
-        const vfloat v_tmp = v_in * _mm_load_ps1(&flevel_minus_one);
-        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp)));
+        const vfloat v_tmp = v_in * F2V(flevel_minus_one);
+        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(F2V(flevel_minus_two), v_tmp)));
 
         size_t index = color * 4;
 
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 4a0fe684e..e9ae98e2a 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3209,10 +3209,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
     bool clutAndWorkingProfilesAreSame = false;
     TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work;
 #ifdef __SSE2__
-    vfloat v_work2xyz[3][3];
-    vfloat v_xyz2clut[3][3];
-    vfloat v_clut2xyz[3][3];
-    vfloat v_xyz2work[3][3];
+    vfloat v_work2xyz[3][3] ALIGNED16;
+    vfloat v_xyz2clut[3][3] ALIGNED16;
+    vfloat v_clut2xyz[3][3] ALIGNED16;
+    vfloat v_xyz2work[3][3] ALIGNED16;
 #endif
 
     if ( params->filmSimulation.enabled && !params->filmSimulation.clutFilename.empty() ) {
@@ -3227,6 +3227,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working );
                 clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() );
 #ifdef __SSE2__
+
                 for (int i = 0; i < 3; ++i) {
                     for (int j = 0; j < 3; ++j) {
                         v_work2xyz[i][j] = F2V(work2xyz[i][j]);
@@ -3235,6 +3236,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                         v_clut2xyz[i][j] = F2V(clut2xyz[i][j]);
                     }
                 }
+
 #endif
             }
         }
@@ -3452,6 +3454,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
             editWhateverTmp = (float(*))data;
         }
 
+        float out_rgbx[4 * TS] ALIGNED16; // Line buffer for CLUT
 
 #ifdef _OPENMP
         #pragma omp for schedule(dynamic) collapse(2)
@@ -4352,8 +4355,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
 
                 // Film Simulations
                 if (hald_clut) {
-                    float out_rgbx[4 * TS] ALIGNED16;
-
 
                     for (int i = istart, ti = 0; i < tH; i++, ti++) {
                         if (!clutAndWorkingProfilesAreSame) {
@@ -4361,10 +4362,11 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             int j = jstart;
                             int tj = 0;
 #ifdef __SSE2__
+
                             for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);
 
                                 vfloat x;
                                 vfloat y;
@@ -4372,11 +4374,13 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
                                 Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);
 
-                                STVF(rtemp[ti * TS + tj], sourceR);
-                                STVF(gtemp[ti * TS + tj], sourceG);
-                                STVF(btemp[ti * TS + tj], sourceB);
+                                STVFU(rtemp[ti * TS + tj], sourceR);
+                                STVFU(gtemp[ti * TS + tj], sourceG);
+                                STVFU(btemp[ti * TS + tj], sourceB);
                             }
+
 #endif
+
                             for (; j < tW; j++, tj++) {
                                 float &sourceR = rtemp[ti * TS + tj];
                                 float &sourceG = gtemp[ti * TS + tj];
@@ -4425,10 +4429,11 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             int j = jstart;
                             int tj = 0;
 #ifdef __SSE2__
+
                             for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);
 
                                 vfloat x;
                                 vfloat y;
@@ -4436,11 +4441,13 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
                                 Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work);
 
-                                STVF(rtemp[ti * TS + tj], sourceR);
-                                STVF(gtemp[ti * TS + tj], sourceG);
-                                STVF(btemp[ti * TS + tj], sourceB);
+                                STVFU(rtemp[ti * TS + tj], sourceR);
+                                STVFU(gtemp[ti * TS + tj], sourceG);
+                                STVFU(btemp[ti * TS + tj], sourceB);
                             }
+
 #endif
+
                             for (; j < tW; j++, tj++) {
                                 float &sourceR = rtemp[ti * TS + tj];
                                 float &sourceG = gtemp[ti * TS + tj];
@@ -7185,6 +7192,7 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
             wipv[i][j] = F2V(wiprof[i][j]);
         }
     }
+
 #endif
 
 #ifdef _OPENMP
@@ -7194,9 +7202,10 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
     for(int i = 0; i < H; i++) {
         int j = 0;
 #ifdef __SSE2__
+
         for(; j < W - 3; j += 4) {
             vfloat X, Y, Z;
-            vfloat R,G,B;
+            vfloat R, G, B;
             Color::Lab2XYZ(LVFU(src.L[i][j]), LVFU(src.a[i][j]), LVFU(src.b[i][j]), X, Y, Z);
             Color::xyz2rgb(X, Y, Z, R, G, B, wipv);
             STVFU(dst.r(i, j), R);
@@ -7205,6 +7214,7 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
         }
 
 #endif
+
         for(; j < W; j++) {
             float X, Y, Z;
             Color::Lab2XYZ(src.L[i][j], src.a[i][j], src.b[i][j], X, Y, Z);