diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 692dcbcb9..c9d4645a2 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -164,92 +164,94 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const
     return clut_profile;
 }
 
-void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) const
+void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const
 {
     const unsigned int level = clut_level; // This is important
 
-    const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one);
-    const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one);
-    const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one);
-
     const unsigned int level_square = level * level;
 
-    const unsigned int color = red + green * level + blue * level_square;
+    for (std::size_t column = 0; column < line_size; ++column, ++r, ++g, ++b, out_rgbx += 4) { // each iteration consumes one sample per channel and produces four output floats
+        const unsigned int red = std::min(flevel_minus_two, *r * flevel_minus_one); // scaled red sample truncated to an index, capped at flevel_minus_two
+        const unsigned int green = std::min(flevel_minus_two, *g * flevel_minus_one); // same for green
+        const unsigned int blue = std::min(flevel_minus_two, *b * flevel_minus_one); // same for blue
+
+        const unsigned int color = red + green * level + blue * level_square; // flat entry index: red has stride 1, green stride level, blue stride level * level
 
 #ifndef __SSE2__
-    r = r * flevel_minus_one - red;
-    g = g * flevel_minus_one - green;
-    b = b * flevel_minus_one - blue;
+        const float re = *r * flevel_minus_one - red;
+        const float gr = *g * flevel_minus_one - green;
+        const float bl = *b * flevel_minus_one - blue;
 
-    size_t index = color * 4;
+        size_t index = color * 4;
 
-    float tmp1[4] ALIGNED16;
-    tmp1[0] = intp<float>(r, clut_image.data[index + 4], clut_image.data[index]);
-    tmp1[1] = intp<float>(r, clut_image.data[index + 5], clut_image.data[index + 1]);
-    tmp1[2] = intp<float>(r, clut_image.data[index + 6], clut_image.data[index + 2]);
+        float tmp1[4] ALIGNED16;
+        tmp1[0] = intp<float>(re, clut_image.data[index + 4], clut_image.data[index]);
+        tmp1[1] = intp<float>(re, clut_image.data[index + 5], clut_image.data[index + 1]);
+        tmp1[2] = intp<float>(re, clut_image.data[index + 6], clut_image.data[index + 2]);
 
-    index = (color + level) * 4;
+        index = (color + level) * 4;
 
-    float tmp2[4] ALIGNED16;
-    tmp2[0] = intp<float>(r, clut_image.data[index + 4], clut_image.data[index]);
-    tmp2[1] = intp<float>(r, clut_image.data[index + 5], clut_image.data[index + 1]);
-    tmp2[2] = intp<float>(r, clut_image.data[index + 6], clut_image.data[index + 2]);
+        float tmp2[4] ALIGNED16;
+        tmp2[0] = intp<float>(re, clut_image.data[index + 4], clut_image.data[index]);
+        tmp2[1] = intp<float>(re, clut_image.data[index + 5], clut_image.data[index + 1]);
+        tmp2[2] = intp<float>(re, clut_image.data[index + 6], clut_image.data[index + 2]);
 
-    out_rgbx[0] = intp<float>(g, tmp2[0], tmp1[0]);
-    out_rgbx[1] = intp<float>(g, tmp2[1], tmp1[1]);
-    out_rgbx[2] = intp<float>(g, tmp2[2], tmp1[2]);
+        out_rgbx[0] = intp<float>(gr, tmp2[0], tmp1[0]);
+        out_rgbx[1] = intp<float>(gr, tmp2[1], tmp1[1]);
+        out_rgbx[2] = intp<float>(gr, tmp2[2], tmp1[2]);
 
-    index = (color + level_square) * 4;
+        index = (color + level_square) * 4;
 
-    tmp1[0] = intp<float>(r, clut_image.data[index + 4], clut_image.data[index]);
-    tmp1[1] = intp<float>(r, clut_image.data[index + 5], clut_image.data[index + 1]);
-    tmp1[2] = intp<float>(r, clut_image.data[index + 6], clut_image.data[index + 2]);
+        tmp1[0] = intp<float>(re, clut_image.data[index + 4], clut_image.data[index]);
+        tmp1[1] = intp<float>(re, clut_image.data[index + 5], clut_image.data[index + 1]);
+        tmp1[2] = intp<float>(re, clut_image.data[index + 6], clut_image.data[index + 2]);
 
-    index = (color + level + level_square) * 4;
+        index = (color + level + level_square) * 4;
 
-    tmp2[0] = intp<float>(r, clut_image.data[index + 4], clut_image.data[index]);
-    tmp2[1] = intp<float>(r, clut_image.data[index + 5], clut_image.data[index + 1]);
-    tmp2[2] = intp<float>(r, clut_image.data[index + 6], clut_image.data[index + 2]);
+        tmp2[0] = intp<float>(re, clut_image.data[index + 4], clut_image.data[index]);
+        tmp2[1] = intp<float>(re, clut_image.data[index + 5], clut_image.data[index + 1]);
+        tmp2[2] = intp<float>(re, clut_image.data[index + 6], clut_image.data[index + 2]);
 
-    tmp1[0] = intp<float>(g, tmp2[0], tmp1[0]);
-    tmp1[1] = intp<float>(g, tmp2[1], tmp1[1]);
-    tmp1[2] = intp<float>(g, tmp2[2], tmp1[2]);
+        tmp1[0] = intp<float>(gr, tmp2[0], tmp1[0]);
+        tmp1[1] = intp<float>(gr, tmp2[1], tmp1[1]);
+        tmp1[2] = intp<float>(gr, tmp2[2], tmp1[2]);
 
-    out_rgbx[0] = intp<float>(b, tmp1[0], out_rgbx[0]);
-    out_rgbx[1] = intp<float>(b, tmp1[1], out_rgbx[1]);
-    out_rgbx[2] = intp<float>(b, tmp1[2], out_rgbx[2]);
+        out_rgbx[0] = intp<float>(bl, tmp1[0], out_rgbx[0]);
+        out_rgbx[1] = intp<float>(bl, tmp1[1], out_rgbx[1]);
+        out_rgbx[2] = intp<float>(bl, tmp1[2], out_rgbx[2]);
 #else
-    const vfloat v_tmp = _mm_set_ps(0.0f, b, g, r) * _mm_load_ps1(&flevel_minus_one);
-    const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp)));
+        const vfloat v_tmp = _mm_set_ps(0.0f, *b, *g, *r) * _mm_load_ps1(&flevel_minus_one);
+        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp)));
 
-    size_t index = color * 4;
+        size_t index = color * 4;
 
-    const vfloat v_r = PERMUTEPS(v_rgb, 0x00);
+        const vfloat v_r = PERMUTEPS(v_rgb, 0x00);
 
-    vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
 
-    index = (color + level) * 4;
+        index = (color + level) * 4;
 
-    vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
 
-    const vfloat v_g = PERMUTEPS(v_rgb, 0x55);
+        const vfloat v_g = PERMUTEPS(v_rgb, 0x55);
 
-    vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1);
+        vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1);
 
-    index = (color + level_square) * 4;
+        index = (color + level_square) * 4;
 
-    v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
 
-    index = (color + level + level_square) * 4;
+        index = (color + level + level_square) * 4;
 
-    v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
 
-    v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1);
+        v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1);
 
-    const vfloat v_b = PERMUTEPS(v_rgb, 0xAA);
+        const vfloat v_b = PERMUTEPS(v_rgb, 0xAA);
 
-    _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out));
+        _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out));
 #endif
+    }
 }
 
 rtengine::CLUTStore& rtengine::CLUTStore::getInstance()
diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h
index ed3491fbe..6203e4e61 100644
--- a/rtengine/clutstore.h
+++ b/rtengine/clutstore.h
@@ -24,7 +24,7 @@ public:
     virtual Glib::ustring getFilename() const = 0;
     virtual Glib::ustring getProfile() const = 0;
 
-    virtual void getRGB(float r, float g, float b, float out_rgbx[4]) const = 0;
+    virtual void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const = 0;
 
     static void splitClutFilename(
         const Glib::ustring& filename,
@@ -48,7 +48,7 @@ public:
     Glib::ustring getFilename() const override;
     Glib::ustring getProfile() const override;
 
-    void getRGB(float r, float g, float b, float out_rgbx[4]) const override;
+    void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const override; // reads line_size samples from each of r, g, b and writes 4 floats per sample to out_rgbx; the SSE2 path stores with _mm_store_ps, so out_rgbx must be 16-byte aligned
 
 private:
     AlignedBuffer<std::uint16_t> clut_image;
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 2e15da916..d274806bb 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -16,6 +16,7 @@
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
  */
+#include <memory>
 #include <cmath>
 #include <glib.h>
 #include <glibmm.h>
@@ -3224,8 +3225,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
         }
     }
 
-    double filmSimCorrectedStrength = double(params->filmSimulation.strength) / 100.;
-    double filmSimSourceStrength = double(100 - params->filmSimulation.strength) / 100.;
+    float filmSimCorrectedStrength = static_cast<float>(params->filmSimulation.strength) / 100.0f; // weight applied to the CLUT output when blending; strength is presumably a percentage - confirm against the params definition
+    float filmSimSourceStrength = 1.0f - filmSimCorrectedStrength; // complementary weight applied to the unmodified source value
 
     const float exp_scale = pow (2.0, expcomp);
     const float comp = (max(0.0, expcomp) + 1.0) * hlcompr / 100.0;
@@ -4354,13 +4355,24 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             sourceR = CLIP<float>( Color::gamma_srgb( sourceR ) );
                             sourceG = CLIP<float>( Color::gamma_srgb( sourceG ) );
                             sourceB = CLIP<float>( Color::gamma_srgb( sourceB ) );
+                        }
+
+                        const std::size_t line_size = std::min(TS, tW - jstart); // number of columns handed to the CLUT for this tile row
+                        std::size_t out_rgbx_size = sizeof(float) * 4 * (line_size + 16); // size in BYTES: 4 floats per column plus slack so a 16-byte aligned start always fits
+                        std::unique_ptr<float[]> out_rgbx_buf(new float[out_rgbx_size / sizeof(float)]); // array form of unique_ptr so the buffer is released with delete[]
+                        void* out_rgbx_ptr = out_rgbx_buf.get();
+                        float* const out_rgbx = static_cast<float*>(std::align(16, sizeof(float) * 4 * line_size, out_rgbx_ptr, out_rgbx_size)); // std::align expects its size and space arguments in bytes
+                        colorLUT->getRGB(line_size, rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx);
+
+                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
+                            float &sourceR = rtemp[ti * TS + tj];
+                            float &sourceG = gtemp[ti * TS + tj];
+                            float &sourceB = btemp[ti * TS + tj];
 
-                            float out_rgbx[4] ALIGNED16;
-                            colorLUT->getRGB( sourceR, sourceG, sourceB, out_rgbx );
                             // apply strength
-                            sourceR = out_rgbx[0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength;
-                            sourceG = out_rgbx[1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength;
-                            sourceB = out_rgbx[2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength;
+                            sourceR = out_rgbx[tj * 4 + 0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength;
+                            sourceG = out_rgbx[tj * 4 + 1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength;
+                            sourceB = out_rgbx[tj * 4 + 2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength;
                             // apply inverse gamma sRGB
                             sourceR = Color::igamma_srgb( sourceR );
                             sourceG = Color::igamma_srgb( sourceG );