diff --git a/rtengine/alignedbuffer.h b/rtengine/alignedbuffer.h
index dd9d7b278..560f0884f 100644
--- a/rtengine/alignedbuffer.h
+++ b/rtengine/alignedbuffer.h
@@ -21,6 +21,10 @@
 #include <cstdlib>
 #include <utility>
 
+inline size_t padToAlignment(size_t size, size_t align = 16) {
+    return align * ((size + align - 1) / align); // `size` rounded up to a multiple of `align`; `align` must be non-zero
+}
+
 // Aligned buffer that should be faster
 template <class T> class AlignedBuffer
 {
diff --git a/rtengine/curves.h b/rtengine/curves.h
index c616c94da..e443d430c 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -800,6 +800,13 @@ class StandardToneCurve : public ToneCurve
 {
 public:
     void Apply(float& r, float& g, float& b) const;
+
+    // Applies the tone curve in place to `r[start]` through `r[end - 1]` (`end`
+    // is exclusive), and likewise to `g` and `b`. Takes an SSE path when built
+    // for SSE2 on x86_64; `r`, `g` and `b` must have the same address modulo 16.
+    void BatchApply(
+            const size_t start, const size_t end,
+            float *r, float *g, float *b) const;
 };
 
 class AdobeToneCurve : public ToneCurve
@@ -874,6 +881,52 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
     g = lutToneCurve[g];
     b = lutToneCurve[b];
 }
+inline void StandardToneCurve::BatchApply(
+        const size_t start, const size_t end, // half-open index range [start, end)
+        float *r, float *g, float *b) const { // per-channel arrays, overwritten in place
+    assert (lutToneCurve);
+    // Replaces r[i], g[i] and b[i] by their lutToneCurve lookups for every i in [start, end).
+    // The SSE loop needs all three pointers to have the same address modulo 16; the code below
+    // only tests `r`, and the asserts verify that assumption for `g` and `b`.
+    assert (reinterpret_cast<uintptr_t>(r) % 16 == reinterpret_cast<uintptr_t>(g) % 16);
+    assert (reinterpret_cast<uintptr_t>(g) % 16 == reinterpret_cast<uintptr_t>(b) % 16);
+
+    size_t i = start;
+    while (true) { // scalar loop: left via `return` when done, or via `break` once aligned (SSE builds)
+        if (i >= end) {
+            // Whole range consumed: either it ended before an aligned address was reached,
+            // or this is a non-SSE build, where this loop processes every element.
+            return;
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+        } else if (reinterpret_cast<uintptr_t>(&r[i]) % 16 == 0) {
+            // &r[i] is 16-byte aligned: hand the rest of the range over to the SSE loop.
+            break;
+#endif
+        }
+        r[i] = lutToneCurve[r[i]];
+        g[i] = lutToneCurve[g[i]];
+        b[i] = lutToneCurve[b[i]];
+        i++;
+    }
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    for (; i + 3 < end; i += 4) { // four elements per channel per iteration
+        __m128i r_val = _mm_cvtps_epi32(LVF(r[i])); // LVF: project macro, presumably a 4-float load — confirm
+        __m128i g_val = _mm_cvtps_epi32(LVF(g[i]));
+        __m128i b_val = _mm_cvtps_epi32(LVF(b[i]));
+        STVF(r[i], lutToneCurve[r_val]); // NOTE(review): integer-vector index here, float index in the scalar loops — confirm both lookups agree
+        STVF(g[i], lutToneCurve[g_val]);
+        STVF(b[i], lutToneCurve[b_val]);
+    }
+
+    // Scalar tail: the at most three elements left over after the SSE loop.
+    for (; i < end; ++i) {
+        r[i] = lutToneCurve[r[i]];
+        g[i] = lutToneCurve[g[i]];
+        b[i] = lutToneCurve[b[i]];
+    }
+#endif
+}
 
 // Tone curve according to Adobe's reference implementation
 // values in 0xffff space
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 1d38f6be1..082799e62 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -23,6 +23,7 @@
 #include <omp.h>
 #endif
 
+#include "alignedbuffer.h"
 #include "rtengine.h"
 #include "improcfun.h"
 #include "curves.h"
@@ -3409,31 +3410,28 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
     #pragma omp parallel if (multiThread)
 #endif
     {
-        char *buffer;
+        size_t perChannelSizeBytes = padToAlignment(sizeof (float) * TS * TS + 4 * 64);
+        AlignedBuffer<float> buffer(3 * perChannelSizeBytes);
         char *editIFloatBuffer = nullptr;
         char *editWhateverBuffer = nullptr;
 
-        buffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
-        char *data;
-        data = (char*) ( ( uintptr_t (buffer) + uintptr_t (63)) / 64 * 64);
-
-        float *rtemp = (float (*))data;
-        float *gtemp = (float (*))         ((char*)rtemp + sizeof (float) * TS * TS + 4 * 64);
-        float *btemp = (float (*))         ((char*)gtemp + sizeof (float) * TS * TS + 8 * 64);
+        float *rtemp = buffer.data;
+        float *gtemp = &rtemp[perChannelSizeBytes / sizeof(float)];
+        float *btemp = &gtemp[perChannelSizeBytes / sizeof(float)];
         int istart;
         int jstart;
         int tW;
         int tH;
 
         // zero out the buffers
-        memset(buffer, 0, 3 * sizeof (float) * TS * TS + 20 * 64 + 63);
+        memset(rtemp, 0, 3 * perChannelSizeBytes);
 
         // Allocating buffer for the PipetteBuffer
         float *editIFloatTmpR = nullptr, *editIFloatTmpG = nullptr, *editIFloatTmpB = nullptr, *editWhateverTmp = nullptr;
 
         if (editImgFloat) {
             editIFloatBuffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
-            data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
+            char *data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
 
             editIFloatTmpR = (float (*))data;
             editIFloatTmpG = (float (*))         ((char*)editIFloatTmpR + sizeof (float) * TS * TS + 4 * 64);
@@ -3442,7 +3440,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
 
         if (editWhatever) {
             editWhateverBuffer = (char *) malloc (sizeof (float) * TS * TS + 20 * 64 + 63);
-            data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
+            char *data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
 
             editWhateverTmp = (float (*))data;
         }
@@ -3618,10 +3616,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 if (hasToneCurve1) {
                     if (curveMode == ToneCurveParams::TcMode::STD) { // Standard
                         for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
+                            const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
+                            userToneCurve.BatchApply (
+                                    0, tW - jstart,
+                                    &rtemp[ti * TS], &gtemp[ti * TS], &btemp[ti * TS]);
                         }
                     } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
                         for (int i = istart, ti = 0; i < tH; i++, ti++) {
@@ -4529,8 +4527,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 }
             }
 
-        free (buffer);
-
         if (editIFloatBuffer) {
             free (editIFloatBuffer);
         }