diff --git a/rtengine/curves.cc b/rtengine/curves.cc
index 508bd1640..dd25076fe 100644
--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@@ -1827,6 +1827,14 @@ void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float
     const AdobeToneCurve& adobeTC = static_cast<const AdobeToneCurve&>((const ToneCurve&) * this);
 
     for (size_t i = start; i < end; ++i) {
+        const bool oog_r = OOG(rc[i]);
+        const bool oog_g = OOG(gc[i]);
+        const bool oog_b = OOG(bc[i]);
+
+        if (oog_r && oog_g && oog_b) {
+            continue;
+        }
+        
         float r = CLIP(rc[i]);
         float g = CLIP(gc[i]);
         float b = CLIP(bc[i]);
@@ -1848,12 +1856,18 @@ void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float
 
         if (ar >= 65535.f && ag >= 65535.f && ab >= 65535.f) {
             // clip fast path, will also avoid strange colours of clipped highlights
-            rc[i] = gc[i] = bc[i] = 65535.f;
+            // was `rc[i] = gc[i] = bc[i] = 65535.f;` -- channels flagged by OOG() on input now keep their value
+            if (!oog_r) rc[i] = 65535.f;
+            if (!oog_g) gc[i] = 65535.f;
+            if (!oog_b) bc[i] = 65535.f;
             continue;
         }
 
         if (ar <= 0.f && ag <= 0.f && ab <= 0.f) {
-            rc[i] = gc[i] = bc[i] = 0;
+            // was `rc[i] = gc[i] = bc[i] = 0;` -- channels flagged by OOG() on input now keep their value
+            if (!oog_r) rc[i] = 0.f;
+            if (!oog_g) gc[i] = 0.f;
+            if (!oog_b) bc[i] = 0.f;
             continue;
         }
 
@@ -1893,9 +1907,9 @@ void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float
                 g = newg;
                 b = newb;
             }
-            rc[i] = r;
-            gc[i] = g;
-            bc[i] = b;
+            if (!oog_r) rc[i] = r;
+            if (!oog_g) gc[i] = g;
+            if (!oog_b) bc[i] = b;
 
             continue;
         }
@@ -2003,9 +2017,9 @@ void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float
                 b = newb;
             }
 
-            rc[i] = r;
-            gc[i] = g;
-            bc[i] = b;
+            if (!oog_r) rc[i] = r;
+            if (!oog_g) gc[i] = g;
+            if (!oog_b) bc[i] = b;
 
             continue;
         }
@@ -2066,9 +2080,9 @@ void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float
             g = newg;
             b = newb;
         }
-        rc[i] = r;
-        gc[i] = g;
-        bc[i] = b;
+        if (!oog_r) rc[i] = r;
+        if (!oog_g) gc[i] = g;
+        if (!oog_b) bc[i] = b;
     }
 }
 float PerceptualToneCurve::cf_range[2];
diff --git a/rtengine/curves.h b/rtengine/curves.h
index d8e443fc9..e489ec38c 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -45,6 +45,29 @@ namespace rtengine
 class ToneCurve;
 class ColorAppearance;
 
+namespace curves {
+// Apply `lut` to `val` in place; a value flagged by OOG() is shifted instead of looked up.
+inline void setLutVal(const LUTf &lut, float &val)
+{
+    if (!OOG(val)) {
+        val = lut[std::max(val, 0.f)]; // negative input is looked up at index 0
+    } else { // NOTE(review): assumes OOG() only flags values above MAXVALF -- confirm
+        const float m = lut[MAXVALF]; // curve output at the top of the table range
+        val += (m - MAXVALF); // shift by the curve's displacement at MAXVALF so the excess over MAXVALF survives
+    }
+}
+// Variant for a lookup done by the caller: lutval is the table result for val, maxval is lut[MAXVALF].
+inline void setLutVal(float &val, float lutval, float maxval)
+{
+    if (!OOG(val)) {
+        val = lutval;
+    } else {
+        val += (maxval - MAXVALF); // same shift as the LUTf overload; `maxval - val` reduced this to val = maxval
+    }
+}
+
+} // namespace curves
+
 class CurveFactory
 {
 
@@ -733,7 +756,7 @@ inline void Lightcurve::Apply (float& Li) const
 
     assert (lutColCurve);
 
-    Li = lutColCurve[Li];
+    curves::setLutVal(lutColCurve, Li);
 }
 
 class Brightcurve : public ColorAppearance
@@ -748,7 +771,7 @@ inline void Brightcurve::Apply (float& Br) const
 
     assert (lutColCurve);
 
-    Br = lutColCurve[Br];
+    curves::setLutVal(lutColCurve, Br);
 }
 
 class Chromacurve : public ColorAppearance
@@ -763,7 +786,7 @@ inline void Chromacurve::Apply (float& Cr) const
 
     assert (lutColCurve);
 
-    Cr = lutColCurve[Cr];
+    curves::setLutVal(lutColCurve, Cr);
 }
 class Saturcurve : public ColorAppearance
 {
@@ -777,7 +800,7 @@ inline void Saturcurve::Apply (float& Sa) const
 
     assert (lutColCurve);
 
-    Sa = lutColCurve[Sa];
+    curves::setLutVal(lutColCurve, Sa);
 }
 
 class Colorfcurve : public ColorAppearance
@@ -792,7 +815,7 @@ inline void Colorfcurve::Apply (float& Cf) const
 
     assert (lutColCurve);
 
-    Cf = lutColCurve[Cf];
+    curves::setLutVal(lutColCurve, Cf);
 }
 
 
@@ -881,9 +904,9 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
 
     assert (lutToneCurve);
 
-    r = lutToneCurve[r];
-    g = lutToneCurve[g];
-    b = lutToneCurve[b];
+    curves::setLutVal(lutToneCurve, r);
+    curves::setLutVal(lutToneCurve, g);
+    curves::setLutVal(lutToneCurve, b);
 }
 
 inline void StandardToneCurve::BatchApply(
@@ -910,27 +933,36 @@ inline void StandardToneCurve::BatchApply(
             break;
 #endif
         }
-        r[i] = lutToneCurve[r[i]];
-        g[i] = lutToneCurve[g[i]];
-        b[i] = lutToneCurve[b[i]];
+        curves::setLutVal(lutToneCurve, r[i]);
+        curves::setLutVal(lutToneCurve, g[i]);
+        curves::setLutVal(lutToneCurve, b[i]);
         i++;
     }
 
 #ifdef __SSE2__
+    float tmpr[4];
+    float tmpg[4];
+    float tmpb[4];
+    float mv = lutToneCurve[MAXVALF];
     for (; i + 3 < end; i += 4) {
         __m128 r_val = LVF(r[i]);
         __m128 g_val = LVF(g[i]);
         __m128 b_val = LVF(b[i]);
-        STVF(r[i], lutToneCurve[r_val]);
-        STVF(g[i], lutToneCurve[g_val]);
-        STVF(b[i], lutToneCurve[b_val]);
+        STVF(tmpr[0], lutToneCurve[r_val]);
+        STVF(tmpg[0], lutToneCurve[g_val]);
+        STVF(tmpb[0], lutToneCurve[b_val]);
+        for (int j = 0; j < 4; ++j) {
+            curves::setLutVal(r[i+j], tmpr[j], mv);
+            curves::setLutVal(g[i+j], tmpg[j], mv);
+            curves::setLutVal(b[i+j], tmpb[j], mv);
+        }
     }
 
     // Remainder in non-SSE.
     for (; i < end; ++i) {
-        r[i] = lutToneCurve[r[i]];
-        g[i] = lutToneCurve[g[i]];
-        b[i] = lutToneCurve[b[i]];
+        curves::setLutVal(lutToneCurve, r[i]);
+        curves::setLutVal(lutToneCurve, g[i]);
+        curves::setLutVal(lutToneCurve, b[i]);
     }
 #endif
 }
@@ -938,10 +970,13 @@ inline void StandardToneCurve::BatchApply(
 // Tone curve according to Adobe's reference implementation
 // values in 0xffff space
 // inlined to make sure there will be no cache flush when used
-inline void AdobeToneCurve::Apply (float& r, float& g, float& b) const
+inline void AdobeToneCurve::Apply (float& ir, float& ig, float& ib) const
 {
 
     assert (lutToneCurve);
+    float r = CLIP(ir);
+    float g = CLIP(ig);
+    float b = CLIP(ib);
 
     if (r >= g) {
         if      (g > b) {
@@ -964,6 +999,10 @@ inline void AdobeToneCurve::Apply (float& r, float& g, float& b) const
             RGBTone (g, b, r);    // Case 7: g >= b >  r
         }
     }
+
+    setUnlessOOG(ir, r);
+    setUnlessOOG(ig, g);
+    setUnlessOOG(ib, b);
 }
 
 inline void AdobeToneCurve::RGBTone (float& r, float& g, float& b) const
@@ -976,10 +1015,14 @@ inline void AdobeToneCurve::RGBTone (float& r, float& g, float& b) const
 }
 
 // Modifying the Luminance channel only
-inline void LuminanceToneCurve::Apply(float &r, float &g, float &b) const
+inline void LuminanceToneCurve::Apply(float &ir, float &ig, float &ib) const
 {
     assert (lutToneCurve);
 
+    float r = CLIP(ir);
+    float g = CLIP(ig);
+    float b = CLIP(ib);
+
     float currLuminance = r * 0.2126729f + g * 0.7151521f + b * 0.0721750f;
     const float newLuminance = lutToneCurve[currLuminance];
     currLuminance = currLuminance == 0.f ? 0.00001f : currLuminance;
@@ -987,6 +1030,10 @@ inline void LuminanceToneCurve::Apply(float &r, float &g, float &b) const
     r = LIM<float>(r * coef, 0.f, 65535.f);
     g = LIM<float>(g * coef, 0.f, 65535.f);
     b = LIM<float>(b * coef, 0.f, 65535.f);
+
+    setUnlessOOG(ir, r);
+    setUnlessOOG(ig, g);
+    setUnlessOOG(ib, b);
 }
 
 inline float WeightedStdToneCurve::Triangle(float a, float a1, float b) const
@@ -1020,14 +1067,14 @@ inline vfloat WeightedStdToneCurve::Triangle(vfloat a, vfloat a1, vfloat b) cons
 
 // Tone curve modifying the value channel only, preserving hue and saturation
 // values in 0xffff space
-inline void WeightedStdToneCurve::Apply (float& r, float& g, float& b) const
+inline void WeightedStdToneCurve::Apply (float& ir, float& ig, float& ib) const
 {
 
     assert (lutToneCurve);
 
-    r = CLIP(r);
-    g = CLIP(g);
-    b = CLIP(b);
+    float r = CLIP(ir);
+    float g = CLIP(ig);
+    float b = CLIP(ib);
     float r1 = lutToneCurve[r];
     float g1 = Triangle(r, r1, g);
     float b1 = Triangle(r, r1, b);
@@ -1043,6 +1090,10 @@ inline void WeightedStdToneCurve::Apply (float& r, float& g, float& b) const
     r = CLIP<float>(r1 * 0.50f + r2 * 0.25f + r3 * 0.25f);
     g = CLIP<float>(g1 * 0.25f + g2 * 0.50f + g3 * 0.25f);
     b = CLIP<float>(b1 * 0.25f + b2 * 0.25f + b3 * 0.50f);
+
+    setUnlessOOG(ir, r);
+    setUnlessOOG(ig, g);
+    setUnlessOOG(ib, b);
 }
 
 inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t end, float *r, float *g, float *b) const {
@@ -1076,6 +1127,10 @@ inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t en
     const vfloat zd5v = F2V(0.5f);
     const vfloat zd25v = F2V(0.25f);
 
+    float tmpr[4];
+    float tmpg[4];
+    float tmpb[4];
+
     for (; i + 3 < end; i += 4) {
         vfloat r_val = LIMV(LVF(r[i]), ZEROV, c65535v);
         vfloat g_val = LIMV(LVF(g[i]), ZEROV, c65535v);
@@ -1092,9 +1147,14 @@ inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t en
         vfloat r3 = Triangle(b_val, b3, r_val);
         vfloat g3 = Triangle(b_val, b3, g_val);
 
-        STVF(r[i], LIMV(r1 * zd5v + r2 * zd25v + r3 * zd25v, ZEROV, c65535v));
-        STVF(g[i], LIMV(g1 * zd25v + g2 * zd5v + g3 * zd25v, ZEROV, c65535v));
-        STVF(b[i], LIMV(b1 * zd25v + b2 * zd25v + b3 * zd5v, ZEROV, c65535v));
+        STVF(tmpr[0], LIMV(r1 * zd5v + r2 * zd25v + r3 * zd25v, ZEROV, c65535v));
+        STVF(tmpg[0], LIMV(g1 * zd25v + g2 * zd5v + g3 * zd25v, ZEROV, c65535v));
+        STVF(tmpb[0], LIMV(b1 * zd25v + b2 * zd25v + b3 * zd5v, ZEROV, c65535v));
+        for (int j = 0; j < 4; ++j) {
+            setUnlessOOG(r[i+j], tmpr[j]);
+            setUnlessOOG(g[i+j], tmpg[j]);
+            setUnlessOOG(b[i+j], tmpb[j]);
+        }
     }
 
     // Remainder in non-SSE.
@@ -1106,14 +1166,14 @@ inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t en
 
 // Tone curve modifying the value channel only, preserving hue and saturation
 // values in 0xffff space
-inline void SatAndValueBlendingToneCurve::Apply (float& r, float& g, float& b) const
+inline void SatAndValueBlendingToneCurve::Apply (float& ir, float& ig, float& ib) const
 {
 
     assert (lutToneCurve);
 
-    r = CLIP(r);
-    g = CLIP(g);
-    b = CLIP(b);
+    float r = CLIP(ir);
+    float g = CLIP(ig);
+    float b = CLIP(ib);
 
     const float lum = (r + g + b) / 3.f;
     const float newLum = lutToneCurve[lum];
@@ -1137,6 +1197,10 @@ inline void SatAndValueBlendingToneCurve::Apply (float& r, float& g, float& b) c
         dV = v * coef;
     }
     Color::hsv2rgbdcp(h, s, v + dV, r, g, b);
+
+    setUnlessOOG(ir, r);
+    setUnlessOOG(ig, g);
+    setUnlessOOG(ib, b);
 }
 
 }
diff --git a/rtengine/dcp.cc b/rtengine/dcp.cc
index 82ea35f0f..1af26cf74 100644
--- a/rtengine/dcp.cc
+++ b/rtengine/dcp.cc
@@ -1184,13 +1184,17 @@ void DCPProfile::step2ApplyTile(float* rc, float* gc, float* bc, int width, int
                 }
 
                 // with looktable and tonecurve we need to clip
-                newr = FCLIP(newr);
-                newg = FCLIP(newg);
-                newb = FCLIP(newb);
+                // newr = FCLIP(newr);
+                // newg = FCLIP(newg);
+                // newb = FCLIP(newb);
 
                 if (as_in.data->apply_look_table) {
+                    float cnewr = FCLIP(newr);
+                    float cnewg = FCLIP(newg);
+                    float cnewb = FCLIP(newb);
+                    
                     float h, s, v;
-                    Color::rgb2hsvdcp(newr, newg, newb, h, s, v);
+                    Color::rgb2hsvdcp(cnewr, cnewg, cnewb, h, s, v);
 
                     hsdApply(look_info, look_table, h, s, v);
                     s = CLIP01(s);
@@ -1203,7 +1207,11 @@ void DCPProfile::step2ApplyTile(float* rc, float* gc, float* bc, int width, int
                         h -= 6.0f;
                     }
 
-                    Color::hsv2rgbdcp( h, s, v, newr, newg, newb);
+                    Color::hsv2rgbdcp( h, s, v, cnewr, cnewg, cnewb);
+
+                    setUnlessOOG(newr, cnewr);
+                    setUnlessOOG(newg, cnewg);
+                    setUnlessOOG(newb, cnewb);
                 }
 
                 if (as_in.data->use_tone_curve) {
diff --git a/rtengine/dirpyr_equalizer.cc b/rtengine/dirpyr_equalizer.cc
index 7d3ace3ff..69c01be8c 100644
--- a/rtengine/dirpyr_equalizer.cc
+++ b/rtengine/dirpyr_equalizer.cc
@@ -240,7 +240,7 @@ void ImProcFunctions :: dirpyr_equalizer(float ** src, float ** dst, int srcwidt
 
     for (int i = 0; i < srcheight; i++)
         for (int j = 0; j < srcwidth; j++) {
-            dst[i][j] = CLIP(buffer[i][j]);  // TODO: Really a clip necessary?
+            dst[i][j] = /*CLIP*/(buffer[i][j]);  // TODO: Really a clip necessary?
         }
 
 }
@@ -367,7 +367,7 @@ void ImProcFunctions :: dirpyr_equalizercam (CieImage *ncie, float ** src, float
         for (int i = 0; i < srcheight; i++)
             for (int j = 0; j < srcwidth; j++) {
                 if(ncie->J_p[i][j] > 8.f && ncie->J_p[i][j] < 92.f) {
-                    dst[i][j] = CLIP( buffer[i][j] );    // TODO: Really a clip necessary?
+                    dst[i][j] = /*CLIP*/( buffer[i][j] );    // TODO: Really a clip necessary?
                 } else {
                     dst[i][j] = src[i][j];
                 }
@@ -375,7 +375,7 @@ void ImProcFunctions :: dirpyr_equalizercam (CieImage *ncie, float ** src, float
     } else {
         for (int i = 0; i < srcheight; i++)
             for (int j = 0; j < srcwidth; j++) {
-                dst[i][j] = CLIP( buffer[i][j] );  // TODO: Really a clip necessary?
+                dst[i][j] = /*CLIP*/( buffer[i][j] );  // TODO: Really a clip necessary?
             }
     }
 }
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index d0fe626c2..d09f46a3a 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -119,7 +119,7 @@ inline void ImageDatas::convertTo(unsigned char src, unsigned short& dst) const
 template<>
 inline void ImageDatas::convertTo(float src, unsigned char& dst) const
 {
-    dst = uint16ToUint8Rounded(src);
+    dst = uint16ToUint8Rounded(CLIP(src)); // clamp the float before the 16->8 bit rounding helper; NOTE(review): CLIP's range is defined outside this patch, presumably [0, 65535] -- confirm
 }
 template<>
 inline void ImageDatas::convertTo(unsigned char src, float& dst) const
diff --git a/rtengine/imagefloat.cc b/rtengine/imagefloat.cc
index 0352c0b10..d1b5c994b 100644
--- a/rtengine/imagefloat.cc
+++ b/rtengine/imagefloat.cc
@@ -146,6 +146,9 @@ void Imagefloat::setScanline (int row, unsigned char* buffer, int bps, float *mi
     }
 }
 
+
+namespace rtengine { extern void filmlike_clip(float *r, float *g, float *b); }
+
 void Imagefloat::getScanline (int row, unsigned char* buffer, int bps)
 {
 
@@ -163,18 +166,24 @@ void Imagefloat::getScanline (int row, unsigned char* buffer, int bps)
             sbuffer[ix++] = g(row, i) / 65535.f;
             sbuffer[ix++] = b(row, i) / 65535.f;
         }
-    } else if (bps == 16) {
+    } else {
         unsigned short *sbuffer = (unsigned short *)buffer;
         for (int i = 0, ix = 0; i < width; i++) {
-            sbuffer[ix++] = CLIP(r(row, i));
-            sbuffer[ix++] = CLIP(g(row, i));
-            sbuffer[ix++] = CLIP(b(row, i));
-        }
-    } else if (bps == 8) {
-        for (int i = 0, ix = 0; i < width; i++) {
-            buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(r(row, i)));
-            buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(g(row, i)));
-            buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(b(row, i)));
+            float ri = r(row, i);
+            float gi = g(row, i);
+            float bi = b(row, i);
+            if (ri > 65535.f || gi > 65535.f || bi > 65535.f) {
+                filmlike_clip(&ri, &gi, &bi);
+            }
+            if (bps == 16) {
+                sbuffer[ix++] = CLIP(ri);
+                sbuffer[ix++] = CLIP(gi);
+                sbuffer[ix++] = CLIP(bi);
+            } else if (bps == 8) {
+                buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(ri));
+                buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(gi));
+                buffer[ix++] = rtengine::uint16ToUint8Rounded(CLIP(bi));
+            }
         }
     }
 }
@@ -238,6 +247,8 @@ void Imagefloat::getStdImage (ColorTemp ctemp, int tran, Imagefloat* image, Prev
     gm /= area;
     bm /= area;
 
+    const auto CLIP0 = [](float v) -> float { return std::max(v, 0.f); };
+
 #ifdef _OPENMP
     #pragma omp parallel
     {
@@ -270,9 +281,9 @@ void Imagefloat::getStdImage (ColorTemp ctemp, int tran, Imagefloat* image, Prev
                         continue;
                     }
 
-                    lineR[dst_x] = CLIP(rm2 * r(src_y, src_x));
-                    lineG[dst_x] = CLIP(gm2 * g(src_y, src_x));
-                    lineB[dst_x] = CLIP(bm2 * b(src_y, src_x));
+                    lineR[dst_x] = CLIP0(rm2 * r(src_y, src_x));
+                    lineG[dst_x] = CLIP0(gm2 * g(src_y, src_x));
+                    lineB[dst_x] = CLIP0(bm2 * b(src_y, src_x));
                 }
             } else {
                 // source image, first line of the current destination row
@@ -303,15 +314,15 @@ void Imagefloat::getStdImage (ColorTemp ctemp, int tran, Imagefloat* image, Prev
                     // convert back to gamma and clip
                     if (src_sub_width == skip && src_sub_height == skip) {
                         // Common case where the sub-region is complete
-                        lineR[dst_x] = CLIP(rm * rtot);
-                        lineG[dst_x] = CLIP(gm * gtot);
-                        lineB[dst_x] = CLIP(bm * btot);
+                        lineR[dst_x] = CLIP0(rm * rtot);
+                        lineG[dst_x] = CLIP0(gm * gtot);
+                        lineB[dst_x] = CLIP0(bm * btot);
                     } else {
                         // computing a special factor for this incomplete sub-region
                         float area = src_sub_width * src_sub_height;
-                        lineR[dst_x] = CLIP(rm2 * rtot / area);
-                        lineG[dst_x] = CLIP(gm2 * gtot / area);
-                        lineB[dst_x] = CLIP(bm2 * btot / area);
+                        lineR[dst_x] = CLIP0(rm2 * rtot / area);
+                        lineG[dst_x] = CLIP0(gm2 * gtot / area);
+                        lineB[dst_x] = CLIP0(bm2 * btot / area);
                     }
                 }
             }
@@ -357,9 +368,9 @@ Imagefloat::to8()
 
     for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
-            img8->r(h, w) = uint16ToUint8Rounded(r(h, w));
-            img8->g(h, w) = uint16ToUint8Rounded(g(h, w));
-            img8->b(h, w) = uint16ToUint8Rounded(b(h, w));
+            img8->r(h, w) = uint16ToUint8Rounded(CLIP(r(h, w)));
+            img8->g(h, w) = uint16ToUint8Rounded(CLIP(g(h, w)));
+            img8->b(h, w) = uint16ToUint8Rounded(CLIP(b(h, w)));
         }
     }
 
@@ -376,9 +387,9 @@ Imagefloat::to16()
 
     for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
-            img16->r(h, w) = r(h, w);
-            img16->g(h, w) = g(h, w);
-            img16->b(h, w) = b(h, w);
+            img16->r(h, w) = CLIP(r(h, w));
+            img16->g(h, w) = CLIP(g(h, w));
+            img16->b(h, w) = CLIP(b(h, w));
         }
     }
 
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index e7ff01f46..ced6727c6 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -50,10 +50,13 @@
 namespace {
 
 using namespace rtengine;
-// begin of helper function for rgbProc()
-void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize) {
 
-#ifdef __SSE2__
+
+// begin of helper function for rgbProc()
+void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize)
+{
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
     vfloat cr = F2V(0.299f);
     vfloat cg = F2V(0.587f);
     vfloat cb = F2V(0.114f);
@@ -61,8 +64,9 @@ void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float
 
     for (int i = istart, ti = 0; i < tH; i++, ti++) {
         int j = jstart, tj = 0;
-#ifdef __SSE2__
-        for (; j < tW - 3; j+=4, tj+=4) {
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+
+        for (; j < tW - 3; j += 4, tj += 4) {
 
             vfloat rv = LVF(rtemp[ti * tileSize + tj]);
             vfloat gv = LVF(gtemp[ti * tileSize + tj]);
@@ -75,7 +79,9 @@ void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float
             STVF(gtemp[ti * tileSize + tj], gv * tonefactorv);
             STVF(btemp[ti * tileSize + tj], bv * tonefactorv);
         }
+
 #endif
+
         for (; j < tW; j++, tj++) {
 
             float r = rtemp[ti * tileSize + tj];
@@ -92,17 +98,19 @@ void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float
     }
 }
 
-void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize, float exp_scale, float comp, float hlrange) {
+void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize, float exp_scale, float comp, float hlrange)
+{
 
-#ifdef __SSE2__
+#if defined( __SSE2__ ) && defined( __x86_64__ )
     vfloat threev = F2V(3.f);
     vfloat maxvalfv = F2V(MAXVALF);
 #endif
 
     for (int i = istart, ti = 0; i < tH; i++, ti++) {
         int j = jstart, tj = 0;
-#ifdef __SSE2__
-        for (; j < tW - 3; j+=4, tj+=4) {
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+
+        for (; j < tW - 3; j += 4, tj += 4) {
 
             vfloat rv = LVF(rtemp[ti * tileSize + tj]);
             vfloat gv = LVF(gtemp[ti * tileSize + tj]);
@@ -111,14 +119,15 @@ void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, flo
             //TODO: proper treatment of out-of-gamut colors
             //float tonefactor = hltonecurve[(0.299f*r+0.587f*g+0.114f*b)];
             vmask maxMask = vmaskf_ge(vmaxf(rv, vmaxf(gv, bv)), maxvalfv);
-            if(_mm_movemask_ps((vfloat)maxMask)) {
+
+            if (_mm_movemask_ps((vfloat)maxMask)) {
                 for (int k = 0; k < 4; ++k) {
                     float r = rtemp[ti * tileSize + tj + k];
                     float g = gtemp[ti * tileSize + tj + k];
                     float b = btemp[ti * tileSize + tj + k];
-                    float tonefactor = ((r < MAXVALF ? hltonecurve[r] : CurveFactory::hlcurve (exp_scale, comp, hlrange, r) ) +
-                                        (g < MAXVALF ? hltonecurve[g] : CurveFactory::hlcurve (exp_scale, comp, hlrange, g) ) +
-                                        (b < MAXVALF ? hltonecurve[b] : CurveFactory::hlcurve (exp_scale, comp, hlrange, b) ) ) / 3.0;
+                    float tonefactor = ((r < MAXVALF ? hltonecurve[r] : CurveFactory::hlcurve(exp_scale, comp, hlrange, r)) +
+                                        (g < MAXVALF ? hltonecurve[g] : CurveFactory::hlcurve(exp_scale, comp, hlrange, g)) +
+                                        (b < MAXVALF ? hltonecurve[b] : CurveFactory::hlcurve(exp_scale, comp, hlrange, b))) / 3.0;
 
                     // note: tonefactor includes exposure scaling, that is here exposure slider and highlight compression takes place
                     rtemp[ti * tileSize + tj + k] = r * tonefactor;
@@ -133,7 +142,9 @@ void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, flo
                 STVF(btemp[ti * tileSize + tj], bv * tonefactorv);
             }
         }
+
 #endif
+
         for (; j < tW; j++, tj++) {
 
             float r = rtemp[ti * tileSize + tj];
@@ -142,9 +153,9 @@ void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, flo
 
             //TODO: proper treatment of out-of-gamut colors
             //float tonefactor = hltonecurve[(0.299f*r+0.587f*g+0.114f*b)];
-            float tonefactor = ((r < MAXVALF ? hltonecurve[r] : CurveFactory::hlcurve (exp_scale, comp, hlrange, r) ) +
-                                (g < MAXVALF ? hltonecurve[g] : CurveFactory::hlcurve (exp_scale, comp, hlrange, g) ) +
-                                (b < MAXVALF ? hltonecurve[b] : CurveFactory::hlcurve (exp_scale, comp, hlrange, b) ) ) / 3.0;
+            float tonefactor = ((r < MAXVALF ? hltonecurve[r] : CurveFactory::hlcurve(exp_scale, comp, hlrange, r)) +
+                                (g < MAXVALF ? hltonecurve[g] : CurveFactory::hlcurve(exp_scale, comp, hlrange, g)) +
+                                (b < MAXVALF ? hltonecurve[b] : CurveFactory::hlcurve(exp_scale, comp, hlrange, b))) / 3.0;
 
             // note: tonefactor includes exposure scaling, that is here exposure slider and highlight compression takes place
             rtemp[ti * tileSize + tj] = r * tonefactor;
@@ -224,9 +235,6 @@ void customToneCurve(const ToneCurve &customToneCurve, ToneCurveParams::TcMode c
 
         for (int i = istart, ti = 0; i < tH; i++, ti++) {
             for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                rtemp[ti * tileSize + tj] = CLIP<float> (rtemp[ti * tileSize + tj]);
-                gtemp[ti * tileSize + tj] = CLIP<float> (gtemp[ti * tileSize + tj]);
-                btemp[ti * tileSize + tj] = CLIP<float> (btemp[ti * tileSize + tj]);
                 userToneCurve.Apply(rtemp[ti * tileSize + tj], gtemp[ti * tileSize + tj], btemp[ti * tileSize + tj]);
             }
         }
@@ -3260,7 +3268,7 @@ filmlike_clip_rgb_tone (float *r, float *g, float *b, const float L)
     *b = b_;
 }
 
-static void
+/*static*/ void
 filmlike_clip (float *r, float *g, float *b)
 {
     // This is Adobe's hue-stable film-like curve with a diagonal, ie only used for clipping. Can probably be further optimized.
@@ -3670,6 +3678,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
         }
 
         float out_rgbx[4 * TS] ALIGNED16; // Line buffer for CLUT
+        float clutr[TS] ALIGNED16;
+        float clutg[TS] ALIGNED16;
+        float clutb[TS] ALIGNED16;
 
         LUTu histToneCurveThr;
 
@@ -3760,9 +3771,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             filmlike_clip (&r, &g, &b);
                         }
 
-                        rtemp[ti * TS + tj] = r;
-                        gtemp[ti * TS + tj] = g;
-                        btemp[ti * TS + tj] = b;
+                        setUnlessOOG(rtemp[ti * TS + tj], r);
+                        setUnlessOOG(gtemp[ti * TS + tj], g);
+                        setUnlessOOG(btemp[ti * TS + tj], b);
                     }
                 }
 
@@ -3771,30 +3782,43 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                         for (int j = jstart, tj = 0; j < tW; j++, tj++) {
 
                             //brightness/contrast
-                            rtemp[ti * TS + tj] = tonecurve[ rtemp[ti * TS + tj] ];
-                            gtemp[ti * TS + tj] = tonecurve[ gtemp[ti * TS + tj] ];
-                            btemp[ti * TS + tj] = tonecurve[ btemp[ti * TS + tj] ];
+                            float r = tonecurve[ CLIP(rtemp[ti * TS + tj]) ];
+                            float g = tonecurve[ CLIP(gtemp[ti * TS + tj]) ];
+                            float b = tonecurve[ CLIP(btemp[ti * TS + tj]) ];
 
                             int y = CLIP<int> (lumimulf[0] * Color::gamma2curve[rtemp[ti * TS + tj]] + lumimulf[1] * Color::gamma2curve[gtemp[ti * TS + tj]] + lumimulf[2] * Color::gamma2curve[btemp[ti * TS + tj]]);
                             histToneCurveThr[y >> histToneCurveCompression]++;
+
+                            setUnlessOOG(rtemp[ti * TS + tj], r);
+                            setUnlessOOG(gtemp[ti * TS + tj], g);
+                            setUnlessOOG(btemp[ti * TS + tj], b);
                         }
                     }
                 } else {
+                    float tmpr[4];
+                    float tmpg[4];
+                    float tmpb[4];
+                    
                     for (int i = istart, ti = 0; i < tH; i++, ti++) {
                         int j = jstart, tj = 0;
 #ifdef __SSE2__
                         for (; j < tW - 3; j+=4, tj+=4) {
                             //brightness/contrast
-                            STVF(rtemp[ti * TS + tj], tonecurve(LVF(rtemp[ti * TS + tj])));
-                            STVF(gtemp[ti * TS + tj], tonecurve(LVF(gtemp[ti * TS + tj])));
-                            STVF(btemp[ti * TS + tj], tonecurve(LVF(btemp[ti * TS + tj])));
+                            STVF(tmpr[0], tonecurve(LVF(rtemp[ti * TS + tj])));
+                            STVF(tmpg[0], tonecurve(LVF(gtemp[ti * TS + tj])));
+                            STVF(tmpb[0], tonecurve(LVF(btemp[ti * TS + tj])));
+                            for (int k = 0; k < 4; ++k) {
+                                setUnlessOOG(rtemp[ti * TS + tj + k], tmpr[k]);
+                                setUnlessOOG(gtemp[ti * TS + tj + k], tmpg[k]);
+                                setUnlessOOG(btemp[ti * TS + tj + k], tmpb[k]);
+                            }
                         }
 #endif
                         for (; j < tW; j++, tj++) {
                             //brightness/contrast
-                            rtemp[ti * TS + tj] = tonecurve[rtemp[ti * TS + tj]];
-                            gtemp[ti * TS + tj] = tonecurve[gtemp[ti * TS + tj]];
-                            btemp[ti * TS + tj] = tonecurve[btemp[ti * TS + tj]];
+                            setUnlessOOG(rtemp[ti * TS + tj], tonecurve[rtemp[ti * TS + tj]]);
+                            setUnlessOOG(gtemp[ti * TS + tj], tonecurve[gtemp[ti * TS + tj]]);
+                            setUnlessOOG(btemp[ti * TS + tj], tonecurve[btemp[ti * TS + tj]]);
                         }
                     }
                 }
@@ -3842,17 +3866,17 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             for (int j = jstart, tj = 0; j < tW; j++, tj++) {
                                 // individual R tone curve
                                 if (rCurve) {
-                                    rtemp[ti * TS + tj] = rCurve[ rtemp[ti * TS + tj] ];
+                                    setUnlessOOG(rtemp[ti * TS + tj], rCurve[ rtemp[ti * TS + tj] ]);
                                 }
 
                                 // individual G tone curve
                                 if (gCurve) {
-                                    gtemp[ti * TS + tj] = gCurve[ gtemp[ti * TS + tj] ];
+                                    setUnlessOOG(gtemp[ti * TS + tj], gCurve[ gtemp[ti * TS + tj] ]);
                                 }
 
                                 // individual B tone curve
                                 if (bCurve) {
-                                    btemp[ti * TS + tj] = bCurve[ btemp[ti * TS + tj] ];
+                                    setUnlessOOG(btemp[ti * TS + tj], bCurve[ btemp[ti * TS + tj] ]);
                                 }
                             }
                         }
@@ -3919,18 +3943,22 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                     bool neg = false;
                                     bool more_rgb = false;
                                     //gamut control : Lab values are in gamut
-                                    Color::gamutLchonly (HH, sincosval, Lpro, Chpro, rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj], wip, highlight, 0.15f, 0.96f, neg, more_rgb);
+                                    Color::gamutLchonly (HH, sincosval, Lpro, Chpro, r, g, b, wip, highlight, 0.15f, 0.96f, neg, more_rgb);
 #else
                                     //gamut control : Lab values are in gamut
-                                    Color::gamutLchonly (HH, sincosval, Lpro, Chpro, rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj], wip, highlight, 0.15f, 0.96f);
+                                    Color::gamutLchonly (HH, sincosval, Lpro, Chpro, r, g, b, wip, highlight, 0.15f, 0.96f);
 #endif
                                     //end of gamut control
                                 } else {
                                     float x_, y_, z_;
                                     //calculate RGB with L_2 and old value of a and b
                                     Color::Lab2XYZ (L_2, a_1, b_1, x_, y_, z_) ;
-                                    Color::xyz2rgb (x_, y_, z_, rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj], wip);
+                                    Color::xyz2rgb (x_, y_, z_, r, g, b, wip);
                                 }
+
+                                setUnlessOOG(rtemp[ti * TS + tj], r);
+                                setUnlessOOG(gtemp[ti * TS + tj], g);
+                                setUnlessOOG(btemp[ti * TS + tj], b);
                             }
                         }
                     }
@@ -4081,9 +4109,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                     bo *= preserv;
                                 }
 
-                                rtemp[ti * TS + tj] = CLIP(ro);
-                                gtemp[ti * TS + tj] = CLIP(go);
-                                btemp[ti * TS + tj] = CLIP(bo);
+                                rtemp[ti * TS + tj] = /*CLIP*/(ro);
+                                gtemp[ti * TS + tj] = /*CLIP*/(go);
+                                btemp[ti * TS + tj] = /*CLIP*/(bo);
                             }
                         }
                     }
@@ -4137,9 +4165,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                     float b = btemp[ti * TS + tj];
                                     float ro, go, bo;
                                     labtoning (r, g, b, ro, go, bo, algm, metchrom, twoc, satLimit, satLimitOpacity, ctColorCurve, ctOpacityCurve, clToningcurve, cl2Toningcurve, iplow, iphigh, wp, wip);
-                                    rtemp[ti * TS + tj] = CLIP (ro); //I used CLIP because there is a little bug in gamutLchonly that return 65536.ii intead of 65535 ==> crash
-                                    gtemp[ti * TS + tj] = CLIP (go);
-                                    btemp[ti * TS + tj] = CLIP (bo);
+                                    rtemp[ti * TS + tj] = /*CLIP*/ (ro); //I used CLIP because there is a little bug in gamutLchonly that returns 65536.ii instead of 65535 ==> crash
+                                    gtemp[ti * TS + tj] = /*CLIP*/ (go);
+                                    btemp[ti * TS + tj] = /*CLIP*/ (bo);
                                 }
                             }
                         }
@@ -4386,28 +4414,32 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz (sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
                                 Color::xyz2rgb (x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);
 
-                                STVF (rtemp[ti * TS + tj], sourceR);
-                                STVF (gtemp[ti * TS + tj], sourceG);
-                                STVF (btemp[ti * TS + tj], sourceB);
+                                STVF (clutr[tj], sourceR);
+                                STVF (clutg[tj], sourceG);
+                                STVF (clutb[tj], sourceB);
                             }
 
 #endif
 
                             for (; j < tW; j++, tj++) {
-                                float &sourceR = rtemp[ti * TS + tj];
-                                float &sourceG = gtemp[ti * TS + tj];
-                                float &sourceB = btemp[ti * TS + tj];
+                                float sourceR = rtemp[ti * TS + tj];
+                                float sourceG = gtemp[ti * TS + tj];
+                                float sourceB = btemp[ti * TS + tj];
 
                                 float x, y, z;
                                 Color::rgbxyz ( sourceR, sourceG, sourceB, x, y, z, wprof );
-                                Color::xyz2rgb (x, y, z, sourceR, sourceG, sourceB, xyz2clut);
+                                Color::xyz2rgb (x, y, z, clutr[tj], clutg[tj], clutb[tj], xyz2clut);
                             }
+                        } else {
+                            memcpy(clutr, &rtemp[ti * TS], sizeof(float) * TS);
+                            memcpy(clutg, &gtemp[ti * TS], sizeof(float) * TS);
+                            memcpy(clutb, &btemp[ti * TS], sizeof(float) * TS);
                         }
 
                         for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            float &sourceR = rtemp[ti * TS + tj];
-                            float &sourceG = gtemp[ti * TS + tj];
-                            float &sourceB = btemp[ti * TS + tj];
+                            float &sourceR = clutr[tj];
+                            float &sourceG = clutg[tj];
+                            float &sourceB = clutb[tj];
 
                             // Apply gamma sRGB (default RT)
                             sourceR = Color::gamma_srgbclipped (sourceR);
@@ -4415,20 +4447,19 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             sourceB = Color::gamma_srgbclipped (sourceB);
                         }
 
-                        const std::size_t line_offset = ti * TS;
                         hald_clut->getRGB (
                             film_simulation_strength,
                             std::min (TS, tW - jstart),
-                            rtemp + line_offset,
-                            gtemp + line_offset,
-                            btemp + line_offset,
+                            clutr,
+                            clutg,
+                            clutb,
                             out_rgbx
                         );
 
                         for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            float &sourceR = rtemp[ti * TS + tj];
-                            float &sourceG = gtemp[ti * TS + tj];
-                            float &sourceB = btemp[ti * TS + tj];
+                            float &sourceR = clutr[tj];
+                            float &sourceG = clutg[tj];
+                            float &sourceB = clutb[tj];
 
                             // Apply inverse gamma sRGB
                             sourceR = Color::igamma_srgb (out_rgbx[tj * 4 + 0]);
@@ -4444,9 +4475,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
 #ifdef __SSE2__
 
                             for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVF (rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVF (gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVF (btemp[ti * TS + tj]);
+                                vfloat sourceR = LVF (clutr[tj]);
+                                vfloat sourceG = LVF (clutg[tj]);
+                                vfloat sourceB = LVF (clutb[tj]);
 
                                 vfloat x;
                                 vfloat y;
@@ -4454,23 +4485,31 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 Color::rgbxyz (sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
                                 Color::xyz2rgb (x, y, z, sourceR, sourceG, sourceB, v_xyz2work);
 
-                                STVF (rtemp[ti * TS + tj], sourceR);
-                                STVF (gtemp[ti * TS + tj], sourceG);
-                                STVF (btemp[ti * TS + tj], sourceB);
+                                STVF (clutr[tj], sourceR);
+                                STVF (clutg[tj], sourceG);
+                                STVF (clutb[tj], sourceB);
                             }
 
 #endif
 
                             for (; j < tW; j++, tj++) {
-                                float &sourceR = rtemp[ti * TS + tj];
-                                float &sourceG = gtemp[ti * TS + tj];
-                                float &sourceB = btemp[ti * TS + tj];
+                                float &sourceR = clutr[tj];
+                                float &sourceG = clutg[tj];
+                                float &sourceB = clutb[tj];
 
                                 float x, y, z;
                                 Color::rgbxyz (sourceR, sourceG, sourceB, x, y, z, clut2xyz);
                                 Color::xyz2rgb ( x, y, z, sourceR, sourceG, sourceB, wiprof );
                             }
                         }
+
+                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
+                            if (!OOG(rtemp[ti * TS + tj]) || !OOG(gtemp[ti * TS + tj]) || !OOG(btemp[ti * TS + tj])) {
+                                rtemp[ti * TS + tj] = clutr[tj];
+                                gtemp[ti * TS + tj] = clutg[tj];
+                                btemp[ti * TS + tj] = clutb[tj];
+                            }
+                        }
                     }
                 }
 
@@ -4598,7 +4637,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 for (int j = 0; j < tW; j++) {
 
                     //mix channel
-                    tmpImage->r (i, j) = tmpImage->g (i, j) = tmpImage->b (i, j) = CLIP ((bwr * tmpImage->r (i, j) + bwg * tmpImage->g (i, j) + bwb * tmpImage->b (i, j)) * kcorec);
+                    tmpImage->r (i, j) = tmpImage->g (i, j) = tmpImage->b (i, j) = /*CLIP*/ ((bwr * tmpImage->r (i, j) + bwg * tmpImage->g (i, j) + bwb * tmpImage->b (i, j)) * kcorec);
 
 #ifndef __SSE2__
 
@@ -4694,9 +4733,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 bo *= preserv;
                             }
 
-                            tmpImage->r(i, j) = CLIP(ro);
-                            tmpImage->g(i, j) = CLIP(go);
-                            tmpImage->b(i, j) = CLIP(bo);
+                            tmpImage->r(i, j) = /*CLIP*/(ro);
+                            tmpImage->g(i, j) = /*CLIP*/(go);
+                            tmpImage->b(i, j) = /*CLIP*/(bo);
                         }
                     }
                 }
@@ -4803,9 +4842,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                             float b = tmpImage->b (i, j);
                             float ro, bo, go;
                             labtoning (r, g, b, ro, go, bo, algm, metchrom,  twoc, satLimit, satLimitOpacity, ctColorCurve,  ctOpacityCurve, clToningcurve, cl2Toningcurve,  iplow, iphigh,  wp,  wip);
-                            tmpImage->r (i, j) = CLIP (ro);
-                            tmpImage->g (i, j) = CLIP (go);
-                            tmpImage->b (i, j) = CLIP (bo);
+                            tmpImage->r (i, j) = /*CLIP*/ (ro);
+                            tmpImage->g (i, j) = /*CLIP*/ (go);
+                            tmpImage->b (i, j) = /*CLIP*/ (bo);
 
                         }
                     }
@@ -5049,9 +5088,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             r += corr;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     {
@@ -5063,9 +5102,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             g += corr;
         }
 
-        r = CLIP(r);
-        b = CLIP(b);
-        g = CLIP(g);
+        // r = CLIP(r);
+        // b = CLIP(b);
+        // g = CLIP(g);
     }
 
 
@@ -5079,9 +5118,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             b += corr;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     // mid tones
@@ -5112,9 +5151,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             g -= 20000.f * RedM;
             b -= 20000.f * RedM;
         }
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     {
@@ -5129,9 +5168,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             g += 10000.f * GreenM;
             b -= 20000.f * GreenM;
         }
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     {
@@ -5146,9 +5185,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             g -= 20000.f * BlueM;
             b += 10000.f * BlueM;
         }
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     //high tones
@@ -5173,9 +5212,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             b -= corr;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     {
@@ -5188,9 +5227,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             b -= corr;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     {
@@ -5203,9 +5242,9 @@ void ImProcFunctions::toningsmh(float r, float g, float b, float &ro, float &go,
             g -= corr;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     ro = r;
@@ -5262,24 +5301,24 @@ void ImProcFunctions::toning2col (float r, float g, float b, float &ro, float &g
             b -= factor * krl;
         }
 
-        g = CLIP(g);
-        b = CLIP(b);
+        // g = CLIP(g);
+        // b = CLIP(b);
 
         if (kgl > 0.f) {
             r -= factor * kgl;
             b -= factor * kgl;
         }
 
-        r = CLIP(r);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // b = CLIP(b);
 
         if (kbl > 0.f) {
             r -= factor * kbl;
             g -= factor * kbl;
         }
 
-        r = CLIP(r);
-        g = CLIP(g);
+        // r = CLIP(r);
+        // g = CLIP(g);
     }
 
     //high tones
@@ -5306,9 +5345,9 @@ void ImProcFunctions::toning2col (float r, float g, float b, float &ro, float &g
         g += factor * (kgh > 0.f ? kgh : 0.f);
         b += factor * (kbh > 0.f ? kbh : 0.f);
 
-        r = CLIP(r);
-        g = CLIP(g);
-        b = CLIP(b);
+        // r = CLIP(r);
+        // g = CLIP(g);
+        // b = CLIP(b);
     }
 
     float preserv = 1.f;
@@ -5317,9 +5356,9 @@ void ImProcFunctions::toning2col (float r, float g, float b, float &ro, float &g
         preserv = lumbefore / lumafter;
     }
 
-    ro = CLIP(r * preserv);
-    go = CLIP(g * preserv);
-    bo = CLIP(b * preserv);
+    ro = /*CLIP*/(r * preserv);
+    go = /*CLIP*/(g * preserv);
+    bo = /*CLIP*/(b * preserv);
 }
 
 /**
diff --git a/rtengine/iplab2rgb.cc b/rtengine/iplab2rgb.cc
index 6979224a0..e56e63dea 100644
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -30,8 +30,66 @@
 namespace rtengine
 {
 
+extern void filmlike_clip(float *r, float *g, float *b);
+
+namespace {
+
+inline void clipLAB(float iL, float ia, float ib, float &oL, float &oa, float &ob, const float scale, const float wp[3][3], const float wip[3][3])
+{
+    if (iL < 0.f) {
+        oL = oa = ob = 0.f;
+    } else if (iL > 32768.f) {
+        
+        float X, Y, Z;
+        float r, g, b;
+        Color::Lab2XYZ(iL, ia, ib, X, Y, Z);
+        Color::xyz2rgb(X, Y, Z, r, g, b, wip);
+        filmlike_clip(&r, &g, &b);
+        Color::rgbxyz(r, g, b, X, Y, Z, wp);
+        Color::XYZ2Lab(X, Y, Z, oL, oa, ob);
+        oL /= scale;
+        oa /= scale;
+        ob /= scale;
+        
+        // oL = 32768.f / scale;
+        // oa = ob = 0.f;
+    } else {
+        oL = iL / scale;
+        oa = ia / scale;
+        ob = ib / scale;
+    }
+}
+
+
+inline void clipLAB(float iL, float ia, float ib, double &oL, double &oa, double &ob, const float scale, const float wp[3][3], const float wip[3][3])
+{
+    float tL, ta, tb;
+    clipLAB(iL, ia, ib, tL, ta, tb, scale, wp, wip);
+    oL = tL;
+    oa = ta;
+    ob = tb;
+}
+
+} // namespace
+
 extern const Settings* settings;
 
+#define DECLARE_WORKING_MATRICES_(space) \
+    TMatrix wprof = ICCStore::getInstance()->workingSpaceMatrix ( space ); \
+    const float wp[3][3] = {                                            \
+        {static_cast<float> (wprof[0][0]), static_cast<float> (wprof[0][1]), static_cast<float> (wprof[0][2])}, \
+        {static_cast<float> (wprof[1][0]), static_cast<float> (wprof[1][1]), static_cast<float> (wprof[1][2])}, \
+        {static_cast<float> (wprof[2][0]), static_cast<float> (wprof[2][1]), static_cast<float> (wprof[2][2])} \
+    };                                                                  \
+                                                                        \
+    TMatrix wiprof = ICCStore::getInstance()->workingSpaceInverseMatrix ( space ); \
+    const float wip[3][3] = {                                           \
+        {static_cast<float> (wiprof[0][0]), static_cast<float> (wiprof[0][1]), static_cast<float> (wiprof[0][2])}, \
+        {static_cast<float> (wiprof[1][0]), static_cast<float> (wiprof[1][1]), static_cast<float> (wiprof[1][2])}, \
+        {static_cast<float> (wiprof[2][0]), static_cast<float> (wiprof[2][1]), static_cast<float> (wiprof[2][2])} \
+    }
+    
+
 // Used in ImProcCoordinator::updatePreviewImage  (rtengine/improccoordinator.cc)
 //         Crop::update                           (rtengine/dcrop.cc)
 //         Thumbnail::processImage                (rtengine/rtthumbnail.cc)
@@ -40,6 +98,8 @@ extern const Settings* settings;
 // otherwise divide by 327.68, convert to xyz and apply the sRGB transform, before converting with gamma2curve
 void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
 {
+    DECLARE_WORKING_MATRICES_(params->icm.working);
+    
     if (monitorTransform) {
 
         int W = lab->W;
@@ -68,9 +128,8 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
                 float* rb = lab->b[i];
 
                 for (int j = 0; j < W; j++) {
-                    buffer[iy++] = rL[j] / 327.68f;
-                    buffer[iy++] = ra[j] / 327.68f;
-                    buffer[iy++] = rb[j] / 327.68f;
+                    clipLAB(rL[j], ra[j], rb[j], buffer[iy], buffer[iy+1], buffer[iy+2], 327.68f, wp, wip);
+                    iy += 3;
                 }
 
                 cmsDoTransform (monitorTransform, buffer, data + ix, W);
@@ -94,12 +153,14 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
 
             float R, G, B;
             float x_, y_, z_;
+            float L, a, b;
 
             for (int j = 0; j < W; ++j) {
 
                 //float L1=rL[j],a1=ra[j],b1=rb[j];//for testing
+                clipLAB(rL[j], ra[j], rb[j], L, a, b, 1.f, wp, wip);
 
-                Color::Lab2XYZ(rL[j], ra[j], rb[j], x_, y_, z_ );
+                Color::Lab2XYZ(L, a, b, x_, y_, z_ );
 
                 Color::xyz2srgb(x_, y_, z_, R, G, B);
 
@@ -124,6 +185,8 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
 // otherwise divide by 327.68, convert to xyz and apply the RGB transform, before converting with gamma2curve
 Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm, bool consider_histogram_settings)
 {
+    DECLARE_WORKING_MATRICES_(icm.working);
+    
     //gamutmap(lab);
 
     if (cx < 0) {
@@ -200,9 +263,8 @@ Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch,
                 float* rb = lab->b[i];
 
                 for (int j = cx; j < cx + cw; j++) {
-                    buffer[iy++] = rL[j] / 327.68f;
-                    buffer[iy++] = ra[j] / 327.68f;
-                    buffer[iy++] = rb[j] / 327.68f;
+                    clipLAB(rL[j], ra[j], rb[j], buffer[iy], buffer[iy+1], buffer[iy+2], 327.68f, wp, wip);
+                    iy += 3;
                 }
 
                 cmsDoTransform (hTransform, buffer, data + ix, cw);
@@ -230,8 +292,10 @@ Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch,
 
             float R, G, B;
             float x_, y_, z_;
+            float L, a, b;
 
             for (int j = cx; j < cx + cw; ++j) {
+                clipLAB(rL[j], ra[j], rb[j], L, a, b, 1.f, wp, wip);
-                Color::Lab2XYZ(rL[j], ra[j], rb[j], x_, y_, z_);
+                Color::Lab2XYZ(L, a, b, x_, y_, z_);
 
                 Color::xyz2rgb(x_, y_, z_, R, G, B, xyz_rgb);
@@ -330,9 +394,9 @@ Imagefloat* ImProcFunctions::lab2rgbOut (LabImage* lab, int cx, int cy, int cw,
 
                 Color::xyz2srgb(x_, y_, z_, R, G, B);
 
-                image->r(i - cy, j - cx) = Color::gamma2curve[CLIP(R)];
-                image->g(i - cy, j - cx) = Color::gamma2curve[CLIP(G)];
-                image->b(i - cy, j - cx) = Color::gamma2curve[CLIP(B)];
+                setUnlessOOG(image->r(i - cy, j - cx), Color::gamma2curve[CLIP(R)]);
+                setUnlessOOG(image->g(i - cy, j - cx), Color::gamma2curve[CLIP(G)]);
+                setUnlessOOG(image->b(i - cy, j - cx), Color::gamma2curve[CLIP(B)]);
             }
         }
     }
diff --git a/rtengine/ipresize.cc b/rtengine/ipresize.cc
index 03502e99b..f9ff94ef9 100644
--- a/rtengine/ipresize.cc
+++ b/rtengine/ipresize.cc
@@ -162,9 +162,9 @@ void ImProcFunctions::Lanczos (const Imagefloat* src, Imagefloat* dst, float sca
                     b += wh[k] * lb[jj];
                 }
 
-                dst->r (i, j) = CLIP (r);//static_cast<int> (r));
-                dst->g (i, j) = CLIP (g);//static_cast<int> (g));
-                dst->b (i, j) = CLIP (b);//static_cast<int> (b));
+                dst->r (i, j) = /*CLIP*/ (r);//static_cast<int> (r));
+                dst->g (i, j) = /*CLIP*/ (g);//static_cast<int> (g));
+                dst->b (i, j) = /*CLIP*/ (b);//static_cast<int> (b));
             }
         }
 
diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index ca93619ee..1918439ef 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -137,4 +137,20 @@ constexpr std::uint8_t uint16ToUint8Rounded(std::uint16_t i)
     return ((i + 128) - ((i + 128) >> 8)) >> 8;
 }
 
+
+template <typename T>
+constexpr bool OOG(const T &val, const T &high=T(MAXVAL))
+{
+    return (val > high);
+}
+
+template <typename T>
+void setUnlessOOG(T &out, const T &val)
+{
+    if (!OOG(out)) {
+        out = val;
+    }
+}
+
+
 }
diff --git a/rtengine/rtthumbnail.cc b/rtengine/rtthumbnail.cc
index 05d4bead8..cd4d414ea 100644
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@@ -1038,11 +1038,11 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
 
         for (int j = 0; j < rwidth; j++) {
             float red = baseImg->r (i, j) * rmi;
-            baseImg->r (i, j) = CLIP (red);
+            baseImg->r (i, j) = /*CLIP*/ (red);
             float green = baseImg->g (i, j) * gmi;
-            baseImg->g (i, j) = CLIP (green);
+            baseImg->g (i, j) = /*CLIP*/ (green);
             float blue = baseImg->b (i, j) * bmi;
-            baseImg->b (i, j) = CLIP (blue);
+            baseImg->b (i, j) = /*CLIP*/ (blue);
 
         }
     }
@@ -1234,6 +1234,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
             }
     }
 
+    
     // luminance processing
 //  ipf.EPDToneMap(labView,0,6);
 
@@ -1303,7 +1304,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
         ipf.ciecam_02float (cieView, adap, 1, 2, labView, &params, customColCurve1, customColCurve2, customColCurve3, dummy, dummy, CAMBrightCurveJ, CAMBrightCurveQ, CAMMean, 5, sk, execsharp, d, dj, yb, rtt);
         delete cieView;
     }
-
+    
     // color processing
     //ipf.colorCurve (labView, labView);
 
diff --git a/rtgui/editorpanel.cc b/rtgui/editorpanel.cc
index 278b14fe0..2f88e0c11 100644
--- a/rtgui/editorpanel.cc
+++ b/rtgui/editorpanel.cc
@@ -1978,6 +1978,11 @@ bool EditorPanel::saveImmediately (const Glib::ustring &filename, const SaveForm
 {
     rtengine::procparams::ProcParams pparams;
     ipc->getParams (&pparams);
+
+    if (gimpPlugin) {
+        pparams.icm.gamma = "linear_g1.0";
+    }
+
     rtengine::ProcessingJob *job = rtengine::ProcessingJob::create (ipc->getInitialImage(), pparams);
 
     // save immediately
@@ -1985,7 +1990,9 @@ bool EditorPanel::saveImmediately (const Glib::ustring &filename, const SaveForm
 
     int err = 0;
 
-    if (sf.format == "tif") {
+    if (gimpPlugin) {
+        err = img->saveAsTIFF (filename, 32, true);
+    } else if (sf.format == "tif") {
         err = img->saveAsTIFF (filename, sf.tiffBits, sf.tiffUncompressed);
     } else if (sf.format == "png") {
         err = img->saveAsPNG (filename, sf.pngBits);
diff --git a/rtgui/main.cc b/rtgui/main.cc
index 5c2296b58..9b2cf64ae 100644
--- a/rtgui/main.cc
+++ b/rtgui/main.cc
@@ -674,15 +674,8 @@ int main (int argc, char **argv)
             m.run (*rtWindow);
             gdk_threads_leave();
 
-            if (gimpPlugin &&
-                    rtWindow->epanel && rtWindow->epanel->isRealized()) {
-                SaveFormat sf;
-                sf.format = "tif";
-                sf.tiffBits = 16;
-                sf.tiffUncompressed = true;
-                sf.saveParams = true;
-
-                if (!rtWindow->epanel->saveImmediately (argv2, sf)) {
+            if (gimpPlugin && rtWindow->epanel && rtWindow->epanel->isRealized()) {
+                if (!rtWindow->epanel->saveImmediately(argv2, SaveFormat())) {
                     ret = -2;
                 }
             }