diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 22ad77e63..2fa589110 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1252,7 +1252,7 @@ float* RawImageSource::CA_correct_RT(
                         vfloat factors = oldvals / newvals;
                         factors = vself(vmaskf_le(newvals, onev), onev, factors);
                         factors = vself(vmaskf_le(oldvals, onev), onev, factors);
-                        STVFU((*nonGreen)[i/2][j/2], LIMV(factors, zd5v, twov));
+                        STVFU((*nonGreen)[i/2][j/2], vclampf(factors, zd5v, twov));
                     }
 #endif
                     for (; j < W - 2 * cb; j += 2) {
diff --git a/rtengine/EdgePreservingDecomposition.cc b/rtengine/EdgePreservingDecomposition.cc
index 4c56cbb5d..f71e0043e 100644
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@@ -731,7 +731,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
                 gxv = (LVFU(rg[x + 1]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + w]));
                 gyv = (LVFU(rg[x + w]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + 1]));
                 //Apply power to the magnitude of the gradient to get the edge stopping function.
-                _mm_storeu_ps( &a[x + w * y], Scalev * pow_F((zd5v * _mm_sqrt_ps(gxv * gxv + gyv * gyv + sqrepsv)), EdgeStoppingv) );
+                _mm_storeu_ps( &a[x + w * y], Scalev * pow_F((zd5v * vsqrtf(gxv * gxv + gyv * gyv + sqrepsv)), EdgeStoppingv) );
             }
 
             for(; x < w1; x++) {
diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 7c6b8159d..d9408f17a 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -3512,7 +3512,7 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
                         aNv = LVFU(acalc[i >> 1][j >> 1]);
                         bNv = LVFU(bcalc[i >> 1][j >> 1]);
                         _mm_storeu_ps(&noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv));
-                        _mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], _mm_max_ps(c100v, _mm_sqrt_ps(SQRV(aNv) + SQRV(bNv))));
+                        _mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)),c100v));
                     }
 
                     for (; j < tileright; j += 2) {
diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index d2f758689..de668cca8 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -320,7 +320,7 @@ public:
 
         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -352,7 +352,7 @@ public:
 
         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -372,7 +372,7 @@ public:
         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
 
-        vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes);
+        vfloat diff = vclampf(indexv, ZEROV, sizev) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN
         return vintpf(diff, upper, lower);
     }
 
@@ -383,7 +383,7 @@ public:
 
         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -420,7 +420,8 @@ public:
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
     vfloat operator[](vint idxv) const
     {
-        vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
+        // convert to float because SSE2 has no min/max for 32bit integers
+        vfloat tempv = vclampf(_mm_cvtepi32_ps(idxv), ZEROV, sizev); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int)
         idxv = _mm_cvttps_epi32(tempv);
         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
diff --git a/rtengine/ciecam02.cc b/rtengine/ciecam02.cc
index f9475eb4e..c5e172478 100644
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@@ -542,9 +542,9 @@ void Ciecam02::xyz2jchqms_ciecam02float ( vfloat &J, vfloat &C, vfloat &h, vfloa
 
     cat02_to_hpefloat ( rp, gp, bp, rc, gc, bc);
     //gamut correction M.H.Brill S.Susstrunk
-    rp = _mm_max_ps (rp, ZEROV);
-    gp = _mm_max_ps (gp, ZEROV);
-    bp = _mm_max_ps (bp, ZEROV);
+    rp = vmaxf (rp, ZEROV);
+    gp = vmaxf (gp, ZEROV);
+    bp = vmaxf (bp, ZEROV);
     rpa = nonlinear_adaptationfloat ( rp, fl );
     gpa = nonlinear_adaptationfloat ( gp, fl );
     bpa = nonlinear_adaptationfloat ( bp, fl );
@@ -559,20 +559,20 @@ void Ciecam02::xyz2jchqms_ciecam02float ( vfloat &J, vfloat &C, vfloat &h, vfloa
     myh = vself (vmaskf_lt (myh, ZEROV), temp, myh);
 
     a = ((rpa + rpa) + gpa + (F2V (0.05f) * bpa) - F2V (0.305f)) * nbb;
-    a = _mm_max_ps (a, ZEROV);  //gamut correction M.H.Brill S.Susstrunk
+    a = vmaxf (a, ZEROV);  //gamut correction M.H.Brill S.Susstrunk
 
     J = pow_F ( a / aw, c * cz * F2V (0.5f));
 
     e = ((F2V (961.53846f)) * nc * ncb) * (xcosf ( myh + F2V (2.0f) ) + F2V (3.8f));
-    t = (e * _mm_sqrt_ps ( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V (1.05f) * bpa));
+    t = (e * vsqrtf ( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V (1.05f) * bpa));
 
     C = pow_F ( t, F2V (0.9f) ) * J * pow1;
 
     Q = wh * J;
     J *= J * F2V (100.0f);
     M = C * pfl;
-    Q = _mm_max_ps (Q, F2V (0.0001f)); // avoid division by zero
-    s = F2V (100.0f) * _mm_sqrt_ps ( M / Q );
+    Q = vmaxf (Q, F2V (0.0001f)); // avoid division by zero
+    s = F2V (100.0f) * vsqrtf ( M / Q );
     h = (myh * F2V (180.f)) / F2V (rtengine::RT_PI);
 }
 #endif
@@ -710,7 +710,7 @@ void Ciecam02::jch2xyz_ciecam02float ( vfloat &x, vfloat &y, vfloat &z, vfloat J
     xyz_to_cat02float ( rw, gw, bw, xw, yw, zw);
     e = ((F2V (961.53846f)) * nc * ncb) * (xcosf ( ((h * F2V (rtengine::RT_PI)) / F2V (180.0f)) + F2V (2.0f) ) + F2V (3.8f));
     a = pow_F ( J / F2V (100.0f), reccmcz ) * aw;
-    t = pow_F ( F2V (10.f) * C / (_mm_sqrt_ps ( J ) * pow1), F2V (1.1111111f) );
+    t = pow_F ( F2V (10.f) * C / (vsqrtf ( J ) * pow1), F2V (1.1111111f) );
 
     calculate_abfloat ( ca, cb, h, e, t, nbb, a );
     Aab_to_rgbfloat ( rpa, gpa, bpa, a, ca, cb, nbb );
@@ -780,7 +780,7 @@ vfloat Ciecam02::inverse_nonlinear_adaptationfloat ( vfloat c, vfloat fl )
     c -= F2V (0.1f);
     fl = vmulsignf (fl, c);
     c = vabsf (c);
-    c = _mm_min_ps ( c, F2V (399.99f));
+    c = vminf ( c, F2V (399.99f));
     return (F2V (100.0f) / fl) * pow_F ( (F2V (27.13f) * c) / (F2V (400.0f) - c), F2V (2.38095238f) );
 }
 #endif
diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 87ce25d97..1a425d21b 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -226,7 +226,7 @@ void rtengine::HaldCLUT::getRGB(
 #else
         const vfloat v_in = _mm_set_ps(0.0f, *b, *g, *r);
         const vfloat v_tmp = v_in * F2V(flevel_minus_one);
-        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(F2V(flevel_minus_two), v_tmp)));
+        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(vminf(v_tmp, F2V(flevel_minus_two))));
 
         size_t index = color * 4;
 
diff --git a/rtengine/color.cc b/rtengine/color.cc
index b217b33c2..b0122c2f0 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -564,8 +564,8 @@ void Color::rgb2hsl(float r, float g, float b, float &h, float &s, float &l)
 #ifdef __SSE2__
 void Color::rgb2hsl(vfloat r, vfloat g, vfloat b, vfloat &h, vfloat &s, vfloat &l)
 {
-    vfloat maxv = _mm_max_ps(r, _mm_max_ps(g, b));
-    vfloat minv = _mm_min_ps(r, _mm_min_ps(g, b));
+    vfloat maxv = vmaxf(r, vmaxf(g, b));
+    vfloat minv = vminf(r, vminf(g, b));
     vfloat C = maxv - minv;
     vfloat tempv = maxv + minv;
     l = (tempv) * F2V(7.6295109e-6f);
@@ -2901,7 +2901,7 @@ void Color::LabGamutMunsell(float *labL, float *laba, float *labb, const int N,
         av = LVFU(laba[k]);
         bv = LVFU(labb[k]);
         _mm_storeu_ps(&HHBuffer[k], xatan2f(bv, av));
-        _mm_storeu_ps(&CCBuffer[k], _mm_sqrt_ps(SQRV(av) + SQRV(bv)) / c327d68v);
+        _mm_storeu_ps(&CCBuffer[k], vsqrtf(SQRV(av) + SQRV(bv)) / c327d68v);
     }
 
     for (; k < N; k++) {
diff --git a/rtengine/curves.h b/rtengine/curves.h
index 0a0c1d2c3..248bfe487 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -1295,9 +1295,9 @@ inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t en
     float tmpb[4] ALIGNED16;
 
     for (; i + 3 < end; i += 4) {
-        vfloat r_val = LIMV(LVF(r[i]), ZEROV, c65535v);
-        vfloat g_val = LIMV(LVF(g[i]), ZEROV, c65535v);
-        vfloat b_val = LIMV(LVF(b[i]), ZEROV, c65535v);
+        vfloat r_val = vclampf(LVF(r[i]), ZEROV, c65535v);
+        vfloat g_val = vclampf(LVF(g[i]), ZEROV, c65535v);
+        vfloat b_val = vclampf(LVF(b[i]), ZEROV, c65535v);
         vfloat r1 = lutToneCurve[r_val];
         vfloat g1 = Triangle(r_val, r1, g_val);
         vfloat b1 = Triangle(r_val, r1, b_val);
@@ -1310,9 +1310,9 @@ inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t en
         vfloat r3 = Triangle(b_val, b3, r_val);
         vfloat g3 = Triangle(b_val, b3, g_val);
 
-        STVF(tmpr[0], LIMV(r1 * zd5v + r2 * zd25v + r3 * zd25v, ZEROV, c65535v));
-        STVF(tmpg[0], LIMV(g1 * zd25v + g2 * zd5v + g3 * zd25v, ZEROV, c65535v));
-        STVF(tmpb[0], LIMV(b1 * zd25v + b2 * zd25v + b3 * zd5v, ZEROV, c65535v));
+        STVF(tmpr[0], vclampf(r1 * zd5v + r2 * zd25v + r3 * zd25v, ZEROV, c65535v));
+        STVF(tmpg[0], vclampf(g1 * zd25v + g2 * zd5v + g3 * zd25v, ZEROV, c65535v));
+        STVF(tmpb[0], vclampf(b1 * zd25v + b2 * zd25v + b3 * zd5v, ZEROV, c65535v));
         for (int j = 0; j < 4; ++j) {
             setUnlessOOG(r[i+j], g[i+j], b[i+j], tmpr[j], tmpg[j], tmpb[j]);
         }
diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc
index 10ef0dd0e..a324a7ca6 100644
--- a/rtengine/demosaic_algos.cc
+++ b/rtengine/demosaic_algos.cc
@@ -1399,7 +1399,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, array2D<float> &r
 // Adapted to RawTherapee by Jacques Desmis 3/2013
 // SSE version by Ingo Weyrich 5/2013
 #ifdef __SSE2__
-#define CLIPV(a) LIMV(a,zerov,c65535v)
+#define CLIPV(a) vclampf(a,zerov,c65535v)
 void RawImageSource::igv_interpolate(int winw, int winh)
 {
     static const float eps = 1e-5f, epssq = 1e-5f; //mod epssq -10f =>-5f Jacques 3/2013 to prevent artifact (divide by zero)
@@ -1513,10 +1513,10 @@ void RawImageSource::igv_interpolate(int winw, int winh)
                 //N,E,W,S Hamilton Adams Interpolation
                 // (48.f * 65535.f) = 3145680.f
                 tempv = c40v * LVFU(rgb[0][indx1]);
-                nvv = LIMV(((c23v * LVFU(rgb[1][(indx - v1) >> 1]) + c23v * LVFU(rgb[1][(indx - v3) >> 1]) + LVFU(rgb[1][(indx - v5) >> 1]) + LVFU(rgb[1][(indx + v1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 - v1)]) - c8v * LVFU(rgb[0][(indx1 - v2)]))) / c3145680v, zerov, onev);
-                evv = LIMV(((c23v * LVFU(rgb[1][(indx + h1) >> 1]) + c23v * LVFU(rgb[1][(indx + h3) >> 1]) + LVFU(rgb[1][(indx + h5) >> 1]) + LVFU(rgb[1][(indx - h1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 + h1)]) - c8v * LVFU(rgb[0][(indx1 + h2)]))) / c3145680v, zerov, onev);
-                wvv = LIMV(((c23v * LVFU(rgb[1][(indx - h1) >> 1]) + c23v * LVFU(rgb[1][(indx - h3) >> 1]) + LVFU(rgb[1][(indx - h5) >> 1]) + LVFU(rgb[1][(indx + h1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 - h1)]) - c8v * LVFU(rgb[0][(indx1 - h2)]))) / c3145680v, zerov, onev);
-                svv = LIMV(((c23v * LVFU(rgb[1][(indx + v1) >> 1]) + c23v * LVFU(rgb[1][(indx + v3) >> 1]) + LVFU(rgb[1][(indx + v5) >> 1]) + LVFU(rgb[1][(indx - v1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 + v1)]) - c8v * LVFU(rgb[0][(indx1 + v2)]))) / c3145680v, zerov, onev);
+                nvv = vclampf(((c23v * LVFU(rgb[1][(indx - v1) >> 1]) + c23v * LVFU(rgb[1][(indx - v3) >> 1]) + LVFU(rgb[1][(indx - v5) >> 1]) + LVFU(rgb[1][(indx + v1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 - v1)]) - c8v * LVFU(rgb[0][(indx1 - v2)]))) / c3145680v, zerov, onev);
+                evv = vclampf(((c23v * LVFU(rgb[1][(indx + h1) >> 1]) + c23v * LVFU(rgb[1][(indx + h3) >> 1]) + LVFU(rgb[1][(indx + h5) >> 1]) + LVFU(rgb[1][(indx - h1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 + h1)]) - c8v * LVFU(rgb[0][(indx1 + h2)]))) / c3145680v, zerov, onev);
+                wvv = vclampf(((c23v * LVFU(rgb[1][(indx - h1) >> 1]) + c23v * LVFU(rgb[1][(indx - h3) >> 1]) + LVFU(rgb[1][(indx - h5) >> 1]) + LVFU(rgb[1][(indx + h1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 - h1)]) - c8v * LVFU(rgb[0][(indx1 - h2)]))) / c3145680v, zerov, onev);
+                svv = vclampf(((c23v * LVFU(rgb[1][(indx + v1) >> 1]) + c23v * LVFU(rgb[1][(indx + v3) >> 1]) + LVFU(rgb[1][(indx + v5) >> 1]) + LVFU(rgb[1][(indx - v1) >> 1]) + tempv - c32v * LVFU(rgb[0][(indx1 + v1)]) - c8v * LVFU(rgb[0][(indx1 + v2)]))) / c3145680v, zerov, onev);
                 //Horizontal and vertical color differences
                 tempv = LVFU( rgb[0][indx1] ) / c65535v;
                 _mm_storeu_ps( &vdif[indx1], (sgv * nvv + ngv * svv) / (ngv + sgv) - tempv );
@@ -1561,9 +1561,9 @@ void RawImageSource::igv_interpolate(int winw, int winh)
             for (col = 7 + (FC(row, 1) & 1), indx1 = (row * width + col) >> 1, d = FC(row, col) / 2; col < width - 14; col += 8, indx1 += 4) {
                 //H&V integrated gaussian vector over variance on color differences
                 //Mod Jacques 3/2013
-                ngv = LIMV(epssqv + c78v * SQRV(LVFU(vdif[indx1])) + c69v * (SQRV(LVFU(vdif[indx1 - v1])) + SQRV(LVFU(vdif[indx1 + v1]))) + c51v * (SQRV(LVFU(vdif[indx1 - v2])) + SQRV(LVFU(vdif[indx1 + v2]))) + c21v * (SQRV(LVFU(vdif[indx1 - v3])) + SQRV(LVFU(vdif[indx1 + v3]))) - c6v * SQRV(LVFU(vdif[indx1 - v1]) + LVFU(vdif[indx1]) + LVFU(vdif[indx1 + v1]))
+                ngv = vclampf(epssqv + c78v * SQRV(LVFU(vdif[indx1])) + c69v * (SQRV(LVFU(vdif[indx1 - v1])) + SQRV(LVFU(vdif[indx1 + v1]))) + c51v * (SQRV(LVFU(vdif[indx1 - v2])) + SQRV(LVFU(vdif[indx1 + v2]))) + c21v * (SQRV(LVFU(vdif[indx1 - v3])) + SQRV(LVFU(vdif[indx1 + v3]))) - c6v * SQRV(LVFU(vdif[indx1 - v1]) + LVFU(vdif[indx1]) + LVFU(vdif[indx1 + v1]))
                            - c10v * (SQRV(LVFU(vdif[indx1 - v2]) + LVFU(vdif[indx1 - v1]) + LVFU(vdif[indx1])) + SQRV(LVFU(vdif[indx1]) + LVFU(vdif[indx1 + v1]) + LVFU(vdif[indx1 + v2]))) - c7v * (SQRV(LVFU(vdif[indx1 - v3]) + LVFU(vdif[indx1 - v2]) + LVFU(vdif[indx1 - v1])) + SQRV(LVFU(vdif[indx1 + v1]) + LVFU(vdif[indx1 + v2]) + LVFU(vdif[indx1 + v3]))), zerov, onev);
-                egv = LIMV(epssqv + c78v * SQRV(LVFU(hdif[indx1])) + c69v * (SQRV(LVFU(hdif[indx1 - h1])) + SQRV(LVFU(hdif[indx1 + h1]))) + c51v * (SQRV(LVFU(hdif[indx1 - h2])) + SQRV(LVFU(hdif[indx1 + h2]))) + c21v * (SQRV(LVFU(hdif[indx1 - h3])) + SQRV(LVFU(hdif[indx1 + h3]))) - c6v * SQRV(LVFU(hdif[indx1 - h1]) + LVFU(hdif[indx1]) + LVFU(hdif[indx1 + h1]))
+                egv = vclampf(epssqv + c78v * SQRV(LVFU(hdif[indx1])) + c69v * (SQRV(LVFU(hdif[indx1 - h1])) + SQRV(LVFU(hdif[indx1 + h1]))) + c51v * (SQRV(LVFU(hdif[indx1 - h2])) + SQRV(LVFU(hdif[indx1 + h2]))) + c21v * (SQRV(LVFU(hdif[indx1 - h3])) + SQRV(LVFU(hdif[indx1 + h3]))) - c6v * SQRV(LVFU(hdif[indx1 - h1]) + LVFU(hdif[indx1]) + LVFU(hdif[indx1 + h1]))
                            - c10v * (SQRV(LVFU(hdif[indx1 - h2]) + LVFU(hdif[indx1 - h1]) + LVFU(hdif[indx1])) + SQRV(LVFU(hdif[indx1]) + LVFU(hdif[indx1 + h1]) + LVFU(hdif[indx1 + h2]))) - c7v * (SQRV(LVFU(hdif[indx1 - h3]) + LVFU(hdif[indx1 - h2]) + LVFU(hdif[indx1 - h1])) + SQRV(LVFU(hdif[indx1 + h1]) + LVFU(hdif[indx1 + h2]) + LVFU(hdif[indx1 + h3]))), zerov, onev);
                 //Limit chrominance using H/V neighbourhood
                 nvv = median(d725v * LVFU(vdif[indx1]) + d1375v * LVFU(vdif[indx1 - v1]) + d1375v * LVFU(vdif[indx1 + v1]), LVFU(vdif[indx1 - v1]), LVFU(vdif[indx1 + v1]));
@@ -2114,7 +2114,7 @@ void RawImageSource::nodemosaic(bool bw)
 */
 
 #ifdef __SSE2__
-#define CLIPV(a) LIMV(a,ZEROV,c65535v)
+#define CLIPV(a) vclampf(a,ZEROV,c65535v)
 #endif
 void RawImageSource::refinement(int PassCount)
 {
diff --git a/rtengine/dirpyr_equalizer.cc b/rtengine/dirpyr_equalizer.cc
index 87ec53e13..2bad2fbb2 100644
--- a/rtengine/dirpyr_equalizer.cc
+++ b/rtengine/dirpyr_equalizer.cc
@@ -193,7 +193,7 @@ void ImProcFunctions :: dirpyr_equalizer(float ** src, float ** dst, int srcwidt
                 int j;
 
                 for (j = 0; j < srcwidth - 3; j += 4) {
-                    _mm_storeu_ps(&tmpChr[i][j], _mm_sqrt_ps(SQRV(LVFU(l_b[i][j])) + SQRV(LVFU(l_a[i][j]))) / div);
+                    _mm_storeu_ps(&tmpChr[i][j], vsqrtf(SQRV(LVFU(l_b[i][j])) + SQRV(LVFU(l_a[i][j]))) / div);
                 }
 
                 for (; j < srcwidth; j++) {
diff --git a/rtengine/fast_demo.cc b/rtengine/fast_demo.cc
index cab38b1f2..e88661485 100644
--- a/rtengine/fast_demo.cc
+++ b/rtengine/fast_demo.cc
@@ -364,7 +364,7 @@ void RawImageSource::fast_demosaic()
                         for (int j = left + 1, cc = 1; j < right - 1; j += 4, cc += 4) {
                             //interpolate B/R colors at R/B sites
                             _mm_storeu_ps(&bluetile[rr * TS + cc], LVFU(greentile[rr * TS + cc]) - zd25v * ((LVFU(greentile[(rr - 1)*TS + (cc - 1)]) + LVFU(greentile[(rr - 1)*TS + (cc + 1)]) + LVFU(greentile[(rr + 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc - 1])) -
-                                          _mm_min_ps(clip_ptv, LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]))));
+                                          vminf(LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]), clip_ptv)));
                         }
 
 #else
@@ -381,7 +381,7 @@ void RawImageSource::fast_demosaic()
                         for (int j = left + 1, cc = 1; j < right - 1; j += 4, cc += 4) {
                             //interpolate B/R colors at R/B sites
                             _mm_storeu_ps(&redtile[rr * TS + cc], LVFU(greentile[rr * TS + cc]) - zd25v * ((LVFU(greentile[(rr - 1)*TS + cc - 1]) + LVFU(greentile[(rr - 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc - 1])) -
-                                          _mm_min_ps(clip_ptv, LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]))));
+                                          vminf(LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]), clip_ptv)));
                         }
 
 #else
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 46af3aa89..74780cf48 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -157,10 +157,14 @@ static INLINE vfloat vsqrtf(vfloat x)
 }
 static INLINE vfloat vmaxf(vfloat x, vfloat y)
 {
+    // _mm_max_ps(x, y) returns y whenever either operand is NaN: a NaN in x is replaced by y, a NaN in y propagates
+    // the operand order is therefore significant, don't change the order of the parameters
     return _mm_max_ps(x, y);
 }
 static INLINE vfloat vminf(vfloat x, vfloat y)
 {
+    // _mm_min_ps(x, y) returns y whenever either operand is NaN: a NaN in x is replaced by y, a NaN in y propagates
+    // the operand order is therefore significant, don't change the order of the parameters
     return _mm_min_ps(x, y);
 }
 
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index c46685103..396d7176c 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -4466,7 +4466,7 @@ void ImProcFunctions::chromiLuminanceCurve(PipetteBuffer *pipetteBuffer, int pW,
                     av = LVFU(lold->a[i][k]);
                     bv = LVFU(lold->b[i][k]);
                     STVF(HHBuffer[k], xatan2f(bv, av));
-                    STVF(CCBuffer[k], _mm_sqrt_ps(SQRV(av) + SQRV(bv)) / c327d68v);
+                    STVF (CCBuffer[k], vsqrtf (SQRV (av) + SQRV (bv)) / c327d68v);
                 }
 
                 for (; k < W; k++) {
diff --git a/rtengine/iplabregions.cc b/rtengine/iplabregions.cc
index d5bbd6302..d2380494a 100644
--- a/rtengine/iplabregions.cc
+++ b/rtengine/iplabregions.cc
@@ -204,9 +204,9 @@ BENCHFUN
                 for (int i = 0; i < n; ++i) {
                     vfloat blendv = LVFU(abmask[i][y][x]);
                     vfloat sv = F2V(rs[i]);
-                    vfloat a_newv = LIMV(sv * (av + F2V(abca[i])), cm42000v, c42000v);
-                    vfloat b_newv = LIMV(sv * (bv + F2V(abcb[i])), cm42000v, c42000v);
-                    vfloat l_newv = LIMV(lv * F2V(rl[i]), ZEROV, c32768v);
+                    vfloat a_newv = vclampf(sv * (av + F2V(abca[i])), cm42000v, c42000v);
+                    vfloat b_newv = vclampf(sv * (bv + F2V(abcb[i])), cm42000v, c42000v);
+                    vfloat l_newv = vclampf(lv * F2V(rl[i]), ZEROV, c32768v);
                     lv = vintpf(LVFU(Lmask[i][y][x]), l_newv, lv);
                     av = vintpf(blendv, a_newv, av);
                     bv = vintpf(blendv, b_newv, bv);
diff --git a/rtengine/ipretinex.cc b/rtengine/ipretinex.cc
index bf36ab885..54d80774b 100644
--- a/rtengine/ipretinex.cc
+++ b/rtengine/ipretinex.cc
@@ -505,11 +505,11 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
 
                     if (useHslLin) {
                         for (; j < W_L - 3; j += 4) {
-                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv * (LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv)));
+                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  (vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
                         }
                     } else {
                         for (; j < W_L - 3; j += 4) {
-                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  xlogf(LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv)));
+                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  xlogf(vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
                         }
                     }
 
@@ -991,11 +991,11 @@ void ImProcFunctions::MSRLocal(int sp, float** luminance, float** templ, const f
 
                 if (useHslLin) { //keep in case of ??
                     for (; j < W_L - 3; j += 4) {
-                        _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv * (LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv)));
+                        _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  (vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
                     }
                 } else {//always Lab mode due to Wavelet
                     for (; j < W_L - 3; j += 4) {
-                        _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  xlogf(LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv)));
+                        _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  xlogf(vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
                     }
                 }
 
diff --git a/rtengine/ipwavelet.cc b/rtengine/ipwavelet.cc
index 568ff3683..414ae2d59 100644
--- a/rtengine/ipwavelet.cc
+++ b/rtengine/ipwavelet.cc
@@ -735,7 +735,7 @@ void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int kall, const
                         av = LVFU (lab->a[i][j]);
                         bv = LVFU (lab->b[i][j]);
                         huev = xatan2f (bv, av);
-                        chrov = _mm_sqrt_ps (SQRV (av) + SQRV (bv)) / c327d68v;
+                        chrov = vsqrtf(SQRV(av) + SQRV(bv)) / c327d68v;
                         _mm_storeu_ps (&varhue[i1][j1], huev);
                         _mm_storeu_ps (&varchro[i1][j1], chrov);
 
@@ -1103,7 +1103,7 @@ void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int kall, const
                                 bv = LVFU (labco->b[i1][col]);
                                 STVF (atan2Buffer[col], xatan2f (bv, av));
 
-                                cv = _mm_sqrt_ps (SQRV (av) + SQRV (bv));
+                                cv = vsqrtf(SQRV(av) + SQRV(bv));
                                 yv = av / cv;
                                 xv = bv / cv;
                                 xyMask = vmaskf_eq (zerov, cv);
@@ -1991,7 +1991,7 @@ void ImProcFunctions::WaveletAandBAllAB(wavelet_decomposition &WaveletCoeffs_a,
                     __m128 av = LVFU (WavCoeffs_a0[i * W_L + k]);
                     __m128 bv = LVFU (WavCoeffs_b0[i * W_L + k]);
                     __m128 huev = xatan2f (bv, av);
-                    __m128 chrv = _mm_sqrt_ps (SQRV (av) + SQRV (bv));
+                    __m128 chrv = vsqrtf(SQRV(av) + SQRV(bv));
                     STVF (huebuffer[k], huev);
                     STVF (chrbuffer[k], chrv);
                 }
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 5ad66b612..f0e5f1880 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -3112,8 +3112,8 @@ void RawImageSource::processFlatField(const RAWParams &raw, RawImage *riFlatFile
                 vfloat rowBlackv = blackv[row & 1];
 
                 for (; col < W - 3; col += 4) {
-                    vfloat linecorrv = SQRV(vmaxf(epsv, LVFU(cfablur[row * W + col]) - rowBlackv)) /
-                                       (vmaxf(epsv, LVFU(cfablur1[row * W + col]) - rowBlackv) * vmaxf(epsv, LVFU(cfablur2[row * W + col]) - rowBlackv));
+                    vfloat linecorrv = SQRV(vmaxf(LVFU(cfablur[row * W + col]) - rowBlackv, epsv)) /
+                                       (vmaxf(LVFU(cfablur1[row * W + col]) - rowBlackv, epsv) * vmaxf(LVFU(cfablur2[row * W + col]) - rowBlackv, epsv));
                     vfloat valv = LVFU(rawData[row][col]);
                     valv -= rowBlackv;
                     STVFU(rawData[row][col], valv * linecorrv + rowBlackv);
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index c68f11ae0..83d937bd1 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1368,8 +1368,9 @@ static INLINE vfloat xcbrtf(vfloat d) {
   return y;
 }
 
-static INLINE vfloat LIMV( vfloat a, vfloat b, vfloat c ) {
-return vmaxf( b, vminf(a,c));
+static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
+    // clamps value to [low;high]; a NaN value passes through vminf(high, value) as second operand and is then replaced by low in vmaxf(..., low)
+    return vmaxf(vminf(high, value), low);
 }
 
 static INLINE vfloat SQRV(vfloat a){
diff --git a/rtgui/main-cli.cc b/rtgui/main-cli.cc
index 26ee4fa81..66e0b9cfc 100644
--- a/rtgui/main-cli.cc
+++ b/rtgui/main-cli.cc
@@ -410,8 +410,6 @@ int processLineParams ( int argc, char **argv )
                         return -3;
                     }
 
-                    std::cout << "Output is " << bits << "-bit " << (isFloat ? "floating-point" : "integer") << "." << std::endl;
-
                     break;
 
                 case 't':
@@ -622,6 +620,18 @@ int processLineParams ( int argc, char **argv )
         }
     }
 
+    if (bits == -1) {
+        // bits is still -1 here, i.e. no explicit value replaced the
+        // placeholder, so pick a default from the output type:
+        // "tif" gets 16 bits, while "jpg", "png" and every other
+        // output type get 8 bits.
+        if (outputType == "tif") {
+            bits = 16;
+        } else {
+            bits = 8;
+        }
+    }
+
     if ( !argv1.empty() ) {
         return 1;
     }
@@ -662,6 +672,7 @@ int processLineParams ( int argc, char **argv )
         rtengine::procparams::ProcParams currentParams;
 
         Glib::ustring inputFile = inputFiles[iFile];
+        std::cout << "Output is " << bits << "-bit " << (isFloat ? "floating-point" : "integer") << "." << std::endl;
         std::cout << "Processing: " << inputFile << std::endl;
 
         rtengine::InitialImage* ii = nullptr;