diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index a55c5f5bd..c0a63037a 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -397,94 +397,34 @@ public:
         vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
         return vintpf(diff, upper, lower);
     }
+
+    // Vectorized LUT access with integer indices. Out-of-range indices are clamped to the lower and upper bounds
 #ifdef __SSE4_1__
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
-    vfloat operator[](vint idxv ) const
+    vfloat operator[](vint idxv) const
     {
-        vfloat tempv, p1v;
         idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
-        // access the LUT 4 times and shuffle the values into p1v
-
-        int idx;
-
-        // get 4th value
-        idx = _mm_extract_epi32(idxv, 3);
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
-        // now p1v is 3 3 3 3
-
-        // get 3rd value
-        idx = _mm_extract_epi32(idxv, 2);
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = _mm_move_ss( p1v, tempv);
-        // now p1v is 3 3 3 2
-
-        // get 2nd value
-        idx = _mm_extract_epi32(idxv, 1);
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
-        // now p1v is 3 2 3 2
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 3 1
-
-        // get 1st value
-        idx = _mm_cvtsi128_si32(idxv);
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p1v is 3 2 1 1
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 1 0
-
-        return p1v;
+        // access the LUT 4 times. Trust the compiler: it generates good code here, better than hand-written SSE code
+        return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
     }
 #else
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
-    vfloat operator[](vint idxv ) const
+    vfloat operator[](vint idxv) const
     {
-        vfloat tempv, p1v;
-        tempv = _mm_cvtepi32_ps(idxv);
-        tempv = _mm_min_ps( tempv, sizev );
-        idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( )  ));
-        // access the LUT 4 times and shuffle the values into p1v
-
-        int idx;
-
-        // get 4th value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
-        // now p1v is 3 3 3 3
-
-        // get 3rd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = _mm_move_ss( p1v, tempv);
-        // now p1v is 3 3 3 2
-
-        // get 2nd value
-        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
-        // now p1v is 3 2 3 2
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 3 1
-
-        // get 1st value
-        idx = _mm_cvtsi128_si32 (idxv);
-        tempv = _mm_load_ss(&data[idx]);
-        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
-        // now p1v is 3 2 1 1
-        p1v = _mm_move_ss( p1v, tempv );
-        // now p1v is 3 2 1 0
-
-        return p1v;
+        vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
+        idxv = _mm_cvttps_epi32(tempv);
+        // access the LUT 4 times. Trust the compiler: it generates good code here, better than hand-written SSE code
+        return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
+                           data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
+                           data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
+                           data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
     }
 #endif
 #endif
 
     // use with float indices
-    template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
-    T operator[](float index) const
+    template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
+    T operator[](V index) const
     {
         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here