diff --git a/rtengine/LUT.h b/rtengine/LUT.h index a55c5f5bd..c0a63037a 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -397,94 +397,34 @@ public: vfloat diff = indexv - _mm_cvtepi32_ps(indexes); return vintpf(diff, upper, lower); } + + // vectorized LUT access with four packed 32-bit integer indices. Every index is clamped to [0, sizeiv] (SSE4.1 path) or [0, sizev] (SSE2 path) before the lookup, so out-of-range indices read the entry at the nearest bound #ifdef __SSE4_1__ template::value>::type> - vfloat operator[](vint idxv ) const + vfloat operator[](vint idxv) const { - vfloat tempv, p1v; idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); - // access the LUT 4 times and shuffle the values into p1v - - int idx; - - // get 4th value - idx = _mm_extract_epi32(idxv, 3); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0)); - // now p1v is 3 3 3 3 - - // get 3rd value - idx = _mm_extract_epi32(idxv, 2); - tempv = _mm_load_ss(&data[idx]); - p1v = _mm_move_ss( p1v, tempv); - // now p1v is 3 3 3 2 - - // get 2nd value - idx = _mm_extract_epi32(idxv, 1); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0)); - // now p1v is 3 2 3 2 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 3 1 - - // get 1st value - idx = _mm_cvtsi128_si32(idxv); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0)); - // now p1v is 3 2 1 1 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 1 0 - - return p1v; + // access the LUT 4 times. Trust the compiler. 
It generates good code here, better than hand written SSE code + return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]); } #else template::value>::type> - vfloat operator[](vint idxv ) const + vfloat operator[](vint idxv) const { - vfloat tempv, p1v; - tempv = _mm_cvtepi32_ps(idxv); - tempv = _mm_min_ps( tempv, sizev ); - idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) )); - // access the LUT 4 times and shuffle the values into p1v - - int idx; - - // get 4th value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3))); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0)); - // now p1v is 3 3 3 3 - - // get 3rd value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2))); - tempv = _mm_load_ss(&data[idx]); - p1v = _mm_move_ss( p1v, tempv); - // now p1v is 3 3 3 2 - - // get 2nd value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1))); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0)); - // now p1v is 3 2 3 2 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 3 1 - - // get 1st value - idx = _mm_cvtsi128_si32 (idxv); - tempv = _mm_load_ss(&data[idx]); - p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0)); - // now p1v is 3 2 1 1 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 1 0 - - return p1v; + vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // clamp in float, then truncate back to int: SSE2 has no min/max for 32-bit integers (the SSE4.1 path uses _mm_min_epi32/_mm_max_epi32 instead) + idxv = _mm_cvttps_epi32(tempv); + // access the LUT 4 times. Trust the compiler. 
It generates good code here, better than hand written SSE code + return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)], + data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))], + data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))], + data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]); } #endif #endif // use with float indices - template::value>::type> - T operator[](float index) const + template::value && std::is_same::value>::type> + T operator[](V index) const { int idx = (int)index; // don't use floor! The difference in negative space is no problems here