Accessing a LUT with a double-precision floating-point index truncates the index, fixes #4431

This commit is contained in:
heckflosse 2018-03-20 14:40:58 +01:00
parent 4e9478df24
commit 84c56f55b2

View File

@ -397,94 +397,34 @@ public:
vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
return vintpf(diff, upper, lower);
}
// vectorized LUT access with integer indices. Clips at lower and upper bounds
#ifdef __SSE4_1__
// SSE4.1 variant of the vectorized lookup: returns data[] at the four 32-bit integer
// indices held in idxv, one result per lane. Only enabled when U (which defaults to T) is float.
// NOTE(review): this span looks like a diff rendered without +/- markers — two signature lines,
// the old shuffle-based body and the new _mm_setr_ps body are all present. Confirm which lines are live.
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
vfloat operator[](vint idxv ) const
vfloat operator[](vint idxv) const
{
vfloat tempv, p1v;
// clamp every lane to [0, sizeiv] using the signed 32-bit integer min/max intrinsics
idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
// access the LUT 4 times and shuffle the values into p1v
// (_mm_load_ss loads one float into the lowest lane; _mm_move_ss replaces only the lowest lane of p1v)
int idx;
// get 4th value
idx = _mm_extract_epi32(idxv, 3);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3
// get 3rd value
idx = _mm_extract_epi32(idxv, 2);
tempv = _mm_load_ss(&data[idx]);
p1v = _mm_move_ss( p1v, tempv);
// now p1v is 3 3 3 2
// get 2nd value
idx = _mm_extract_epi32(idxv, 1);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1
// get 1st value
idx = _mm_cvtsi128_si32(idxv);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0
return p1v;
// access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
// _mm_setr_ps takes its arguments lowest lane first, so lane i receives data[] at the index from lane i of idxv
return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
}
#else
// Fallback variant of the vectorized lookup, compiled when __SSE4_1__ is not defined: returns data[]
// at the four 32-bit integer indices held in idxv, one result per lane.
// Only enabled when U (which defaults to T) is float.
// NOTE(review): this span looks like a diff rendered without +/- markers — two signature lines,
// two declarations of tempv and two bodies are present. Confirm which lines are live.
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
vfloat operator[](vint idxv ) const
vfloat operator[](vint idxv) const
{
vfloat tempv, p1v;
// clamp in the float domain: convert the indices to float, clip to [0, sizev], convert back with truncation
tempv = _mm_cvtepi32_ps(idxv);
tempv = _mm_min_ps( tempv, sizev );
idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) ));
// access the LUT 4 times and shuffle the values into p1v
// (each index is read by broadcasting its lane with _mm_shuffle_epi32 and taking the low 32 bits)
int idx;
// get 4th value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3
// get 3rd value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
tempv = _mm_load_ss(&data[idx]);
p1v = _mm_move_ss( p1v, tempv);
// now p1v is 3 3 3 2
// get 2nd value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1
// get 1st value
idx = _mm_cvtsi128_si32 (idxv);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0
return p1v;
// NOTE(review): the int -> float round trip is exact only for magnitudes up to 2^24; presumably LUT sizes stay below that — confirm
vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
idxv = _mm_cvttps_epi32(tempv);
// access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
// _mm_setr_ps takes its arguments lowest lane first; lane 0 is read directly, lanes 1..3 via a broadcast shuffle
return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
}
#endif
#endif
// use with float indices
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
T operator[](float index) const
template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
T operator[](V index) const
{
int idx = (int)index; // don't use floor! The difference in negative space is no problems here