Accessing a LUT with a double-precision floating-point index truncates the index; fixes #4431
This commit is contained in:
parent
4e9478df24
commit
84c56f55b2
@ -397,94 +397,34 @@ public:
|
||||
vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
|
||||
return vintpf(diff, upper, lower);
|
||||
}
|
||||
|
||||
// vectorized LUT access with integer indices. Clips at lower and upper bounds
|
||||
#ifdef __SSE4_1__
|
||||
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
|
||||
vfloat operator[](vint idxv ) const
|
||||
vfloat operator[](vint idxv) const
|
||||
{
|
||||
vfloat tempv, p1v;
|
||||
idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
|
||||
// access the LUT 4 times and shuffle the values into p1v
|
||||
|
||||
int idx;
|
||||
|
||||
// get 4th value
|
||||
idx = _mm_extract_epi32(idxv, 3);
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
// now p1v is 3 3 3 3
|
||||
|
||||
// get 3rd value
|
||||
idx = _mm_extract_epi32(idxv, 2);
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = _mm_move_ss( p1v, tempv);
|
||||
// now p1v is 3 3 3 2
|
||||
|
||||
// get 2nd value
|
||||
idx = _mm_extract_epi32(idxv, 1);
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
// now p1v is 3 2 3 2
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 3 1
|
||||
|
||||
// get 1st value
|
||||
idx = _mm_cvtsi128_si32(idxv);
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
|
||||
// now p1v is 3 2 1 1
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 1 0
|
||||
|
||||
return p1v;
|
||||
// access the LUT 4 times. Trust the compiler: it generates good code here, better than hand-written SSE code
|
||||
return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
|
||||
}
|
||||
#else
|
||||
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
|
||||
vfloat operator[](vint idxv ) const
|
||||
vfloat operator[](vint idxv) const
|
||||
{
|
||||
vfloat tempv, p1v;
|
||||
tempv = _mm_cvtepi32_ps(idxv);
|
||||
tempv = _mm_min_ps( tempv, sizev );
|
||||
idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) ));
|
||||
// access the LUT 4 times and shuffle the values into p1v
|
||||
|
||||
int idx;
|
||||
|
||||
// get 4th value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
// now p1v is 3 3 3 3
|
||||
|
||||
// get 3rd value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = _mm_move_ss( p1v, tempv);
|
||||
// now p1v is 3 3 3 2
|
||||
|
||||
// get 2nd value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
// now p1v is 3 2 3 2
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 3 1
|
||||
|
||||
// get 1st value
|
||||
idx = _mm_cvtsi128_si32 (idxv);
|
||||
tempv = _mm_load_ss(&data[idx]);
|
||||
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
|
||||
// now p1v is 3 2 1 1
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 1 0
|
||||
|
||||
return p1v;
|
||||
vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
|
||||
idxv = _mm_cvttps_epi32(tempv);
|
||||
// access the LUT 4 times. Trust the compiler: it generates good code here, better than hand-written SSE code
|
||||
return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
|
||||
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
|
||||
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
|
||||
data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// use with float indices
|
||||
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
|
||||
T operator[](float index) const
|
||||
template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
|
||||
T operator[](V index) const
|
||||
{
|
||||
int idx = (int)index; // don't use floor! The difference in negative space is not a problem here
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user