diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc index 05f1a32a2..227efe6b9 100644 --- a/rtengine/dcraw.cc +++ b/rtengine/dcraw.cc @@ -9937,6 +9937,7 @@ static void decodeFPDeltaRow(Bytef * src, Bytef * dst, size_t tileWidth, size_t } +#ifndef __F16C__ // From DNG SDK dng_utils.h static inline uint32_t DNG_HalfToFloat(uint16_t halfValue) { int32_t sign = (halfValue >> 15) & 0x00000001; @@ -9970,6 +9971,7 @@ static inline uint32_t DNG_HalfToFloat(uint16_t halfValue) { // Assemble sign, exponent and mantissa. return (uint32_t) ((sign << 31) | (exponent << 23) | mantissa); } +#endif static inline uint32_t DNG_FP24ToFloat(const uint8_t * input) { int32_t sign = (input [0] >> 7) & 0x01; @@ -10006,11 +10008,32 @@ static inline uint32_t DNG_FP24ToFloat(const uint8_t * input) { static void expandFloats(Bytef * dst, int tileWidth, int bytesps) { if (bytesps == 2) { - uint16_t * dst16 = (uint16_t *) dst; - uint32_t * dst32 = (uint32_t *) dst; + uint16_t* const dst16 = reinterpret_cast<uint16_t*>(dst); +#ifndef __F16C__ + uint32_t* const dst32 = reinterpret_cast<uint32_t*>(dst); for (int index = tileWidth - 1; index >= 0; --index) { - dst32[index] = DNG_HalfToFloat(dst16[index]); + dst32[index] = DNG_HalfToFloat(dst16[index]); } +#else + float* const dst32 = reinterpret_cast<float*>(dst); + int index = tileWidth - 8; + for (; index >= 0; index -= 8) { + __m128i halfFloatv = _mm_loadu_si128((__m128i*)&dst16[index]); + STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); + STVFU(dst32[index + 4], _mm_cvtph_ps(_mm_shuffle_epi32(halfFloatv, _MM_SHUFFLE(0,0,3,2)))); + } + index += 4; + if(index >= 0) { + __m128i halfFloatv = _mm_loadu_si128((__m128i*)&dst16[index]); + STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); + index--; + } else { + index += 3; + } + for (; index >= 0; --index) { + dst32[index] = _cvtsh_ss(dst16[index]); + } +#endif } else if (bytesps == 3) { uint8_t * dst8 = ((uint8_t *) dst) + (tileWidth - 1) * 3; uint32_t * dst32 = (uint32_t *) dst;