From f2765b9a1cb53c9f92168f6f25403ca4909a2797 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Mon, 2 Jul 2018 20:29:41 +0200 Subject: [PATCH 1/3] If available, use intrinsics to convert half precision float values to single precision float values when loading float dng files --- rtengine/dcraw.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc index 05f1a32a2..a246e2b75 100644 --- a/rtengine/dcraw.cc +++ b/rtengine/dcraw.cc @@ -9937,6 +9937,7 @@ static void decodeFPDeltaRow(Bytef * src, Bytef * dst, size_t tileWidth, size_t } +#ifndef __F16C__ // From DNG SDK dng_utils.h static inline uint32_t DNG_HalfToFloat(uint16_t halfValue) { int32_t sign = (halfValue >> 15) & 0x00000001; @@ -9970,6 +9971,7 @@ static inline uint32_t DNG_HalfToFloat(uint16_t halfValue) { // Assemble sign, exponent and mantissa. return (uint32_t) ((sign << 31) | (exponent << 23) | mantissa); } +#endif static inline uint32_t DNG_FP24ToFloat(const uint8_t * input) { int32_t sign = (input [0] >> 7) & 0x01; @@ -10007,10 +10009,23 @@ static inline uint32_t DNG_FP24ToFloat(const uint8_t * input) { static void expandFloats(Bytef * dst, int tileWidth, int bytesps) { if (bytesps == 2) { uint16_t * dst16 = (uint16_t *) dst; +#ifndef __F16C__ uint32_t * dst32 = (uint32_t *) dst; for (int index = tileWidth - 1; index >= 0; --index) { - dst32[index] = DNG_HalfToFloat(dst16[index]); + dst32[index] = DNG_HalfToFloat(dst16[index]); } +#else + float * dst32 = (float *) dst; + int index = tileWidth - 4; + for (; index >= 0; index -= 4) { + __m128i halfFloatv = _mm_loadu_si128((__m128i_u*)&dst16[index]); + STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); + } + index += 3; + for (; index >= 0; --index) { + dst32[index] = _cvtsh_ss(dst16[index]); + } +#endif } else if (bytesps == 3) { uint8_t * dst8 = ((uint8_t *) dst) + (tileWidth - 1) * 3; uint32_t * dst32 = (uint32_t *) dst; From 0dd886d152e19687dd602c6ff359ba6da2e75d92 Mon Sep 17 00:00:00 
2001 From: heckflosse Date: Tue, 3 Jul 2018 12:28:45 +0200 Subject: [PATCH 2/3] small changes in conversion from half float to single precision float --- rtengine/dcraw.cc | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc index a246e2b75..98b7ac4a9 100644 --- a/rtengine/dcraw.cc +++ b/rtengine/dcraw.cc @@ -10008,20 +10008,28 @@ static inline uint32_t DNG_FP24ToFloat(const uint8_t * input) { static void expandFloats(Bytef * dst, int tileWidth, int bytesps) { if (bytesps == 2) { - uint16_t * dst16 = (uint16_t *) dst; + uint16_t* const dst16 = reinterpret_cast<uint16_t*>(dst); #ifndef __F16C__ - uint32_t * dst32 = (uint32_t *) dst; + uint32_t* const dst32 = reinterpret_cast<uint32_t*>(dst); for (int index = tileWidth - 1; index >= 0; --index) { dst32[index] = DNG_HalfToFloat(dst16[index]); } #else - float * dst32 = (float *) dst; - int index = tileWidth - 4; - for (; index >= 0; index -= 4) { + float* const dst32 = reinterpret_cast<float*>(dst); + int index = tileWidth - 8; + for (; index >= 0; index -= 8) { __m128i halfFloatv = _mm_loadu_si128((__m128i_u*)&dst16[index]); STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); + STVFU(dst32[index + 4], _mm_cvtph_ps(_mm_shuffle_epi32(halfFloatv, _MM_SHUFFLE(0,0,3,2)))); + } + index += 4; + if(index >= 0) { + __m128i halfFloatv = _mm_loadu_si128((__m128i_u*)&dst16[index]); + STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); + index--; + } else { + index += 3; } - index += 3; for (; index >= 0; --index) { dst32[index] = _cvtsh_ss(dst16[index]); } From ccfe4275d17d445792acbf77cd9a1400ad9c6bce Mon Sep 17 00:00:00 2001 From: heckflosse Date: Tue, 3 Jul 2018 21:07:39 +0200 Subject: [PATCH 3/3] use __m128i instead of __m128i_u --- rtengine/dcraw.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc index 98b7ac4a9..227efe6b9 100644 --- a/rtengine/dcraw.cc +++ b/rtengine/dcraw.cc @@ -10018,13 +10018,13 @@ static void expandFloats(Bytef * 
dst, int tileWidth, int bytesps) { float* const dst32 = reinterpret_cast<float*>(dst); int index = tileWidth - 8; for (; index >= 0; index -= 8) { - __m128i halfFloatv = _mm_loadu_si128((__m128i_u*)&dst16[index]); + __m128i halfFloatv = _mm_loadu_si128((__m128i*)&dst16[index]); STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); STVFU(dst32[index + 4], _mm_cvtph_ps(_mm_shuffle_epi32(halfFloatv, _MM_SHUFFLE(0,0,3,2)))); } index += 4; if(index >= 0) { - __m128i halfFloatv = _mm_loadu_si128((__m128i_u*)&dst16[index]); + __m128i halfFloatv = _mm_loadu_si128((__m128i*)&dst16[index]); STVFU(dst32[index], _mm_cvtph_ps(halfFloatv)); index--; } else {