From 09838174343e31d5465ad3dc9cf51a739a575a41 Mon Sep 17 00:00:00 2001
From: heckflosse
Date: Mon, 5 Nov 2018 15:59:41 +0100
Subject: [PATCH] SSE: Review usage of vminf, vmaxf functions, #4942

---
 rtengine/FTblockDN.cc      |  2 +-
 rtengine/LUT.h             | 11 ++++++-----
 rtengine/ciecam02.cc       | 18 +++++++++---------
 rtengine/clutstore.cc      |  2 +-
 rtengine/color.cc          |  6 +++---
 rtengine/fast_demo.cc      |  4 ++--
 rtengine/helpersse2.h      |  4 ++++
 rtengine/rawimagesource.cc |  4 ++--
 8 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 58e27e118..8ced521d3 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -3340,7 +3340,7 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
                         aNv = LVFU(acalc[i >> 1][j >> 1]);
                         bNv = LVFU(bcalc[i >> 1][j >> 1]);
                         _mm_storeu_ps(&noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv));
-                        _mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], _mm_max_ps(c100v, _mm_sqrt_ps(SQRV(aNv) + SQRV(bNv))));
+                        _mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)), c100v));
                     }

                     for (; j < tileright; j += 2) {
diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index d2f758689..fedc20ca2 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -320,7 +320,7 @@ public:

         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vmaxf(vminf(maxsv, indexv), ZEROV); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -352,7 +352,7 @@ public:

         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vmaxf(vminf(maxsv, indexv), ZEROV); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -372,7 +372,7 @@ public:
         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));

-        vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes);
+        vfloat diff = vmaxf(vminf(sizev, indexv), ZEROV) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN
         return vintpf(diff, upper, lower);
     }

@@ -383,7 +383,7 @@ public:

         // Clamp and convert to integer values. Extract out of SSE register because all
         // lookup operations use regular addresses.
-        vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+        vfloat clampedIndexes = vmaxf(vminf(maxsv, indexv), ZEROV); // this automagically uses ZEROV in case indexv is NaN
         vint indexes = _mm_cvttps_epi32(clampedIndexes);
         int indexArray[4];
         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -420,7 +420,8 @@ public:
     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
     vfloat operator[](vint idxv) const
     {
-        vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
+        // convert to float because SSE2 has no min/max for 32bit integers
+        vfloat tempv = vmaxf(vminf(sizev, _mm_cvtepi32_ps(idxv)), ZEROV); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int)
         idxv = _mm_cvttps_epi32(tempv);
         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
diff --git a/rtengine/ciecam02.cc b/rtengine/ciecam02.cc
index f9475eb4e..c5e172478 100644
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@@ -542,9 +542,9 @@ void Ciecam02::xyz2jchqms_ciecam02float ( vfloat &J, vfloat &C, vfloat &h, vfloa
     cat02_to_hpefloat ( rp, gp, bp, rc, gc, bc);

     //gamut correction M.H.Brill S.Susstrunk
-    rp = _mm_max_ps (rp, ZEROV);
-    gp = _mm_max_ps (gp, ZEROV);
-    bp = _mm_max_ps (bp, ZEROV);
+    rp = vmaxf (rp, ZEROV);
+    gp = vmaxf (gp, ZEROV);
+    bp = vmaxf (bp, ZEROV);
     rpa = nonlinear_adaptationfloat ( rp, fl );
     gpa = nonlinear_adaptationfloat ( gp, fl );
     bpa = nonlinear_adaptationfloat ( bp, fl );
@@ -559,20 +559,20 @@ void Ciecam02::xyz2jchqms_ciecam02float ( vfloat &J, vfloat &C, vfloat &h, vfloa
     myh = vself (vmaskf_lt (myh, ZEROV), temp, myh);

     a = ((rpa + rpa) + gpa + (F2V (0.05f) * bpa) - F2V (0.305f)) * nbb;
-    a = _mm_max_ps (a, ZEROV); //gamut correction M.H.Brill S.Susstrunk
+    a = vmaxf (a, ZEROV); //gamut correction M.H.Brill S.Susstrunk

     J = pow_F ( a / aw, c * cz * F2V (0.5f));

     e = ((F2V (961.53846f)) * nc * ncb) * (xcosf ( myh + F2V (2.0f) ) + F2V (3.8f));
-    t = (e * _mm_sqrt_ps ( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V (1.05f) * bpa));
+    t = (e * vsqrtf ( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V (1.05f) * bpa));

     C = pow_F ( t, F2V (0.9f) ) * J * pow1;

     Q = wh * J;
     J *= J * F2V (100.0f);
     M = C * pfl;
-    Q = _mm_max_ps (Q, F2V (0.0001f)); // avoid division by zero
-    s = F2V (100.0f) * _mm_sqrt_ps ( M / Q );
+    Q = vmaxf (Q, F2V (0.0001f)); // avoid division by zero
+    s = F2V (100.0f) * vsqrtf ( M / Q );
     h = (myh * F2V (180.f)) / F2V (rtengine::RT_PI);
 }
 #endif
@@ -710,7 +710,7 @@ void Ciecam02::jch2xyz_ciecam02float ( vfloat &x, vfloat &y, vfloat &z, vfloat J
     xyz_to_cat02float ( rw, gw, bw, xw, yw, zw);
     e = ((F2V (961.53846f)) * nc * ncb) * (xcosf ( ((h * F2V (rtengine::RT_PI)) / F2V (180.0f)) + F2V (2.0f) ) + F2V (3.8f));
     a = pow_F ( J / F2V (100.0f), reccmcz ) * aw;
-    t = pow_F ( F2V (10.f) * C / (_mm_sqrt_ps ( J ) * pow1), F2V (1.1111111f) );
+    t = pow_F ( F2V (10.f) * C / (vsqrtf ( J ) * pow1), F2V (1.1111111f) );
     calculate_abfloat ( ca, cb, h, e, t, nbb, a );
     Aab_to_rgbfloat ( rpa, gpa, bpa, a, ca, cb, nbb );

@@ -780,7 +780,7 @@ vfloat Ciecam02::inverse_nonlinear_adaptationfloat ( vfloat c, vfloat fl )
     c -= F2V (0.1f);
     fl = vmulsignf (fl, c);
     c = vabsf (c);
-    c = _mm_min_ps ( c, F2V (399.99f));
+    c = vminf ( c, F2V (399.99f));
     return (F2V (100.0f) / fl) * pow_F ( (F2V (27.13f) * c) / (F2V (400.0f) - c), F2V (2.38095238f) );
 }
 #endif
diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 87ce25d97..1a425d21b 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -226,7 +226,7 @@ void rtengine::HaldCLUT::getRGB(
 #else
     const vfloat v_in = _mm_set_ps(0.0f, *b, *g, *r);
     const vfloat v_tmp = v_in * F2V(flevel_minus_one);
-    const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(F2V(flevel_minus_two), v_tmp)));
+    const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(vminf(v_tmp, F2V(flevel_minus_two))));

     size_t index = color * 4;

diff --git a/rtengine/color.cc b/rtengine/color.cc
index 29844f64d..4ace03bc0 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -547,8 +547,8 @@ void Color::rgb2hsl(float r, float g, float b, float &h, float &s, float &l)
 #ifdef __SSE2__
 void Color::rgb2hsl(vfloat r, vfloat g, vfloat b, vfloat &h, vfloat &s, vfloat &l)
 {
-    vfloat maxv = _mm_max_ps(r, _mm_max_ps(g, b));
-    vfloat minv = _mm_min_ps(r, _mm_min_ps(g, b));
+    vfloat maxv = vmaxf(r, vmaxf(g, b));
+    vfloat minv = vminf(r, vminf(g, b));
     vfloat C = maxv - minv;
     vfloat tempv = maxv + minv;
     l = (tempv) * F2V(7.6295109e-6f);
@@ -2879,7 +2879,7 @@ void Color::LabGamutMunsell(float *labL, float *laba, float *labb, const int N,
             av = LVFU(laba[k]);
             bv = LVFU(labb[k]);
             _mm_storeu_ps(&HHBuffer[k], xatan2f(bv, av));
-            _mm_storeu_ps(&CCBuffer[k], _mm_sqrt_ps(SQRV(av) + SQRV(bv)) / c327d68v);
+            _mm_storeu_ps(&CCBuffer[k], vsqrtf(SQRV(av) + SQRV(bv)) / c327d68v);
         }

         for(; k < N; k++) {
diff --git a/rtengine/fast_demo.cc b/rtengine/fast_demo.cc
index cab38b1f2..e88661485 100644
--- a/rtengine/fast_demo.cc
+++ b/rtengine/fast_demo.cc
@@ -364,7 +364,7 @@ void RawImageSource::fast_demosaic()
                 for (int j = left + 1, cc = 1; j < right - 1; j += 4, cc += 4) {
                     //interpolate B/R colors at R/B sites
                     _mm_storeu_ps(&bluetile[rr * TS + cc], LVFU(greentile[rr * TS + cc]) - zd25v * ((LVFU(greentile[(rr - 1)*TS + (cc - 1)]) + LVFU(greentile[(rr - 1)*TS + (cc + 1)]) + LVFU(greentile[(rr + 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc - 1])) -
-                                  _mm_min_ps(clip_ptv, LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]))));
+                                  vminf(LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]), clip_ptv)));
                 }

 #else
@@ -381,7 +381,7 @@ void RawImageSource::fast_demosaic()
                 for (int j = left + 1, cc = 1; j < right - 1; j += 4, cc += 4) {
                     //interpolate B/R colors at R/B sites
                     _mm_storeu_ps(&redtile[rr * TS + cc], LVFU(greentile[rr * TS + cc]) - zd25v * ((LVFU(greentile[(rr - 1)*TS + cc - 1]) + LVFU(greentile[(rr - 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc + 1]) + LVFU(greentile[(rr + 1)*TS + cc - 1])) -
-                                  _mm_min_ps(clip_ptv, LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]))));
+                                  vminf(LVFU(rawData[i - 1][j - 1]) + LVFU(rawData[i - 1][j + 1]) + LVFU(rawData[i + 1][j + 1]) + LVFU(rawData[i + 1][j - 1]), clip_ptv)));
                 }

 #else
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 46af3aa89..74780cf48 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -157,10 +157,14 @@ static INLINE vfloat vsqrtf(vfloat x)
 }
 static INLINE vfloat vmaxf(vfloat x, vfloat y)
 {
+    // _mm_max_ps(x, y) returns y if x is NaN
+    // don't change the order of the parameters
     return _mm_max_ps(x, y);
 }

 static INLINE vfloat vminf(vfloat x, vfloat y)
 {
+    // _mm_min_ps(x, y) returns y if x is NaN
+    // don't change the order of the parameters
     return _mm_min_ps(x, y);
 }
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 502da8073..9b8d3794e 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -3101,8 +3101,8 @@ void RawImageSource::processFlatField(const RAWParams &raw, RawImage *riFlatFile
                 vfloat rowBlackv = blackv[row & 1];

                 for (; col < W - 3; col += 4) {
-                    vfloat linecorrv = SQRV(vmaxf(epsv, LVFU(cfablur[row * W + col]) - rowBlackv)) /
-                                       (vmaxf(epsv, LVFU(cfablur1[row * W + col]) - rowBlackv) * vmaxf(epsv, LVFU(cfablur2[row * W + col]) - rowBlackv));
+                    vfloat linecorrv = SQRV(vmaxf(LVFU(cfablur[row * W + col]) - rowBlackv, epsv)) /
+                                       (vmaxf(LVFU(cfablur1[row * W + col]) - rowBlackv, epsv) * vmaxf(LVFU(cfablur2[row * W + col]) - rowBlackv, epsv));
                     vfloat valv = LVFU(rawData[row][col]);
                     valv -= rowBlackv;
                     STVFU(rawData[row][col], valv * linecorrv + rowBlackv);
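
Note (not part of the patch): every reordering above relies on the asymmetric NaN handling of
_mm_min_ps/_mm_max_ps documented in helpersse2.h: when the first operand is NaN, the second
operand is returned. The minimal standalone sketch below is an illustration only, not RawTherapee
code; the file name and variable names are made up, and it assumes SSE intrinsics are available
(build with, for example, g++ -O2 -msse2 nan_clamp.cc). It shows why putting the clamp bound in
the second position, as in the patched vmaxf(x, ZEROV), turns the clamp into a NaN filter, while
the old vmaxf(ZEROV, x) ordering lets NaN pass through.

// nan_clamp.cc, illustration only (hypothetical example, not part of the patch)
#include <cstdio>
#include <cmath>
#include <xmmintrin.h>

int main()
{
    const __m128 zero = _mm_setzero_ps();
    const __m128 x    = _mm_setr_ps(1.5f, -2.0f, std::nanf(""), 3.0f); // lane 2 is NaN

    float oldOrder[4], newOrder[4];
    _mm_storeu_ps(oldOrder, _mm_max_ps(zero, x)); // old ordering: the NaN lane stays NaN
    _mm_storeu_ps(newOrder, _mm_max_ps(x, zero)); // patched ordering: the NaN lane becomes 0

    for (int i = 0; i < 4; ++i) {
        std::printf("lane %d: max(zero, x) = %g   max(x, zero) = %g\n", i, oldOrder[i], newOrder[i]);
    }
    return 0;
}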