diff --git a/rtengine/LUT.h b/rtengine/LUT.h index 6a617f40b..48c85726d 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -340,6 +340,37 @@ public: vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes); return vintpf(diff, upper, lower); } + + // NOTE: This version requires LUTs which do not clip at upper and lower bounds + vfloat operator()(vfloat indexv) const + { + static_assert(std::is_same::value, "This method only works for float LUTs"); + + // Clamp and convert to integer values. Extract out of SSE register because all + // lookup operations use regular addresses. + vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxsf), indexv)); + vint indexes = _mm_cvttps_epi32(clampedIndexes); + int indexArray[4]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); + + // Load data from the table. This reads more than necessary, but there don't seem + // to exist more granular operations (though we could try non-SSE). + // Cast to int for convenience in the next operation (partial transpose). + vint values[4]; + for (int i = 0; i < 4; ++i) { + values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); + } + + // Partial 4x4 transpose operation. We want two new vectors, the first consisting + // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. + __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); + __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); + vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); + vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); + + vfloat diff = indexv - _mm_cvtepi32_ps(indexes); + return vintpf(diff, upper, lower); + } #ifdef __SSE4_1__ template::value>::type> vfloat operator[](vint idxv ) const diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 082799e62..8300a24e7 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -3540,7 +3540,27 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { + vfloat cr = F2V(0.299f); + vfloat cg = F2V(0.587f); + vfloat cb = F2V(0.114f); + int j = jstart; + int tj = 0; +#ifdef __SSE2__ + for (; j < tW - 3; j+=4, tj+=4) { + + vfloat rv = LVF(rtemp[ti * TS + tj]); + vfloat gv = LVF(gtemp[ti * TS + tj]); + vfloat bv = LVF(btemp[ti * TS + tj]); + + //shadow tone curve + vfloat Yv = cr * rv + cg * gv + cb * bv; + vfloat tonefactorv = shtonecurve(Yv); + STVF(rtemp[ti * TS + tj], rv * tonefactorv); + STVF(gtemp[ti * TS + tj], gv * tonefactorv); + STVF(btemp[ti * TS + tj], bv * tonefactorv); + } +#endif + for (; j < tW; j++, tj++) { float r = rtemp[ti * TS + tj]; float g = gtemp[ti * TS + tj]; @@ -3588,19 +3608,35 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { + if (histToneCurveThr) { + for (int i = istart, ti = 0; i < tH; i++, ti++) { + for (int j = jstart, tj = 0; j < tW; j++, tj++) { - //brightness/contrast - rtemp[ti * TS + tj] = tonecurve[ rtemp[ti * TS + tj] ]; - gtemp[ti * TS + tj] = tonecurve[ gtemp[ti * TS + tj] ]; - btemp[ti * TS + tj] = tonecurve[ btemp[ti * TS + tj] ]; + //brightness/contrast + rtemp[ti * TS + tj] = tonecurve[ rtemp[ti * TS + tj] ]; + gtemp[ti * TS + tj] = tonecurve[ gtemp[ti * TS + tj] ]; + btemp[ti * TS + tj] = tonecurve[ btemp[ti * TS + tj] ]; - if (histToneCurveThr) { int y = CLIP (lumimulf[0] * Color::gamma2curve[rtemp[ti * TS + tj]] + lumimulf[1] * Color::gamma2curve[gtemp[ti * TS + tj]] + lumimulf[2] * Color::gamma2curve[btemp[ti * TS + tj]]); histToneCurveThr[y >> histToneCurveCompression]++; } } + } else { + for (int i = istart, ti = 0; i < tH; i++, ti++) { + int j = jstart, tj = 0; + for (; j < tW - 3; j+=4, tj+=4) { + //brightness/contrast + STVF(rtemp[ti * TS + tj], tonecurve(LVF(rtemp[ti * TS + tj]))); + STVF(gtemp[ti * TS + tj], tonecurve(LVF(gtemp[ti * TS + tj]))); + STVF(btemp[ti * TS + tj], tonecurve(LVF(btemp[ti * TS + tj]))); + } + for (; j < tW; j++, tj++) { + //brightness/contrast + rtemp[ti * TS + tj] = tonecurve[rtemp[ti * TS + tj]]; + gtemp[ti * TS + tj] = tonecurve[gtemp[ti * TS + tj]]; + btemp[ti * TS + tj] = tonecurve[btemp[ti * TS + tj]]; + } + } } if (editID == EUID_ToneCurve1) { // filling the pipette buffer @@ -3687,10 +3723,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer if (hasToneCurve2) { if (curveMode2 == ToneCurveParams::TcMode::STD) { // Standard for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { - const StandardToneCurve& userToneCurve = static_cast (customToneCurve2); - userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]); - } + const StandardToneCurve& userToneCurve = static_cast (customToneCurve2); + userToneCurve.BatchApply ( + 0, tW - jstart, + &rtemp[ti * TS], >emp[ti * TS], &btemp[ti * TS]); } } else if (curveMode2 == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like for (int i = istart, ti = 0; i < tH; i++, ti++) {