Add an SSE2 batch path for the Adobe ("film-like") tone curve.

rtengine/curves.h:
  * Declare and define AdobeToneCurve::BatchApply(start, end, r, g, b).
    It calls Apply() element by element until &r[i] is 16-byte aligned
    (or until `end` is reached), then, when __SSE2__ is defined, handles
    four elements per iteration with a vfloat overload of RGBTone(), and
    finally runs Apply() on the remaining tail elements. Without __SSE2__
    every element goes through Apply().
  * r, g and b must share the same alignment modulo 16; only r is checked
    inside the loop (this precondition is asserted).
  * RGBTone() parameters are renamed to maxval/medval/minval. The vfloat
    overload selects `minval` for lanes where max == min, where the
    interpolation would otherwise divide by zero.
  * The "Case 4: r >= g == b" comment in Apply() is left unchanged: that
    branch looks up r and g separately, so r may be greater than g there.

rtengine/improcfun.cc:
  * The FILMLIKE branch of customToneCurve() calls BatchApply() once per
    tile row instead of Apply() once per pixel.

Note: leading whitespace in this patch was reconstructed; if context lines
do not match the target files exactly, apply with
`git apply --ignore-whitespace`.

diff --git a/rtengine/curves.h b/rtengine/curves.h
index b88a3bdc4..0d5e6374b 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -887,9 +887,14 @@ class AdobeToneCurve : public ToneCurve
 {
 private:
     void RGBTone(float& r, float& g, float& b) const; // helper for tone curve
-
+#ifdef __SSE2__
+    void RGBTone(vfloat& r, vfloat& g, vfloat& b) const; // helper for tone curve
+#endif
 public:
     void Apply(float& r, float& g, float& b) const;
+    void BatchApply(
+            const size_t start, const size_t end,
+            float *r, float *g, float *b) const;
 };
 
 class SatAndValueBlendingToneCurve : public ToneCurve
@@ -1022,7 +1027,7 @@ inline void AdobeToneCurve::Apply (float& ir, float& ig, float& ib) const
             RGBTone (b, r, g);     // Case 2: b >  r >= g
         } else if (b > g) {
             RGBTone (r, b, g);     // Case 3: r >= b >  g
         } else {                           // Case 4: r >= g == b
             r = lutToneCurve[r];
             g = lutToneCurve[g];
             b = g;
@@ -1040,15 +1045,88 @@ inline void AdobeToneCurve::Apply (float& ir, float& ig, float& ib) const
     setUnlessOOG(ir, ig, ib, r, g, b);
 }
 
-inline void AdobeToneCurve::RGBTone (float& r, float& g, float& b) const
-{
-    float rold = r, gold = g, bold = b;
+inline void AdobeToneCurve::BatchApply(
+        const size_t start, const size_t end,
+        float *r, float *g, float *b) const {
+    assert (lutToneCurve);
+    assert (lutToneCurve.getClip() & LUT_CLIP_BELOW);
+    assert (lutToneCurve.getClip() & LUT_CLIP_ABOVE);
 
-    r = lutToneCurve[rold];
-    b = lutToneCurve[bold];
-    g = b + ((r - b) * (gold - bold) / (rold - bold));
+    // All pointers must have the same alignment for SSE usage. In the loop body below,
+    // we will only check `r`, assuming that the same result would hold for `g` and `b`.
+    assert (reinterpret_cast<uintptr_t>(r) % 16 == reinterpret_cast<uintptr_t>(g) % 16);
+    assert (reinterpret_cast<uintptr_t>(g) % 16 == reinterpret_cast<uintptr_t>(b) % 16);
+
+    size_t i = start;
+    while (true) {
+        if (i >= end) {
+            // If we get to the end before getting to an aligned address, just return.
+            // (Or, for non-SSE mode, if we get to the end.)
+            return;
+#ifdef __SSE2__
+        } else if (reinterpret_cast<uintptr_t>(&r[i]) % 16 == 0) {
+            // Otherwise, we get to the first aligned address; go to the SSE part.
+            break;
+#endif
+        }
+        Apply(r[i], g[i], b[i]);
+        i++;
+    }
+#ifdef __SSE2__
+    const vfloat upperv = F2V(MAXVALF);
+    for (; i + 3 < end; i += 4) {
+
+        vfloat rc = vclampf(LVF(r[i]), ZEROV, upperv);
+        vfloat gc = vclampf(LVF(g[i]), ZEROV, upperv);
+        vfloat bc = vclampf(LVF(b[i]), ZEROV, upperv);
+
+        vfloat minval = vminf(vminf(rc, gc), bc);
+        vfloat maxval = vmaxf(vmaxf(rc, gc), bc);
+        vfloat medval = vmaxf(vminf(rc, gc), vminf(bc, vmaxf(rc, gc)));
+
+        const vfloat minvalold = minval;
+        const vfloat maxvalold = maxval;
+
+        RGBTone(maxval, medval, minval);
+
+        const vfloat nr = vself(vmaskf_eq(rc, maxvalold), maxval, vself(vmaskf_eq(rc, minvalold), minval, medval));
+        const vfloat ng = vself(vmaskf_eq(gc, maxvalold), maxval, vself(vmaskf_eq(gc, minvalold), minval, medval));
+        const vfloat nb = vself(vmaskf_eq(bc, maxvalold), maxval, vself(vmaskf_eq(bc, minvalold), minval, medval));
+
+        rc = LVF(r[i]);
+        gc = LVF(g[i]);
+        bc = LVF(b[i]);
+        setUnlessOOG(rc, gc, bc, nr, ng, nb);
+        STVF(r[i], rc);
+        STVF(g[i], gc);
+        STVF(b[i], bc);
+    }
+    // Remainder in non-SSE.
+    for (; i < end; ++i) {
+        Apply(r[i], g[i], b[i]);
+    }
+#endif
 }
 
+inline void AdobeToneCurve::RGBTone (float& maxval, float& medval, float& minval) const
+{
+    float minvalold = minval, medvalold = medval, maxvalold = maxval;
+
+    maxval = lutToneCurve[maxvalold];
+    minval = lutToneCurve[minvalold];
+    medval = minval + ((maxval - minval) * (medvalold - minvalold) / (maxvalold - minvalold));
+}
+#ifdef __SSE2__
+inline void AdobeToneCurve::RGBTone (vfloat& maxval, vfloat& medval, vfloat& minval) const
+{
+    const vfloat minvalold = minval, maxvalold = maxval;
+
+    maxval = lutToneCurve[maxvalold];
+    minval = lutToneCurve[minvalold];
+    medval = minval + ((maxval - minval) * (medval - minvalold) / (maxvalold - minvalold));
+    medval = vself(vmaskf_eq(minvalold, maxvalold), minval, medval);
+}
+#endif
 // Modifying the Luminance channel only
 inline void LuminanceToneCurve::Apply(float &ir, float &ig, float &ib) const
 {
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 465e3ffe8..cec81d520 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -215,9 +215,7 @@ void customToneCurve(const ToneCurve &customToneCurve, ToneCurveParams::TcMode c
     } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
         const AdobeToneCurve& userToneCurve = static_cast<const AdobeToneCurve&> (customToneCurve);
         for (int i = istart, ti = 0; i < tH; i++, ti++) {
-            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                userToneCurve.Apply(rtemp[ti * tileSize + tj], gtemp[ti * tileSize + tj], btemp[ti * tileSize + tj]);
-            }
+            userToneCurve.BatchApply(0, tW - jstart, &rtemp[ti * tileSize], &gtemp[ti * tileSize], &btemp[ti * tileSize]);
         }
     } else if (curveMode == ToneCurveParams::TcMode::SATANDVALBLENDING) { // apply the curve on the saturation and value channels
         const SatAndValueBlendingToneCurve& userToneCurve = static_cast<const SatAndValueBlendingToneCurve&> (customToneCurve);