From 23934649eb99641205c759ab06ceb3cfc7b76e4f Mon Sep 17 00:00:00 2001 From: heckflosse Date: Wed, 3 Jan 2018 19:29:18 +0100 Subject: [PATCH] Speedup for weighted tone curve --- rtengine/curves.h | 79 ++++++++++++++++++++++++++++++++++++++++++- rtengine/improcfun.cc | 8 +---- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/rtengine/curves.h b/rtengine/curves.h index e8b65c33d..73cc37ff6 100644 --- a/rtengine/curves.h +++ b/rtengine/curves.h @@ -828,8 +828,12 @@ class WeightedStdToneCurve : public ToneCurve { private: float Triangle(float refX, float refY, float X2) const; +#if defined( __SSE2__ ) && defined( __x86_64__ ) + vfloat Triangle(vfloat refX, vfloat refY, vfloat X2) const; +#endif public: void Apply(float& r, float& g, float& b) const; + void BatchApply(const size_t start, const size_t end, float *r, float *g, float *b) const; }; class LuminanceToneCurve : public ToneCurve @@ -1003,6 +1007,17 @@ inline float WeightedStdToneCurve::Triangle(float a, float a1, float b) const return a1; } +#if defined( __SSE2__ ) && defined( __x86_64__ ) +inline vfloat WeightedStdToneCurve::Triangle(vfloat a, vfloat a1, vfloat b) const +{ + vfloat a2 = a1 - a; + vmask cmask = vmaskf_lt(b, a); + vfloat b3 = vself(cmask, b, F2V(65535.f) - b); + vfloat a3 = vself(cmask, a, F2V(65535.f) - a); + return b + a2 * b3 / a3; +} +#endif + // Tone curve modifying the value channel only, preserving hue and saturation // values in 0xffff space inline void WeightedStdToneCurve::Apply (float& r, float& g, float& b) const @@ -1010,6 +1025,9 @@ inline void WeightedStdToneCurve::Apply (float& r, float& g, float& b) const assert (lutToneCurve); + r = CLIP(r); + g = CLIP(g); + b = CLIP(b); float r1 = lutToneCurve[r]; float g1 = Triangle(r, r1, g); float b1 = Triangle(r, r1, b); @@ -1022,11 +1040,70 @@ inline void WeightedStdToneCurve::Apply (float& r, float& g, float& b) const float r3 = Triangle(b, b3, r); float g3 = Triangle(b, b3, g); - r = CLIP( r1 * 0.50f + r2 * 0.25f + r3 * 0.25f); + r = CLIP(r1 * 0.50f + r2 * 0.25f + r3 * 0.25f); g = CLIP(g1 * 0.25f + g2 * 0.50f + g3 * 0.25f); b = CLIP(b1 * 0.25f + b2 * 0.25f + b3 * 0.50f); } +inline void WeightedStdToneCurve::BatchApply(const size_t start, const size_t end, float *r, float *g, float *b) const { + assert (lutToneCurve); + assert (lutToneCurve.getClip() & LUT_CLIP_BELOW); + assert (lutToneCurve.getClip() & LUT_CLIP_ABOVE); + + // All pointers must have the same alignment for SSE usage. In the loop body below, + // we will only check `r`, assuming that the same result would hold for `g` and `b`. + assert (reinterpret_cast(r) % 16 == reinterpret_cast(g) % 16); + assert (reinterpret_cast(g) % 16 == reinterpret_cast(b) % 16); + + size_t i = start; + while (true) { + if (i >= end) { + // If we get to the end before getting to an aligned address, just return. + // (Or, for non-SSE mode, if we get to the end.) + return; +#if defined( __SSE2__ ) && defined( __x86_64__ ) + } else if (reinterpret_cast(&r[i]) % 16 == 0) { + // Otherwise, we get to the first aligned address; go to the SSE part. + break; +#endif + } + Apply(r[i], g[i], b[i]); + i++; + } + +#if defined( __SSE2__ ) && defined( __x86_64__ ) + const vfloat c65535v = F2V(65535.f); + const vfloat zd5v = F2V(0.5f); + const vfloat zd25v = F2V(0.25f); + + for (; i + 3 < end; i += 4) { + vfloat r_val = LIMV(LVF(r[i]), ZEROV, c65535v); + vfloat g_val = LIMV(LVF(g[i]), ZEROV, c65535v); + vfloat b_val = LIMV(LVF(b[i]), ZEROV, c65535v); + vfloat r1 = lutToneCurve[r_val]; + vfloat g1 = Triangle(r_val, r1, g_val); + vfloat b1 = Triangle(r_val, r1, b_val); + + vfloat g2 = lutToneCurve[g_val]; + vfloat r2 = Triangle(g_val, g2, r_val); + vfloat b2 = Triangle(g_val, g2, b_val); + + vfloat b3 = lutToneCurve[b_val]; + vfloat r3 = Triangle(b_val, b3, r_val); + vfloat g3 = Triangle(b_val, b3, g_val); + + STVF(r[i], LIMV(r1 * zd5v + r2 * zd25v + r3 * zd25v, ZEROV, c65535v)); + STVF(g[i], LIMV(g1 * zd25v + g2 * zd5v + g3 * zd25v, ZEROV, c65535v)); + STVF(b[i], LIMV(b1 * zd25v + b2 * zd25v + b3 * zd5v, ZEROV, c65535v)); + } + + // Remainder in non-SSE. + for (; i < end; ++i) { + Apply(r[i], g[i], b[i]); + } +#endif +} + // Tone curve modifying the value channel only, preserving hue and saturation // values in 0xffff space inline void SatAndValueBlendingToneCurve::Apply (float& r, float& g, float& b) const diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 6a39befef..c639ffbec 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -222,14 +222,8 @@ void customToneCurve(const ToneCurve &customToneCurve, ToneCurveParams::TcMode c } } else if (curveMode == ToneCurveParams::TcMode::WEIGHTEDSTD) { // apply the curve to the rgb channels, weighted const WeightedStdToneCurve& userToneCurve = static_cast (customToneCurve); - for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { - rtemp[ti * tileSize + tj] = CLIP (rtemp[ti * tileSize + tj]); - gtemp[ti * tileSize + tj] = CLIP (gtemp[ti * tileSize + tj]); - btemp[ti * tileSize + tj] = CLIP (btemp[ti * tileSize + tj]); - userToneCurve.Apply (rtemp[ti * tileSize + tj], gtemp[ti * tileSize + tj], btemp[ti * tileSize + tj]); - } + userToneCurve.BatchApply(0, tW - jstart, &rtemp[ti * tileSize], >emp[ti * tileSize], &btemp[ti * tileSize]); } } else if (curveMode == ToneCurveParams::TcMode::LUMINANCE) { // apply the curve to the luminance channel const LuminanceToneCurve& userToneCurve = static_cast (customToneCurve);