Merge pull request #4277 from Beep6581/perceptual_curve_speedup

Perceptual tone curve speedup
2018-01-06 00:09:54 +01:00 · 2018-01-06 00:09:54 +01:00 · 2d37fd6676
commit 2d37fd6676
parent 7d97a172c4 0fcc1987a5
5 changed files with 282 additions and 246 deletions
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@ -608,12 +608,12 @@ void Ciecam02::calculate_ab ( double &aa, double &bb, double h, double e, double
 }
 void Ciecam02::calculate_abfloat ( float &aa, float &bb, float h, float e, float t, float nbb, float a )
 {
-    float2 sincosval = xsincosf ((h * rtengine::RT_PI) / 180.0f);
+    float2 sincosval = xsincosf(h * rtengine::RT_PI_F_180);
    float sinh = sincosval.x;
    float cosh = sincosval.y;
    float x = (a / nbb) + 0.305f;
-    float p3 = 1.05f;
-    bool swapValues = fabs ( sinh ) > fabs ( cosh );
+    constexpr float p3 = 1.05f;
+    const bool swapValues = fabs(sinh) > fabs(cosh);

    if (swapValues) {
        std::swap(sinh, cosh);
@ -626,7 +626,7 @@ void Ciecam02::calculate_abfloat ( float &aa, float &bb, float h, float e, float
        std::swap(c1, c2);
    }

-    float div = ((e / (t * cosh)) - (-0.31362f - (p3 * 0.15681f)) * c1 - ((0.01924f - (p3 * 4.49038f)) * (c2)));
+    float div = ((e / (t * cosh)) - (-0.31362f - (p3 * 0.15681f)) * c1 - ((0.01924f - (p3 * 4.49038f)) * c2));
    // for large values of t the above calculation can change its sign, which results in a hue shift of 180 degrees,
    // so we have to check the sign to avoid this shift.
    // Additionally it seems useful to limit the minimum value of div
@ -1007,9 +1007,18 @@ void Ciecam02::xyz2jch_ciecam02float ( float &J, float &C, float &h, float aw, f
        bp = MAXR (bp, 0.0f);
    }

+#ifdef __SSE2__
+    vfloat pv = _mm_setr_ps(rp, gp, bp, 1.f);
+    vfloat fv = F2V(fl);
+    vfloat outv = nonlinear_adaptationfloat(pv, fv);
+    rpa = outv[0];
+    gpa = outv[1];
+    bpa = outv[2];
+#else
    rpa = nonlinear_adaptationfloat(rp, fl);
    gpa = nonlinear_adaptationfloat(gp, fl);
    bpa = nonlinear_adaptationfloat(bp, fl);
+#endif

    ca = rpa - ((12.0f * gpa) - bpa) / 11.0f;
    cb = (0.11111111f) * (rpa + gpa - (2.0f * bpa));
@ -1085,17 +1094,34 @@ void Ciecam02::jch2xyz_ciecam02float ( float &x, float &y, float &z, float J, fl
    float e, t;
    gamu = 1;
    xyz_to_cat02float(rw, gw, bw, xw, yw, zw, gamu);
-    e = ((961.53846f) * nc * ncb) * (xcosf ( ((h * rtengine::RT_PI) / 180.0f) + 2.0f ) + 3.8f);
+    e = ((961.53846f) * nc * ncb) * (xcosf(h * rtengine::RT_PI_F_180 + 2.0f) + 3.8f);
+
+#ifdef __SSE2__
+    vfloat powinv1 = _mm_setr_ps(J / 100.0f, 10.f * C / (sqrtf(J) * pow1), 1.f, 1.f);
+    vfloat powinv2 = _mm_setr_ps(1.0f / (c * cz), 1.1111111f, 1.f, 1.f);
+    vfloat powoutv = pow_F(powinv1, powinv2);
+    a = powoutv[0] * aw;
+    t = powoutv[1];
+#else
    a = pow_F(J / 100.0f, 1.0f / (c * cz)) * aw;
    t = pow_F(10.f * C / (sqrtf(J) * pow1), 1.1111111f);
+#endif

    calculate_abfloat(ca, cb, h, e, t, nbb, a);
    Aab_to_rgbfloat(rpa, gpa, bpa, a, ca, cb, nbb);

+#ifdef __SSE2__
+    vfloat pav = _mm_setr_ps(rpa, gpa, bpa, 1.f);
+    vfloat fv = F2V(fl);
+    vfloat outv = inverse_nonlinear_adaptationfloat(pav, fv);
+    rp = outv[0];
+    gp = outv[1];
+    bp = outv[2];
+#else
    rp = inverse_nonlinear_adaptationfloat(rpa, fl);
    gp = inverse_nonlinear_adaptationfloat(gpa, fl);
    bp = inverse_nonlinear_adaptationfloat(bpa, fl);
-
+#endif
    hpe_to_xyzfloat(x, y, z, rp, gp, bp);
    xyz_to_cat02float(rc, gc, bc, x, y, z, gamu);

--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@ -1822,9 +1822,14 @@ float PerceptualToneCurve::calculateToneCurveContrastValue() const
    return maxslope;
 }

-void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurveState & state) const
+void PerceptualToneCurve::BatchApply(const size_t start, const size_t end, float *rc, float *gc, float *bc, const PerceptualToneCurveState &state) const
 {
-    float x, y, z;
+    const AdobeToneCurve& adobeTC = static_cast<const AdobeToneCurve&>((const ToneCurve&) * this);
+
+    for (size_t i = start; i < end; ++i) {
+        float r = CLIP(rc[i]);
+        float g = CLIP(gc[i]);
+        float b = CLIP(bc[i]);

        if (!state.isProphoto) {
            // convert to prophoto space to make sure the same result is had regardless of working color space
@ -1836,38 +1841,38 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
            b = newb;
        }

-    const AdobeToneCurve& adobeTC = static_cast<const AdobeToneCurve&>((const ToneCurve&) * this);
        float ar = r;
        float ag = g;
        float ab = b;
        adobeTC.Apply(ar, ag, ab);

        if (ar >= 65535.f && ag >= 65535.f && ab >= 65535.f) {
-        // clip fast path, will also avoid strange colors of clipped highlights
-        r = g = b = 65535.f;
-        return;
+            // clip fast path, will also avoid strange colours of clipped highlights
+            rc[i] = gc[i] = bc[i] = 65535.f;
+            continue;
        }

        if (ar <= 0.f && ag <= 0.f && ab <= 0.f) {
-        r = g = b = 0;
-        return;
+            rc[i] = gc[i] = bc[i] = 0;
+            continue;
        }

        // ProPhoto constants for luminance, that is xyz_prophoto[1][]
-    const float Yr = 0.2880402f;
-    const float Yg = 0.7118741f;
-    const float Yb = 0.0000857f;
+        constexpr float Yr = 0.2880402f;
+        constexpr float Yg = 0.7118741f;
+        constexpr float Yb = 0.0000857f;

        // we use the Adobe (RGB-HSV hue-stabilized) curve to decide luminance, which generally leads to a less contrasty result
        // compared to a pure luminance curve. We do this to be more compatible with the most popular curves.
-    float oldLuminance = r * Yr + g * Yg + b * Yb;
-    float newLuminance = ar * Yr + ag * Yg + ab * Yb;
-    float Lcoef = newLuminance / oldLuminance;
+        const float oldLuminance = r * Yr + g * Yg + b * Yb;
+        const float newLuminance = ar * Yr + ag * Yg + ab * Yb;
+        const float Lcoef = newLuminance / oldLuminance;
        r = LIM<float>(r * Lcoef, 0.f, 65535.f);
        g = LIM<float>(g * Lcoef, 0.f, 65535.f);
        b = LIM<float>(b * Lcoef, 0.f, 65535.f);

        // move to JCh so we can modulate chroma based on the global contrast-related chroma scaling factor
+        float x, y, z;
        Color::Prophotoxyz(r, g, b, x, y, z);

        float J, C, h;
@ -1879,7 +1884,7 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv


        if (!isfinite(J) || !isfinite(C) || !isfinite(h)) {
-        // this can happen for dark noise colors or colors outside human gamut. Then we just return the curve's result.
+            // this can happen for dark noise colours or colours outside human gamut. Then we just return the curve's result.
            if (!state.isProphoto) {
                float newr = state.Prophoto2Working[0][0] * r + state.Prophoto2Working[0][1] * g + state.Prophoto2Working[0][2] * b;
                float newg = state.Prophoto2Working[1][0] * r + state.Prophoto2Working[1][1] * g + state.Prophoto2Working[1][2] * b;
@ -1888,8 +1893,11 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
                g = newg;
                b = newb;
            }
+            rc[i] = r;
+            gc[i] = g;
+            bc[i] = b;

-        return;
+            continue;
        }

        float cmul = state.cmul_contrast; // chroma scaling factor
@ -1899,8 +1907,8 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
        {
            // decrease chroma scaling slightly for extremely saturated colors
            float saturated_scale_factor = 0.95f;
-        const float lolim = 35.f; // lower limit, below this chroma all colors will keep original chroma scaling factor
-        const float hilim = 60.f; // high limit, above this chroma the chroma scaling factor is multiplied with the saturated scale factor value above
+            constexpr float lolim = 35.f; // lower limit, below this chroma all colors will keep original chroma scaling factor
+            constexpr float hilim = 60.f; // high limit, above this chroma the chroma scaling factor is multiplied with the saturated scale factor value above

            if (C < lolim) {
                // chroma is low enough, don't scale
@ -1928,8 +1936,8 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
            float nL = Color::gamma2curve[newLuminance]; // apply gamma so we make comparison and transition with a more perceptual lightness scale
            float dark_scale_factor = 1.20f;
            //float dark_scale_factor = 1.0 + state.debug.p2 / 100.0f;
-        const float lolim = 0.15f;
-        const float hilim = 0.50f;
+            constexpr float lolim = 0.15f;
+            constexpr float hilim = 0.50f;

            if (nL < lolim) {
                // do nothing, keep scale factor
@ -1954,8 +1962,8 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
        {
            // to avoid strange CIECAM02 chroma errors on close-to-shadow-clipping colors we reduce chroma scaling towards 1.0 for black colors
            float dark_scale_factor = 1.f / cmul;
-        const float lolim = 4.f;
-        const float hilim = 7.f;
+            constexpr float lolim = 4.f;
+            constexpr float hilim = 7.f;

            if (J < lolim) {
                // do nothing, keep scale factor
@ -1985,7 +1993,7 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
                                         c, nc, 1, pow1, nbb, ncb, fl, cz, d, aw );

        if (!isfinite(x) || !isfinite(y) || !isfinite(z)) {
-        // can happen for colors on the rim of being outside gamut, that worked without chroma scaling but not with. Then we return only the curve's result.
+            // can happen for colours on the rim of being outside gamut, that worked without chroma scaling but not with. Then we return only the curve's result.
            if (!state.isProphoto) {
                float newr = state.Prophoto2Working[0][0] * r + state.Prophoto2Working[0][1] * g + state.Prophoto2Working[0][2] * b;
                float newg = state.Prophoto2Working[1][0] * r + state.Prophoto2Working[1][1] * g + state.Prophoto2Working[1][2] * b;
@ -1995,7 +2003,11 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
                b = newb;
            }

-        return;
+            rc[i] = r;
+            gc[i] = g;
+            bc[i] = b;
+
+            continue;
        }

        Color::xyz2Prophoto(x, y, z, r, g, b);
@ -2012,14 +2024,13 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
            // we use the RGB-HSV hue-stable "Adobe" curve as reference. For S-curve contrast it increases
            // saturation greatly, but desaturates extreme highlights and thus provides a smooth transition to
            // the white point. However the desaturation effect is quite strong so we make a weighting
-        float ah, as, av, h, s, v;
-        Color::rgb2hsv(ar, ag, ab, ah, as, av);
-        Color::rgb2hsv(r, g, b, h, s, v);
+            const float as = Color::rgb2s(ar, ag, ab);
+            const float s = Color::rgb2s(r, g, b);

-        float sat_scale = as <= 0.f ? 1.f : s / as; // saturation scale compared to Adobe curve
+            const float sat_scale = as <= 0.f ? 1.f : s / as; // saturation scale compared to Adobe curve
            float keep = 0.2f;
-        const float lolim = 1.00f; // only mix in the Adobe curve if we have increased saturation compared to it
-        const float hilim = 1.20f;
+            constexpr float lolim = 1.00f; // only mix in the Adobe curve if we have increased saturation compared to it
+            constexpr float hilim = 1.20f;

            if (sat_scale < lolim) {
                // saturation is low enough, don't desaturate
@ -2041,9 +2052,9 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv

            if (keep < 1.f) {
                // mix in some of the Adobe curve result
-            r = r * keep + (1.f - keep) * ar;
-            g = g * keep + (1.f - keep) * ag;
-            b = b * keep + (1.f - keep) * ab;
+                r = intp(keep, r, ar);
+                g = intp(keep, g, ag);
+                b = intp(keep, b, ab);
            }
        }

@ -2055,8 +2066,11 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
            g = newg;
            b = newb;
        }
+        rc[i] = r;
+        gc[i] = g;
+        bc[i] = b;
+    }
 }
-
 float PerceptualToneCurve::cf_range[2];
 float PerceptualToneCurve::cf[1000];
 float PerceptualToneCurve::f, PerceptualToneCurve::c, PerceptualToneCurve::nc, PerceptualToneCurve::yb, PerceptualToneCurve::la, PerceptualToneCurve::xw, PerceptualToneCurve::yw, PerceptualToneCurve::zw, PerceptualToneCurve::gamut;
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@ -872,7 +872,7 @@ private:
 public:
    static void init();
    void initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const;
-    void Apply(float& r, float& g, float& b, PerceptualToneCurveState & state) const;
+    void BatchApply(const size_t start, const size_t end, float *r, float *g, float *b, const PerceptualToneCurveState &state) const;
 };

 // Standard tone curve
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@ -233,14 +233,8 @@ void customToneCurve(const ToneCurve &customToneCurve, ToneCurveParams::TcMode c
        }
    } else if (curveMode == ToneCurveParams::TcMode::PERCEPTUAL) { // apply curve while keeping color appearance constant
        const PerceptualToneCurve& userToneCurve = static_cast<const PerceptualToneCurve&> (customToneCurve);
-
        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                rtemp[ti * tileSize + tj] = CLIP<float> (rtemp[ti * tileSize + tj]);
-                gtemp[ti * tileSize + tj] = CLIP<float> (gtemp[ti * tileSize + tj]);
-                btemp[ti * tileSize + tj] = CLIP<float> (btemp[ti * tileSize + tj]);
-                userToneCurve.Apply(rtemp[ti * tileSize + tj], gtemp[ti * tileSize + tj], btemp[ti * tileSize + tj], ptcApplyState);
-            }
+            userToneCurve.BatchApply(0, tW - jstart, &rtemp[ti * tileSize], &gtemp[ti * tileSize], &btemp[ti * tileSize], ptcApplyState);
        }
    }
 }
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@ -14,6 +14,7 @@ constexpr double MAXVALD = static_cast<double>(MAXVAL); // double version of MAX

 constexpr double RT_PI = 3.14159265358979323846; // pi
 constexpr double RT_PI_2 = 1.57079632679489661923; // pi/2
+constexpr double RT_PI_180 = 0.017453292519943295769; // pi/180
 constexpr double RT_1_PI = 0.31830988618379067154; // 1/pi
 constexpr double RT_2_PI = 0.63661977236758134308; // 2/pi
 constexpr double RT_SQRT1_2 = 0.70710678118654752440; // 1/sqrt(2)
@ -23,6 +24,7 @@ constexpr double RT_NAN = std::numeric_limits<double>::quiet_NaN();

 constexpr float RT_PI_F = RT_PI;
 constexpr float RT_PI_F_2 = RT_PI_2;
+constexpr float RT_PI_F_180 = RT_PI_180;

 constexpr float RT_INFINITY_F = std::numeric_limits<float>::infinity();
 constexpr float RT_NAN_F = std::numeric_limits<float>::quiet_NaN();