CIECAM02 speedup

2015-07-07 16:34:05 +02:00
parent b6f8bc675b
commit 1008e0e98d
7 changed files with 555 additions and 86 deletions
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@@ -20,10 +20,12 @@
 #include "rtengine.h"
 #include "curves.h"
 #include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include "sleef.c"
+
+#ifdef _DEBUG
 #include "settings.h"
+#include <stdio.h>
+#endif

 #undef CLIPD
 #define CLIPD(a) ((a)>0.0?((a)<1.0?(a):1.0):0.0)
@@ -33,7 +35,9 @@
 namespace rtengine
 {

+#ifdef _DEBUG
 extern const Settings* settings;
+#endif

 void Ciecam02::curvecolor(double satind, double satval, double &sres, double parsat)
 {
@@ -407,6 +411,15 @@ void Ciecam02::xyz_to_cat02float( float &r, float &g, float &b, float x, float y
        b = ( 0.0000f * x) + (0.0000f * y) + (1.0000f * z);
    }
 }
+#ifdef __SSE2__
+void Ciecam02::xyz_to_cat02float( vfloat &r, vfloat &g, vfloat &b, vfloat x, vfloat y, vfloat z )
+{
+    //gamut correction M.H.Brill S.Susstrunk
+    r = ( F2V(1.007245f) * x) + (F2V(0.011136f) * y) - (F2V(0.018381f) * z);//Changjun Li
+    g = (F2V(-0.318061f) * x) + (F2V(1.314589f) * y) + (F2V(0.003471f) * z);
+    b = z;
+}
+#endif

 void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g, double b, int gamu )
 {
@@ -425,6 +438,7 @@ void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g
    }
 }

+#ifndef __SSE2__
 void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b, int gamu )
 {
    gamu=1;
@@ -441,7 +455,15 @@ void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g
        z = ( 0.000000f * r) - (0.000000f * g) + (1.000000f * b);
    }
 }
-
+#else
+void Ciecam02::cat02_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
+{
+    //gamut correction M.H.Brill S.Susstrunk
+    x = ( F2V(0.99015849f) * r) - (F2V(0.00838772f)* g) + (F2V(0.018229217f) * b);//Changjun Li
+    y = ( F2V(0.239565979f) * r) + (F2V(0.758664642f) * g) + (F2V(0.001770137f)* b);
+    z = b;
+}
+#endif

 void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g, double b )
 {
@@ -450,12 +472,21 @@ void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g,
    z = b;
 }

+#ifndef __SSE2__
 void Ciecam02::hpe_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b )
 {
    x = (1.910197f * r) - (1.112124f * g) + (0.201908f * b);
    y = (0.370950f * r) + (0.629054f * g) - (0.000008f * b);
    z = b;
 }
+#else
+void Ciecam02::hpe_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
+{
+    x = (F2V(1.910197f) * r) - (F2V(1.112124f) * g) + (F2V(0.201908f) * b);
+    y = (F2V(0.370950f) * r) + (F2V(0.629054f) * g) - (F2V(0.000008f) * b);
+    z = b;
+}
+#endif

 void Ciecam02::cat02_to_hpe( double &rh, double &gh, double &bh, double r, double g, double b, int gamu )
 {
@@ -485,6 +516,16 @@ void Ciecam02::cat02_to_hpefloat( float &rh, float &gh, float &bh, float r, floa
    }
 }

+#ifdef __SSE2__
+void Ciecam02::cat02_to_hpefloat( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b)
+{
+    //Changjun Li
+    rh = ( F2V(0.550930835f) * r) + (F2V(0.519435987f)* g) - ( F2V(0.070356303f)* b);
+    gh = ( F2V(0.055954056f) * r) + (F2V(0.89973132f) * g) + (F2V(0.044315524f) * b);
+    bh = b;
+}
+#endif
+
 void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb )
 {
    double x = (A / nbb) + 0.305;
@@ -496,6 +537,8 @@ void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa,
    /*       c1              c6               c7       */
    b = (0.32787 * x) - (0.15681 * aa) - (4.49038 * bb);
 }
+
+#ifndef __SSE2__
 void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb )
 {
    float x = (A / nbb) + 0.305f;
@@ -507,6 +550,19 @@ void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa,
    /*       c1              c6               c7       */
    b = (0.32787f * x) - (0.15681f * aa) - (4.49038f * bb);
 }
+#else
+void Ciecam02::Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb )
+{
+    vfloat c1 = F2V(0.32787f) * ((A / nbb) + F2V(0.305f));
+
+    /*       c1              c2               c3       */
+    r = c1 + (F2V(0.32145f) * aa) + (F2V(0.20527f) * bb);
+    /*       c1              c4               c5       */
+    g = c1 - (F2V(0.63507f) * aa) - (F2V(0.18603f) * bb);
+    /*       c1              c6               c7       */
+    b = c1 - (F2V(0.15681f) * aa) - (F2V(4.49038f) * bb);
+}
+#endif

 void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a )
 {
@@ -535,7 +591,7 @@ void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double
        bb = (aa * sinh) / cosh;
    }
 }
-
+#ifndef __SSE2__
 void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a )
 {
    float2 sincosval = xsincosf((h * M_PI) / 180.0f);
@@ -565,6 +621,34 @@ void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float
        std::swap(aa,bb);
    }
 }
+#else
+void Ciecam02::calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a )
+{
+    vfloat2 sincosval = xsincosf((h * F2V(M_PI)) / F2V(180.0f));
+    vfloat sinh = sincosval.x;
+    vfloat cosh = sincosval.y;
+    vfloat x = (a / nbb) + F2V(0.305f);
+    vfloat p3 = F2V(1.05f);
+    vmask swapMask = vmaskf_gt(vabsf(sinh), vabsf(cosh));
+    vswap(swapMask, sinh, cosh);
+
+    vfloat div = ((e / (t * cosh)) - (F2V(-0.31362f) - (p3 * F2V(0.15681f))) - ((F2V(0.01924f) - (p3 * F2V(4.49038f))) * (sinh / cosh)));
+    // for large values of t the above calculation can change its sign which results in a hue shift of 180 degree
+    // so we have to check the sign to avoid this shift.
+    // Additionally it seems useful to limit the minimum value of div
+    // I limited it, but I'm sure the actual limit is not the best one
+
+    vmask limitMask = vmaskf_neq(vsignf(div), vsignf(cosh));
+    limitMask = vorm(limitMask, vmaskf_le(vabsf(div), vabsf(cosh) * F2V(2.f)));
+    div = vself(limitMask, cosh * F2V(2.f), div);
+
+    aa = ((F2V(0.32787f) * x) * (F2V(2.0f) + p3)) / div;
+    bb = (aa * sinh) / cosh;
+
+    vswap(swapMask, aa, bb);
+}
+
+#endif

 void Ciecam02::initcam1(double gamu, double yb, double pilotd, double f, double la, double xw, double yw, double zw, double &n, double &d, double &nbb, double &ncb,
                        double &cz, double &aw, double &wh, double &pfl, double &fl, double &c)
@@ -758,6 +842,63 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
    s = 100.0f * sqrtf( M / Q );
    h = (myh * 180.f) / (float)M_PI;
 }
+#ifdef __SSE2__
+void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
+        vfloat x, vfloat y, vfloat z, vfloat xw, vfloat yw, vfloat zw,
+        vfloat yb, vfloat la, vfloat f, vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)
+
+{
+    vfloat r, g, b;
+    vfloat rw, gw, bw;
+    vfloat rc, gc, bc;
+    vfloat rp, gp, bp;
+    vfloat rpa, gpa, bpa;
+    vfloat a, ca, cb;
+    vfloat e, t;
+
+    xyz_to_cat02float( r, g, b, x, y, z);
+    xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
+    vfloat onev = F2V(1.f);
+    rc = r * (((yw * d) / rw) + (onev - d));
+    gc = g * (((yw * d) / gw) + (onev - d));
+    bc = b * (((yw * d) / bw) + (onev - d));
+
+    cat02_to_hpefloat( rp, gp, bp, rc, gc, bc);
+    //gamut correction M.H.Brill S.Susstrunk
+    rp = _mm_max_ps(rp,ZEROV);
+    gp = _mm_max_ps(gp,ZEROV);
+    bp = _mm_max_ps(bp,ZEROV);
+    rpa = nonlinear_adaptationfloat( rp, fl );
+    gpa = nonlinear_adaptationfloat( gp, fl );
+    bpa = nonlinear_adaptationfloat( bp, fl );
+
+    ca = rpa - ((F2V(12.0f) * gpa) - bpa) / F2V(11.0f);
+    cb = F2V(0.11111111f) * (rpa + gpa - (bpa + bpa));
+
+    vfloat myh = xatan2f( cb, ca );
+    vfloat temp = F2V(M_PI);
+    temp += temp;
+    temp += myh;
+    myh = vself(vmaskf_lt(myh, ZEROV), temp, myh);
+
+    a = ((rpa + rpa) + gpa + (F2V(0.05f) * bpa) - F2V(0.305f)) * nbb;
+    a = _mm_max_ps(a,ZEROV);    //gamut correction M.H.Brill S.Susstrunk
+
+    J = pow_F( a / aw, c * cz * F2V(0.5f));
+
+    e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( myh + F2V(2.0f) ) + F2V(3.8f));
+    t = (e * _mm_sqrt_ps( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V(1.05f) * bpa));
+
+    C = pow_F( t, F2V(0.9f) ) * J * pow1;
+
+    Q = wh * J;
+    J *= J * F2V(100.0f);
+    M = C * pfl;
+    Q = _mm_max_ps(Q,F2V(0.0001f)); // avoid division by zero
+    s = F2V(100.0f) * _mm_sqrt_ps( M / Q );
+    h = (myh * F2V(180.f)) / F2V(M_PI);
+}
+#endif

 void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, double C, double h,
                                 double xw, double yw, double zw, double yb, double la,
@@ -792,7 +933,7 @@ void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, doub

    cat02_to_xyz( x, y, z, r, g, b, gamu );
 }
-
+#ifndef __SSE2__
 void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, float C, float h,
                                      float xw, float yw, float zw, float yb, float la,
                                      float f, float c, float nc , int gamu, float pow1, float nbb, float ncb, float fl, float cz, float d, float aw)
@@ -827,6 +968,41 @@ void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, flo
    cat02_to_xyzfloat( x, y, z, r, g, b, gamu );
 }

+#else
+void Ciecam02::jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z, vfloat J, vfloat C, vfloat h,
+                                      vfloat xw, vfloat yw, vfloat zw, vfloat yb, vfloat la,
+                                      vfloat f, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz)
+{
+    vfloat r, g, b;
+    vfloat rc, gc, bc;
+    vfloat rp, gp, bp;
+    vfloat rpa, gpa, bpa;
+    vfloat rw, gw, bw;
+    vfloat a, ca, cb;
+    vfloat e, t;
+    xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
+    e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( ((h * F2V(M_PI)) / F2V(180.0f)) + F2V(2.0f) ) + F2V(3.8f));
+    a = pow_F( J / F2V(100.0f), reccmcz ) * aw;
+    t = pow_F( F2V(10.f) * C / (_mm_sqrt_ps( J ) * pow1), F2V(1.1111111f) );
+
+    calculate_abfloat( ca, cb, h, e, t, nbb, a );
+    Aab_to_rgbfloat( rpa, gpa, bpa, a, ca, cb, nbb );
+
+    rp = inverse_nonlinear_adaptationfloat( rpa, fl );
+    gp = inverse_nonlinear_adaptationfloat( gpa, fl );
+    bp = inverse_nonlinear_adaptationfloat( bpa, fl );
+
+    hpe_to_xyzfloat( x, y, z, rp, gp, bp );
+    xyz_to_cat02float( rc, gc, bc, x, y, z );
+
+    r = rc / (((yw * d) / rw) + (F2V(1.0f) - d));
+    g = gc / (((yw * d) / gw) + (F2V(1.0f) - d));
+    b = bc / (((yw * d) / bw) + (F2V(1.0f) - d));
+
+    cat02_to_xyzfloat( x, y, z, r, g, b );
+}
+#endif
+
 double Ciecam02::nonlinear_adaptation( double c, double fl )
 {
    double p;
@@ -841,6 +1017,20 @@ float Ciecam02::nonlinear_adaptationfloat( float c, float fl )
    else {p = pow_F( (fl * c) / 100.0f, 0.42f ); return ((400.0f * p) / (27.13f + p)) + 0.1f;}
 }

+#ifdef __SSE2__
+vfloat Ciecam02::nonlinear_adaptationfloat( vfloat c, vfloat fl )
+{
+    vfloat c100 = F2V(100.f);
+    vfloat czd42 = F2V(0.42f);
+    vfloat c400 = vmulsignf(F2V(400.f),c);
+    fl = vmulsignf(fl,c);
+    vfloat p = pow_F( (fl * c) / c100, czd42 );
+    vfloat c27d13 = F2V(27.13);
+    vfloat czd1 = F2V(0.1f);
+    return ((c400 * p) / (c27d13 + p)) + czd1;
+}
+#endif
+
 double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
 {
    int c1;
@@ -849,6 +1039,7 @@ double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
    return c1*(100.0 / fl) * pow( (27.13 * fabs( c - 0.1 )) / (400.0 - fabs( c - 0.1 )), 1.0 / 0.42 );
 }

+#ifndef __SSE2__
 float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
 {
    c -= 0.1f;
@@ -863,6 +1054,16 @@ float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
    return (100.0f / fl) * pow_F( (27.13f * fabsf( c )) / (400.0f - fabsf( c )), 2.38095238f );
 }

+#else
+vfloat Ciecam02::inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl )
+{
+    c -= F2V(0.1f);
+    fl = vmulsignf(fl,c);
+    c = vabsf(c);
+    c = _mm_min_ps( c, F2V(399.99f));
+    return (F2V(100.0f) / fl) * pow_F( (F2V(27.13f) * c) / (F2V(400.0f) - c), F2V(2.38095238f) );
+}
+#endif
 //end CIECAM Billy Bigg

 }