CIECAM02 speedup

2015-07-07 16:34:05 +02:00 · 2015-07-07 16:34:05 +02:00 · 1008e0e98d
commit 1008e0e98d
parent b6f8bc675b
7 changed files with 555 additions and 86 deletions
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@ -20,10 +20,12 @@
 #include "rtengine.h"
 #include "curves.h"
 #include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include "sleef.c"
+
+#ifdef _DEBUG
 #include "settings.h"
+#include <stdio.h>
+#endif

 #undef CLIPD
 #define CLIPD(a) ((a)>0.0?((a)<1.0?(a):1.0):0.0)
@ -33,7 +35,9 @@
 namespace rtengine
 {

+#ifdef _DEBUG
 extern const Settings* settings;
+#endif

 void Ciecam02::curvecolor(double satind, double satval, double &sres, double parsat)
 {
@ -407,6 +411,15 @@ void Ciecam02::xyz_to_cat02float( float &r, float &g, float &b, float x, float y
        b = ( 0.0000f * x) + (0.0000f * y) + (1.0000f * z);
    }
 }
+#ifdef __SSE2__
+// SSE2 overload: applies the XYZ -> CAT02 matrix to each lane of the vfloat inputs.
+// Coefficients: Changjun Li, with the gamut correction by M.H.Brill / S.Susstrunk.
+void Ciecam02::xyz_to_cat02float( vfloat &r, vfloat &g, vfloat &b, vfloat x, vfloat y, vfloat z )
+{
+    // first matrix row
+    const vfloat rFromX = F2V(1.007245f) * x;
+    const vfloat rFromY = F2V(0.011136f) * y;
+    const vfloat rFromZ = F2V(0.018381f) * z;
+    r = (rFromX + rFromY) - rFromZ;
+
+    // second matrix row
+    const vfloat gFromX = F2V(-0.318061f) * x;
+    const vfloat gFromY = F2V(1.314589f) * y;
+    const vfloat gFromZ = F2V(0.003471f) * z;
+    g = (gFromX + gFromY) + gFromZ;
+
+    // third matrix row is (0, 0, 1)
+    b = z;
+}
+#endif

 void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g, double b, int gamu )
 {
@ -425,6 +438,7 @@ void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g
    }
 }

+#ifndef __SSE2__
 void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b, int gamu )
 {
    gamu=1;
@ -441,7 +455,15 @@ void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g
        z = ( 0.000000f * r) - (0.000000f * g) + (1.000000f * b);
    }
 }
-
+#else
+// SSE2 variant: applies the CAT02 -> XYZ matrix to each lane of the vfloat inputs.
+// Coefficients: Changjun Li, with the gamut correction by M.H.Brill / S.Susstrunk.
+void Ciecam02::cat02_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
+{
+    // first matrix row, accumulated term by term
+    vfloat xAcc = F2V(0.99015849f) * r;
+    xAcc -= F2V(0.00838772f) * g;
+    xAcc += F2V(0.018229217f) * b;
+    x = xAcc;
+
+    // second matrix row
+    vfloat yAcc = F2V(0.239565979f) * r;
+    yAcc += F2V(0.758664642f) * g;
+    yAcc += F2V(0.001770137f) * b;
+    y = yAcc;
+
+    // the third channel passes through unchanged
+    z = b;
+}
+#endif

 void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g, double b )
 {
@ -450,12 +472,21 @@ void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g,
    z = b;
 }

+#ifndef __SSE2__
 void Ciecam02::hpe_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b )
 {
    x = (1.910197f * r) - (1.112124f * g) + (0.201908f * b);
    y = (0.370950f * r) + (0.629054f * g) - (0.000008f * b);
    z = b;
 }
+#else
+// SSE2 variant: applies the HPE -> XYZ matrix to each lane of the vfloat inputs.
+void Ciecam02::hpe_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
+{
+    // first matrix row
+    const vfloat xFromR = F2V(1.910197f) * r;
+    const vfloat xFromG = F2V(1.112124f) * g;
+    const vfloat xFromB = F2V(0.201908f) * b;
+    x = (xFromR - xFromG) + xFromB;
+
+    // second matrix row
+    const vfloat yFromR = F2V(0.370950f) * r;
+    const vfloat yFromG = F2V(0.629054f) * g;
+    const vfloat yFromB = F2V(0.000008f) * b;
+    y = (yFromR + yFromG) - yFromB;
+
+    // the third channel passes through unchanged
+    z = b;
+}
+#endif

 void Ciecam02::cat02_to_hpe( double &rh, double &gh, double &bh, double r, double g, double b, int gamu )
 {
@ -485,6 +516,16 @@ void Ciecam02::cat02_to_hpefloat( float &rh, float &gh, float &bh, float r, floa
    }
 }

+#ifdef __SSE2__
+// SSE2 overload: applies the CAT02 -> HPE matrix (coefficients: Changjun Li) to each lane of the vfloat inputs.
+void Ciecam02::cat02_to_hpefloat( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b)
+{
+    // first matrix row, accumulated term by term
+    vfloat rhAcc = F2V(0.550930835f) * r;
+    rhAcc += F2V(0.519435987f) * g;
+    rhAcc -= F2V(0.070356303f) * b;
+    rh = rhAcc;
+
+    // second matrix row
+    vfloat ghAcc = F2V(0.055954056f) * r;
+    ghAcc += F2V(0.89973132f) * g;
+    ghAcc += F2V(0.044315524f) * b;
+    gh = ghAcc;
+
+    // the third channel passes through unchanged
+    bh = b;
+}
+#endif
+
 void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb )
 {
    double x = (A / nbb) + 0.305;
@ -496,6 +537,8 @@ void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa,
    /*       c1              c6               c7       */
    b = (0.32787 * x) - (0.15681 * aa) - (4.49038 * bb);
 }
+
+#ifndef __SSE2__
 void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb )
 {
    float x = (A / nbb) + 0.305f;
@ -507,6 +550,19 @@ void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa,
    /*       c1              c6               c7       */
    b = (0.32787f * x) - (0.15681f * aa) - (4.49038f * bb);
 }
+#else
+// SSE2 variant: computes r, g, b from A, aa, bb and nbb for each lane of the vfloat inputs.
+// All three channels share the same A-dependent term, so it is computed once.
+void Ciecam02::Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb )
+{
+    const vfloat shiftedA = (A / nbb) + F2V(0.305f);
+    const vfloat common = F2V(0.32787f) * shiftedA;
+
+    // r = common + c2 * aa + c3 * bb
+    const vfloat rAa = F2V(0.32145f) * aa;
+    const vfloat rBb = F2V(0.20527f) * bb;
+    r = (common + rAa) + rBb;
+
+    // g = common - c4 * aa - c5 * bb
+    const vfloat gAa = F2V(0.63507f) * aa;
+    const vfloat gBb = F2V(0.18603f) * bb;
+    g = (common - gAa) - gBb;
+
+    // b = common - c6 * aa - c7 * bb
+    const vfloat bAa = F2V(0.15681f) * aa;
+    const vfloat bBb = F2V(4.49038f) * bb;
+    b = (common - bAa) - bBb;
+}
+#endif

 void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a )
 {
@ -535,7 +591,7 @@ void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double
        bb = (aa * sinh) / cosh;
    }
 }
-
+#ifndef __SSE2__
 void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a )
 {
    float2 sincosval = xsincosf((h * M_PI) / 180.0f);
@ -565,6 +621,34 @@ void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float
        std::swap(aa,bb);
    }
 }
+#else
+// SSE2 variant of calculate_abfloat: computes aa and bb from the hue h (in degrees), e, t, nbb and a
+// for each lane of the vfloat inputs. Branches are replaced by masks and conditional swaps.
+void Ciecam02::calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a )
+{
+    // h is converted from degrees to radians before taking sine and cosine
+    vfloat2 sincosval = xsincosf((h * F2V(M_PI)) / F2V(180.0f));
+    vfloat sinh = sincosval.x;
+    vfloat cosh = sincosval.y;
+    vfloat x = (a / nbb) + F2V(0.305f);
+    vfloat p3 = F2V(1.05f);
+    // in lanes where |sinh| > |cosh| the two are exchanged, so cosh (the divisor below) holds the larger magnitude
+    vmask swapMask = vmaskf_gt(vabsf(sinh), vabsf(cosh));
+    vswap(swapMask, sinh, cosh);
+
+    vfloat div = ((e / (t * cosh)) - (F2V(-0.31362f) - (p3 * F2V(0.15681f))) - ((F2V(0.01924f) - (p3 * F2V(4.49038f))) * (sinh / cosh)));
+    // for large values of t the above calculation can change its sign which results in a hue shift of 180 degree
+    // so we have to check the sign to avoid this shift.
+    // Additionally it seems useful to limit the minimum value of div
+    // I limited it, but I'm sure the actual limit is not the best one
+
+    // lanes whose div has a different sign than cosh, or whose |div| <= 2 * |cosh|, get div = 2 * cosh
+    vmask limitMask = vmaskf_neq(vsignf(div), vsignf(cosh));
+    limitMask = vorm(limitMask, vmaskf_le(vabsf(div), vabsf(cosh) * F2V(2.f)));
+    div = vself(limitMask, cosh * F2V(2.f), div);
+
+    aa = ((F2V(0.32787f) * x) * (F2V(2.0f) + p3)) / div;
+    bb = (aa * sinh) / cosh;
+
+    // lanes that had sinh and cosh exchanged above get aa and bb exchanged as well
+    vswap(swapMask, aa, bb);
+}
+
+#endif

 void Ciecam02::initcam1(double gamu, double yb, double pilotd, double f, double la, double xw, double yw, double zw, double &n, double &d, double &nbb, double &ncb,
                        double &cz, double &aw, double &wh, double &pfl, double &fl, double &c)
@ -758,6 +842,63 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
    s = 100.0f * sqrtf( M / Q );
    h = (myh * 180.f) / (float)M_PI;
 }
+#ifdef __SSE2__
+// SSE2 overload of xyz2jchqms_ciecam02float: computes the outputs J, C, h, Q, M, s from x, y, z
+// for each lane of the vfloat inputs. h is returned in degrees.
+// The parameters yb, la and f are not used in this body.
+void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
+        vfloat x, vfloat y, vfloat z, vfloat xw, vfloat yw, vfloat zw,
+        vfloat yb, vfloat la, vfloat f, vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)
+
+{
+    vfloat r, g, b;
+    vfloat rw, gw, bw;
+    vfloat rc, gc, bc;
+    vfloat rp, gp, bp;
+    vfloat rpa, gpa, bpa;
+    vfloat a, ca, cb;
+    vfloat e, t;
+
+    // convert both the sample and the white point (xw, yw, zw) to CAT02
+    xyz_to_cat02float( r, g, b, x, y, z);
+    xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
+    vfloat onev = F2V(1.f);
+    // scale each channel by a factor built from the white point and d
+    rc = r * (((yw * d) / rw) + (onev - d));
+    gc = g * (((yw * d) / gw) + (onev - d));
+    bc = b * (((yw * d) / bw) + (onev - d));
+
+    cat02_to_hpefloat( rp, gp, bp, rc, gc, bc);
+    //gamut correction M.H.Brill S.Susstrunk
+    // negative values are clamped to zero
+    rp = _mm_max_ps(rp,ZEROV);
+    gp = _mm_max_ps(gp,ZEROV);
+    bp = _mm_max_ps(bp,ZEROV);
+    rpa = nonlinear_adaptationfloat( rp, fl );
+    gpa = nonlinear_adaptationfloat( gp, fl );
+    bpa = nonlinear_adaptationfloat( bp, fl );
+
+    ca = rpa - ((F2V(12.0f) * gpa) - bpa) / F2V(11.0f);
+    cb = F2V(0.11111111f) * (rpa + gpa - (bpa + bpa));
+
+    // hue angle in radians; lanes with a negative angle get 2*pi added
+    vfloat myh = xatan2f( cb, ca );
+    vfloat temp = F2V(M_PI);
+    temp += temp;
+    temp += myh;
+    myh = vself(vmaskf_lt(myh, ZEROV), temp, myh);
+
+    a = ((rpa + rpa) + gpa + (F2V(0.05f) * bpa) - F2V(0.305f)) * nbb;
+    a = _mm_max_ps(a,ZEROV);    //gamut correction M.H.Brill S.Susstrunk
+
+    // J temporarily holds (a / aw)^(c * cz / 2); it is squared and scaled by 100 further below
+    J = pow_F( a / aw, c * cz * F2V(0.5f));
+
+    e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( myh + F2V(2.0f) ) + F2V(3.8f));
+    t = (e * _mm_sqrt_ps( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V(1.05f) * bpa));
+
+    C = pow_F( t, F2V(0.9f) ) * J * pow1;
+
+    // C and Q are computed from the not yet squared J
+    Q = wh * J;
+    J *= J * F2V(100.0f);
+    M = C * pfl;
+    Q = _mm_max_ps(Q,F2V(0.0001f)); // avoid division by zero
+    s = F2V(100.0f) * _mm_sqrt_ps( M / Q );
+    // convert the hue angle from radians to degrees
+    h = (myh * F2V(180.f)) / F2V(M_PI);
+}
+#endif

 void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, double C, double h,
                                 double xw, double yw, double zw, double yb, double la,
@ -792,7 +933,7 @@ void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, doub

    cat02_to_xyz( x, y, z, r, g, b, gamu );
 }
-
+#ifndef __SSE2__
 void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, float C, float h,
                                      float xw, float yw, float zw, float yb, float la,
                                      float f, float c, float nc , int gamu, float pow1, float nbb, float ncb, float fl, float cz, float d, float aw)
@ -827,6 +968,41 @@ void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, flo
    cat02_to_xyzfloat( x, y, z, r, g, b, gamu );
 }

+#else
+// SSE2 variant of jch2xyz_ciecam02float: computes x, y, z from J, C and h (h in degrees)
+// for each lane of the vfloat inputs.
+// reccmcz is used as the exponent applied to J / 100.
+// The parameters yb, la and f are not used in this body.
+void Ciecam02::jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z, vfloat J, vfloat C, vfloat h,
+                                      vfloat xw, vfloat yw, vfloat zw, vfloat yb, vfloat la,
+                                      vfloat f, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz)
+{
+    vfloat r, g, b;
+    vfloat rc, gc, bc;
+    vfloat rp, gp, bp;
+    vfloat rpa, gpa, bpa;
+    vfloat rw, gw, bw;
+    vfloat a, ca, cb;
+    vfloat e, t;
+    // white point (xw, yw, zw) in CAT02
+    xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
+    // h is converted from degrees to radians inside the cosine argument
+    e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( ((h * F2V(M_PI)) / F2V(180.0f)) + F2V(2.0f) ) + F2V(3.8f));
+    a = pow_F( J / F2V(100.0f), reccmcz ) * aw;
+    t = pow_F( F2V(10.f) * C / (_mm_sqrt_ps( J ) * pow1), F2V(1.1111111f) );
+
+    calculate_abfloat( ca, cb, h, e, t, nbb, a );
+    Aab_to_rgbfloat( rpa, gpa, bpa, a, ca, cb, nbb );
+
+    rp = inverse_nonlinear_adaptationfloat( rpa, fl );
+    gp = inverse_nonlinear_adaptationfloat( gpa, fl );
+    bp = inverse_nonlinear_adaptationfloat( bpa, fl );
+
+    // x, y, z are used as intermediate storage here and overwritten by the final conversion below
+    hpe_to_xyzfloat( x, y, z, rp, gp, bp );
+    xyz_to_cat02float( rc, gc, bc, x, y, z );
+
+    // divide each channel by a factor built from the white point and d
+    r = rc / (((yw * d) / rw) + (F2V(1.0f) - d));
+    g = gc / (((yw * d) / gw) + (F2V(1.0f) - d));
+    b = bc / (((yw * d) / bw) + (F2V(1.0f) - d));
+
+    cat02_to_xyzfloat( x, y, z, r, g, b );
+}
+#endif
+
 double Ciecam02::nonlinear_adaptation( double c, double fl )
 {
    double p;
@ -841,6 +1017,20 @@ float Ciecam02::nonlinear_adaptationfloat( float c, float fl )
    else {p = pow_F( (fl * c) / 100.0f, 0.42f ); return ((400.0f * p) / (27.13f + p)) + 0.1f;}
 }

+#ifdef __SSE2__
+// SSE2 overload of nonlinear_adaptationfloat: evaluates 400 * p / (27.13 + p) + 0.1 with
+// p = (fl * c / 100)^0.42 for each lane of the vfloat inputs.
+// The scalar version branches on the sign of c; here vmulsignf is applied to 400 and to fl instead
+// (presumably it transfers the sign of c to its first argument - confirm against the sleef helpers).
+vfloat Ciecam02::nonlinear_adaptationfloat( vfloat c, vfloat fl )
+{
+    vfloat c100 = F2V(100.f);
+    vfloat czd42 = F2V(0.42f);
+    vfloat c400 = vmulsignf(F2V(400.f),c);
+    fl = vmulsignf(fl,c);
+    vfloat p = pow_F( (fl * c) / c100, czd42 );
+    // float literal, consistent with every other constant passed to F2V
+    vfloat c27d13 = F2V(27.13f);
+    vfloat czd1 = F2V(0.1f);
+    return ((c400 * p) / (c27d13 + p)) + czd1;
+}
+#endif
+
 double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
 {
    int c1;
@ -849,6 +1039,7 @@ double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
    return c1*(100.0 / fl) * pow( (27.13 * fabs( c - 0.1 )) / (400.0 - fabs( c - 0.1 )), 1.0 / 0.42 );
 }

+#ifndef __SSE2__
 float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
 {
    c -= 0.1f;
@ -863,6 +1054,16 @@ float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
    return (100.0f / fl) * pow_F( (27.13f * fabsf( c )) / (400.0f - fabsf( c )), 2.38095238f );
 }

+#else
+// SSE2 variant of inverse_nonlinear_adaptationfloat, evaluated for each lane of the vfloat inputs.
+vfloat Ciecam02::inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl )
+{
+    const vfloat shifted = c - F2V(0.1f);
+    const vfloat signedFl = vmulsignf(fl, shifted);
+    // the magnitude is limited to 399.99 so that the denominator (400 - magnitude) stays positive
+    const vfloat magnitude = _mm_min_ps( vabsf(shifted), F2V(399.99f));
+    const vfloat ratio = (F2V(27.13f) * magnitude) / (F2V(400.0f) - magnitude);
+    // 2.38095238 = 1 / 0.42
+    return (F2V(100.0f) / signedFl) * pow_F( ratio, F2V(2.38095238f) );
+}
+#endif
 //end CIECAM Billy Bigg

 }
--- a/rtengine/ciecam02.h
+++ b/rtengine/ciecam02.h
@ -20,6 +20,7 @@
 #define _CIECAM02_
 #include <cmath>
 #include "LUT.h"
+#include "opthelper.h"

 namespace rtengine
 {
@ -40,18 +41,33 @@ private:

    static void xyz_to_cat02float ( float &r,  float &g,  float &b,  float x, float y, float z, int gamu );
    static void cat02_to_hpefloat ( float &rh, float &gh, float &bh, float r, float g, float b, int gamu );
-    static void cat02_to_xyzfloat ( float &x,  float &y,  float &z,  float r, float g, float b, int gamu );
-    static void hpe_to_xyzfloat   ( float &x,  float &y,  float &z,  float r, float g, float b );
+
+#ifdef __SSE2__
+    // vfloat overloads for SSE2 builds; unlike the scalar versions they take no gamu parameter
+    static void xyz_to_cat02float ( vfloat &r,  vfloat &g,  vfloat &b,  vfloat x, vfloat y, vfloat z );
+    static void cat02_to_hpefloat ( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b );
+    static vfloat nonlinear_adaptationfloat( vfloat c, vfloat fl );
+#endif

    static void Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb );
-    static void Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb );
    static void calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a );
-    static void calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a );

    static double nonlinear_adaptation( double c, double fl );
    static float nonlinear_adaptationfloat( float c, float fl );
    static double inverse_nonlinear_adaptation( double c, double fl );
+
+#ifndef __SSE2__
    static float inverse_nonlinear_adaptationfloat( float c, float fl );
+    static void calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a );
+    static void Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb );
+    static void hpe_to_xyzfloat   ( float &x,  float &y,  float &z,  float r, float g, float b );
+    static void cat02_to_xyzfloat ( float &x,  float &y,  float &z,  float r, float g, float b, int gamu );
+#else
+    // SSE2 builds: these helpers exist only as vfloat versions (the float versions are compiled out)
+    static vfloat inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl );
+    static void calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a );
+    static void Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb );
+    static void hpe_to_xyzfloat   ( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b );
+    static void cat02_to_xyzfloat ( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b );
+#endif

 public:
    Ciecam02 () {}
@ -69,12 +85,19 @@ public:
                                  double yb, double la,
                                  double f, double c, double nc, int gamu, double n, double nbb, double ncb, double fl, double cz, double d, double aw);

+#ifndef __SSE2__
    static void jch2xyz_ciecam02float( float &x, float &y, float &z,
                                       float J, float C, float h,
                                       float xw, float yw, float zw,
                                       float yb, float la,
                                       float f, float c, float nc,int gamu,float n, float nbb, float ncb, float fl, float cz, float d, float aw );
-
+#else
+    // SSE2 builds: vfloat version of jch2xyz_ciecam02float. Compared with the float version it has
+    // no c, gamu and cz parameters and takes reccmcz instead.
+    // NOTE: the parameter called n here is named pow1 in the definition.
+    static void jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z,
+                                       vfloat J, vfloat C, vfloat h,
+                                       vfloat xw, vfloat yw, vfloat zw,
+                                       vfloat yb, vfloat la,
+                                       vfloat f, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz );
+#endif
    /**
     * Forward transform from XYZ to CIECAM02 JCh.
     */
@ -104,6 +127,16 @@ public:
                                          float yb, float la,
                                          float f, float c, float nc,  float pilotd, int gamu, float n, float nbb, float ncb, float pfl, float cz, float d  );

+#ifdef __SSE2__
+    // vfloat overload of xyz2jchqms_ciecam02float for SSE2 builds; it has no pilotd and gamu parameters.
+    // NOTE: the parameter called n here is named pow1 in the definition.
+    static void xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h,
+                                          vfloat &Q, vfloat &M, vfloat &s,vfloat aw, vfloat fl, vfloat wh,
+                                          vfloat x, vfloat y, vfloat z,
+                                          vfloat xw, vfloat yw, vfloat zw,
+                                          vfloat yb, vfloat la,
+                                          vfloat f, vfloat c, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d  );
+
+
+#endif

 };
 }
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@ -859,6 +859,25 @@ namespace rtengine {
        y=(LL>epskap) ? 65535.0f*fy*fy*fy : 65535.0f*LL/kappa;
    }

+#ifdef __SSE2__
+    // SSE2 overload of Lab2XYZ: converts L, a, b to x, y, z for each lane of the vfloat inputs.
+    void Color::Lab2XYZ(vfloat L, vfloat a, vfloat b, vfloat &x, vfloat &y, vfloat &z) {
+        const vfloat labScale = F2V(327.68f);
+        const vfloat xyzScale = F2V(65535.f);
+
+        // the inputs are divided by 327.68 before use
+        const vfloat LL = L / labScale;
+        const vfloat aa = a / labScale;
+        const vfloat bb = b / labScale;
+
+        const vfloat fy = F2V(0.00862069f) * LL + F2V(0.137932f);
+        const vfloat fx = F2V(0.002f) * aa + fy;
+        const vfloat fz = fy - (F2V(0.005f) * bb);
+
+        x = xyzScale * f2xyz(fx) * F2V(D50x);
+        z = xyzScale * f2xyz(fz) * F2V(D50z);
+
+        // y: fy^3 in lanes where LL > epskap, LL / kappa in the other lanes
+        const vfloat cubed = fy * fy * fy;
+        const vfloat linear = LL / F2V(kappa);
+        y = vself(vmaskf_gt(LL, F2V(epskap)), cubed, linear) * xyzScale;
+    }
+#endif // __SSE2__
+
    void Color::XYZ2Lab(float X, float Y, float Z, float &L, float &a, float &b) {

        float x = X/D50x;
--- a/rtengine/color.h
+++ b/rtengine/color.h
@ -316,6 +316,9 @@ public:
 	*/
 	static void Lab2XYZ(float L, float a, float b, float &x, float &y, float &z);

+#ifdef __SSE2__
+    /**
+    * @brief Convert Lab in xyz, vfloat overload for SSE2 builds
+    */
+    static void Lab2XYZ(vfloat L, vfloat a, vfloat b, vfloat &x, vfloat &y, vfloat &z);
+#endif // __SSE2__

 	/**
 	* @brief Convert xyz in Lab
@ -439,7 +442,15 @@ public:

 		return (f > epsilonExpInv3) ? f*f*f : (116.f * f - 16.f) * kappaInv;
 	}
-
+#ifdef __SSE2__
+	// vfloat overload of f2xyz for SSE2 builds: per lane, returns f^3 where f > 6/29
+	// and (116 * f - 16) / kappa in the other lanes.
+	static inline vfloat f2xyz(vfloat f) {
+		const vfloat epsilonExpInv3 = F2V(0.20689655f); // 6.0f/29.0f;
+		const vfloat kappaInv = F2V(0.0011070565f); // 27.0f/24389.0f;  // inverse of kappa
+		vfloat res1 = f*f*f;
+		// scalars are broadcast explicitly with F2V instead of relying on scalar * vector arithmetic
+		vfloat res2 = (F2V(116.f) * f - F2V(16.f)) * kappaInv;
+		return vself(vmaskf_gt(f, epsilonExpInv3), res1, res2);
+	}
+#endif

 	/**
 	 * @brief Calculate the effective direction (up or down) to linearly interpolating 2 colors so that it follows the shortest or longest path
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@ -64,6 +64,7 @@ typedef __m128i vint2;
 #endif

 #define ZEROV _mm_setzero_ps()
+// F2V(a): vector with all lanes set to the float a
+#define F2V(a) _mm_set1_ps((a))

 static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
@ -126,6 +127,7 @@ static INLINE vmask vandm(vmask x, vmask y) { return _mm_and_si128(x, y); }
 static INLINE vmask vandnotm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
 static INLINE vmask vorm(vmask x, vmask y) { return _mm_or_si128(x, y); }
 static INLINE vmask vxorm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+// bitwise NOT of a mask: XOR with an all-ones vector (comparing a zero vector with itself for equality sets all bits)
+static INLINE vmask vnotm(vmask x) { return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); }

 static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); }
 static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); }
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@ -1418,6 +1418,7 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
 	const float pow1 = pow_F( 1.64f - pow_F( 0.29f, n ), 0.73f );
 	float nj,dj,nbbj,ncbj,czj,awj,flj;
 	Ciecam02::initcam2float(gamu,yb2, f2,  la2,  xw2,  yw2,  zw2, nj, dj, nbbj, ncbj,czj, awj, flj);
+	const float reccmcz = 1.f / (c2*czj);
 	const float pow1n = pow_F( 1.64f - pow_F( 0.29f, nj ), 0.73f );

 	const float epsil=0.0001f;
@ -1440,29 +1441,97 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
 		{wiprof[2][0],wiprof[2][1],wiprof[2][2]}
 	};
 	
-	
+#ifdef __SSE2__
+    int bufferLength = ((width+3)/4) * 4; // bufferLength has to be a multiple of 4
+#endif
 #ifndef _DEBUG	
 #pragma omp parallel
 #endif
 {	
 	float minQThr = 10000.f;
 	float maxQThr = -1000.f;
-#ifndef _DEBUG
-#pragma omp for schedule(dynamic, 10)
+#ifdef __SSE2__
+    // one line buffer per channel and thread
+	float Jbuffer[bufferLength] ALIGNED16;
+	float Cbuffer[bufferLength] ALIGNED16;
+	float hbuffer[bufferLength] ALIGNED16;
+	float Qbuffer[bufferLength] ALIGNED16;
+	float Mbuffer[bufferLength] ALIGNED16;
+	float sbuffer[bufferLength] ALIGNED16;
 #endif
-	for (int i=0; i<height; i++)
-		for (int j=0; j<width; j++) {
+#ifndef _DEBUG
+#pragma omp for schedule(dynamic, 16)
+#endif
+	for (int i=0; i<height; i++) {
+#ifdef __SSE2__
+        // vectorized conversion from Lab to jchqms
+        int k;
+        vfloat x,y,z;
+		vfloat J, C, h, Q, M, s;

+        vfloat c655d35 = F2V(655.35f);
+        for(k=0;k<width-3;k+=4) {
+            Color::Lab2XYZ(LVFU(lab->L[i][k]),LVFU(lab->a[i][k]),LVFU(lab->b[i][k]),x,y,z);
+            x = x/c655d35;
+            y = y/c655d35;
+            z = z/c655d35;
+			Ciecam02::xyz2jchqms_ciecam02float( J, C,  h,
+                           Q,  M,  s, F2V(aw), F2V(fl), F2V(wh),
+                           x,  y,  z,
+                           F2V(xw1), F2V(yw1),  F2V(zw1),
+                           F2V(yb),  F2V(la),
+                           F2V(f), F2V(c),  F2V(nc), F2V(pow1), F2V(nbb), F2V(ncb), F2V(pfl), F2V(cz), F2V(d));
+            STVF(Jbuffer[k],J);
+            STVF(Cbuffer[k],C);
+            STVF(hbuffer[k],h);
+            STVF(Qbuffer[k],Q);
+            STVF(Mbuffer[k],M);
+            STVF(sbuffer[k],s);
+        }
+        for(;k<width;k++) {
+			float L=lab->L[i][k];
+			float a=lab->a[i][k];
+			float b=lab->b[i][k];
+			float x,y,z;
+			//convert Lab => XYZ
+			Color::Lab2XYZ(L, a, b, x, y, z);
+			x = x/655.35f;
+			y = y/655.35f;
+			z = z/655.35f;
+			float J, C, h, Q, M, s;
+			Ciecam02::xyz2jchqms_ciecam02float( J, C,  h,
+                           Q,  M,  s, aw, fl, wh,
+                           x,  y,  z,
+                           xw1, yw1,  zw1,
+                           yb,  la,
+                           f, c,  nc,  pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
+            Jbuffer[k] = J;
+            Cbuffer[k] = C;
+            hbuffer[k] = h;
+            Qbuffer[k] = Q;
+            Mbuffer[k] = M;
+            sbuffer[k] = s;
+        }
+#endif // __SSE2__
+		for (int j=0; j<width; j++) {
+			float J, C, h, Q, M, s;
+			
+#ifdef __SSE2__
+            // use precomputed values from above
+            J = Jbuffer[j];
+            C = Cbuffer[j];
+            h = hbuffer[j];
+            Q = Qbuffer[j];
+            M = Mbuffer[j];
+            s = sbuffer[j];
+#else
+			float x,y,z;
 			float L=lab->L[i][j];
 			float a=lab->a[i][j];
 			float b=lab->b[i][j];
 			float x1,y1,z1;
-			float x,y,z;
 			//convert Lab => XYZ
 			Color::Lab2XYZ(L, a, b, x1, y1, z1);
-			float J, C, h, Q, M, s;
-			float Jpro,Cpro, hpro, Qpro, Mpro, spro;
-
 			x=(float)x1/655.35f;
 			y=(float)y1/655.35f;
 			z=(float)z1/655.35f;
@ -1473,6 +1542,8 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
                           xw1, yw1,  zw1,
                           yb,  la,
                           f, c,  nc,  pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
+#endif
+			float Jpro,Cpro, hpro, Qpro, Mpro, spro;
 			Jpro=J;
 			Cpro=C;
 			hpro=h;
@ -1797,6 +1868,12 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
 				}
 			}
 if(LabPassOne){
+#ifdef __SSE2__
+            // write to line buffers
+            Jbuffer[j] = J;
+            Cbuffer[j] = C;
+            hbuffer[j] = h;
+#else
 			float xx,yy,zz;
 			//process normal==> viewing

@ -1805,22 +1882,13 @@ if(LabPassOne){
 			                             xw2, yw2,  zw2,
 			                             yb2, la2,
 			                             f2,  c2, nc2, gamu, pow1n, nbbj, ncbj, flj, czj, dj, awj);
+            float x,y,z;
 			x=(float)xx*655.35f;
 			y=(float)yy*655.35f;
 			z=(float)zz*655.35f;
 			float Ll,aa,bb;
 			//convert xyz=>lab
 			Color::XYZ2Lab(x,  y,  z, Ll, aa, bb);
-#ifdef _DEBUG
-			if(Ll > 70000.f && J < 1.f) {
-#pragma omp critical
-{
-				printf("Why is Ll so big when J is so small?\n");
-				printf("J : %f, Ll : %f, xx : %f, yy : %f, zz : %f\n",J,Ll,xx,yy,zz);
-				printf("J : %f, C : %f, h : %f\n",J,C,h);
-}
-			}
-#endif
 			
 		// gamut control in Lab mode; I must study how to do with cIECAM only
 		if(gamu==1) {
@ -1857,9 +1925,66 @@ if(LabPassOne){
 			lab->a[i][j]=aa;
 			lab->b[i][j]=bb;
 		}
+#endif
 		}
 		}
 		}
+#ifdef __SSE2__
+        // process line buffers
+        float *xbuffer = Qbuffer;
+        float *ybuffer = Mbuffer;
+        float *zbuffer = sbuffer;
+        for(k=0;k<bufferLength;k+=4) {
+            Ciecam02::jch2xyz_ciecam02float( x, y, z,
+                             LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
+                             F2V(xw2), F2V(yw2), F2V(zw2),
+                             F2V(yb2), F2V(la2),
+                             F2V(f2),  F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
+            STVF(xbuffer[k],x*c655d35);
+            STVF(ybuffer[k],y*c655d35);
+            STVF(zbuffer[k],z*c655d35);
+        }
+        // XYZ2Lab uses a lookup table. The function behind that lut is a cube root.
+        // SSE can't beat the speed of that lut, so it doesn't make sense to use SSE
+        for(int j=0;j<width;j++) {
+            float Ll,aa,bb;
+                //convert xyz=>lab
+            Color::XYZ2Lab(xbuffer[j], ybuffer[j], zbuffer[j], Ll, aa, bb);
+
+            // gamut control in Lab mode; I must study how to do with cIECAM only
+            if(gamu==1) {
+                float HH, Lprov1, Chprov1;
+                Lprov1=Ll/327.68f;
+                Chprov1=sqrtf(SQR(aa) + SQR(bb))/327.68f;
+                HH=xatan2f(bb,aa);
+                float2  sincosval;
+                if(Chprov1==0.0f) {
+                    sincosval.y = 1.f;
+                    sincosval.x = 0.0f;
+                } else {
+                    sincosval.y = aa/(Chprov1*327.68f);
+                    sincosval.x = bb/(Chprov1*327.68f);
+                }
+#ifdef _DEBUG
+                bool neg=false;
+                bool more_rgb=false;
+                //gamut control : Lab values are in gamut
+                Color::gamutLchonly(sincosval,Lprov1,Chprov1, wip, highlight, 0.15f, 0.96f, neg, more_rgb);
+#else
+                //gamut control : Lab values are in gamut
+                Color::gamutLchonly(sincosval,Lprov1,Chprov1, wip, highlight, 0.15f, 0.96f);
+#endif
+                lab->L[i][j]=Lprov1*327.68f;
+                lab->a[i][j]=327.68f*Chprov1*sincosval.y;
+                lab->b[i][j]=327.68f*Chprov1*sincosval.x;
+            } else {
+                lab->L[i][j]=Ll;
+                lab->a[i][j]=aa;
+                lab->b[i][j]=bb;
+            }
+        }
+#endif
+    }
 #pragma omp critical
 {
 	if(minQThr < minQ)
@ -2023,12 +2148,20 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
 #pragma omp parallel
 #endif
 {
-
+#ifdef __SSE2__
+    // one line buffer per channel
+	float Jbuffer[bufferLength] ALIGNED16;
+	float Cbuffer[bufferLength] ALIGNED16;
+	float hbuffer[bufferLength] ALIGNED16;
+	float *xbuffer = Jbuffer; // we can use one of the above buffers
+	float *ybuffer = Cbuffer; //             "
+	float *zbuffer = hbuffer; //             "
+#endif

 #ifndef _DEBUG
 		#pragma omp for schedule(dynamic, 10)
 #endif
-		for (int i=0; i<height; i++) // update CIECAM with new values after tone-mapping
+		for (int i=0; i<height; i++) { // update CIECAM with new values after tone-mapping
 			for (int j=0; j<width; j++) {
 				float xx,yy,zz;
 				float x,y,z;
@ -2066,6 +2199,11 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
 				}
 				//end histograms

+#ifdef __SSE2__
+                Jbuffer[j] = ncie->J_p[i][j];
+                Cbuffer[j] = ncie_C_p;
+                hbuffer[j] = ncie->h_p[i][j];
+#else
 				Ciecam02::jch2xyz_ciecam02float( xx, yy, zz,
 											 ncie->J_p[i][j],  ncie_C_p, ncie->h_p[i][j],
 											 xw2, yw2,  zw2,
@ -2109,6 +2247,64 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
 					lab->a[i][j]=aa;
 					lab->b[i][j]=bb;
 				}
+#endif
+			}
+#ifdef __SSE2__
+            // process line buffers
+            int k;
+            vfloat x,y,z;
+            vfloat c655d35 = F2V(655.35f);
+            for(k=0;k<bufferLength;k+=4) {
+                Ciecam02::jch2xyz_ciecam02float( x, y, z,
+                                         LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
+                                         F2V(xw2), F2V(yw2), F2V(zw2),
+                                         F2V(yb2), F2V(la2),
+                                         F2V(f2), F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
+                x *= c655d35;
+                y *= c655d35;
+                z *= c655d35;
+                STVF(xbuffer[k],x);
+                STVF(ybuffer[k],y);
+                STVF(zbuffer[k],z);
+            }
+            // XYZ2Lab uses a lookup table. The function behind that lut is a cube root.
+            // SSE can't beat the speed of that lut, so it doesn't make sense to use SSE
+            for(int j=0;j<width;j++) {
+				float Ll,aa,bb;
+				//convert xyz=>lab
+				Color::XYZ2Lab(xbuffer[j], ybuffer[j], zbuffer[j], Ll, aa, bb);
+				if(gamu==1) {
+					float Lprov1, Chprov1;
+					Lprov1=Ll/327.68f;
+					Chprov1=sqrtf(SQR(aa) + SQR(bb))/327.68f;
+					float2  sincosval;
+					if(Chprov1==0.0f) {
+						sincosval.y = 1.f;
+						sincosval.x = 0.0f;
+					} else {
+						sincosval.y = aa/(Chprov1*327.68f);
+						sincosval.x = bb/(Chprov1*327.68f);
+					}
+#ifdef _DEBUG
+					bool neg=false;
+					bool more_rgb=false;
+					//gamut control : Lab values are in gamut
+					Color::gamutLchonly(sincosval,Lprov1,Chprov1, wipa, highlight, 0.15f, 0.96f, neg, more_rgb);
+#else
+					//gamut control : Lab values are in gamut
+					Color::gamutLchonly(sincosval,Lprov1,Chprov1, wipa, highlight, 0.15f, 0.96f);
+#endif
+					lab->L[i][j]=Lprov1*327.68f;
+					lab->a[i][j]=327.68f*Chprov1*sincosval.y;
+					lab->b[i][j]=327.68f*Chprov1*sincosval.x;
+				} else {
+					lab->L[i][j]=Ll;
+					lab->a[i][j]=aa;
+					lab->b[i][j]=bb;
+				}
+
+            }
+#endif // __SSE2__
        }

 } //end parallelization
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@ -1315,5 +1315,12 @@ static INLINE vfloat SQRV(vfloat a){
 	return _mm_mul_ps( a,a );
 }

+// Conditional swap of two vfloats: in lanes where condition is set the values of a and b
+// are exchanged, all other lanes are left unchanged.
+static INLINE void vswap( vmask condition, vfloat &a, vfloat &b) {
+    vfloat temp = vself(condition, a, b); // per lane: condition ? a : b, becomes the new b
+    a = vself(condition, b, a);           // per lane: condition ? b : a, no inverted mask needed
+    b = temp;
+}
+
 #endif // __SSE2__
 #endif // SLEEFSSEAVX