From c67b986744d21b803de17dba71f1d82d6bc38a7e Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 21 Sep 2016 00:22:42 +0200
Subject: [PATCH 1/5] add faster implementation to clip float to [0;65535] and
 round

---
 rtengine/rt_math.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index 0836c8be7..b5c93d127 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -80,7 +80,7 @@ inline _Tp intp(_Tp a, _Tp b, _Tp c)
     // following is valid:
     // intp(a, b+x, c+x) = intp(a, b, c) + x
     // intp(a, b*x, c*x) = intp(a, b, c) * x
-    return a * (b-c) + c;
+    return a * (b - c) + c;
 }
 
 template<typename T>
@@ -101,5 +101,17 @@ inline T norminf(const T& x, const T& y)
     return std::max(std::abs(x), std::abs(y));
 }
 
-}
+inline int float2uint16range(float d) // clips input to [0;65535] and rounds
+{
+    d = CLIP(d); // clip to [0;65535]
+#ifdef __SSE2__ // this only works in IEEE 754 maths. For simplicity I restricted it to SSE2. We can enhance it later, but we have to take care of endianness then.
+    d += 12582912.f;
+    return reinterpret_cast<int&>(d);
+#else // fall back to slow std::round()
+    return std::round(d);
+#endif
+}
+
+}
+
 #endif

From f17011cec400b0730a8f9ead26069ddb1390bd82 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 21 Sep 2016 00:24:44 +0200
Subject: [PATCH 2/5] use faster implementation to clip float to [0;65535] and
 round in rtengine::lab2rgb16b and rtengine::lab2rgb16

---
 rtengine/iplab2rgb.cc | 45 +++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/rtengine/iplab2rgb.cc b/rtengine/iplab2rgb.cc
index 489ca60bc..d37ba362d 100644
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -26,6 +26,19 @@
 #include "curves.h"
 #include "alignedbuffer.h"
 #include "color.h"
+#define BENCHMARK
+#include "StopWatch.h"
+
+namespace
+{
+
+int float2intx(float d)
+{
+    d += 12582912.f;
+    return reinterpret_cast<int&>(d);
+}
+}
+
 
 namespace rtengine
 {
@@ -241,8 +254,7 @@ Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch,
 // for default (not gamma)
 Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile, RenderingIntent intent, bool bw)
 {
-
-    //gamutmap(lab);
+    BENCHFUN
 
     if (cx < 0) {
         cx = 0;
@@ -279,7 +291,7 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
             for (int j = cx; j < cx + cw; j++) {
 
                 float fy = (0.0086206897f * rL[j]) / 327.68f + 0.1379310345f; // (L+16)/116
-                float fx = (0.002 * ra[j]) / 327.68f + fy;
+                float fx = (0.002f * ra[j]) / 327.68f + fy;
                 float fz = fy - (0.005f * rb[j]) / 327.68f;
                 float LL = rL[j] / 327.68f;
 
@@ -288,15 +300,14 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
                 float z_ = 65535.0f * (float) Color::f2xyz(fz) * Color::D50z;
                 float y_ = (LL > Color::epskap) ? 65535.0f * fy * fy * fy : 65535.0f * LL / Color::kappa;
 
-                xa[j - cx] =  CLIP((int)  round(x_));
-                ya[j - cx] =  CLIP((int)  round(y_));
-                za[j - cx] = CLIP((int)   round(z_));
+                xa[j - cx] = float2uint16range(x_);
+                ya[j - cx] = float2uint16range(y_);
+                za[j - cx] = float2uint16range(z_);
 
                 if(bw && y_ < 65535.f ) { //force Bw value and take highlight into account
-                    xa[j - cx] = (int) round(y_ * Color::D50x );
-                    za[j - cx] = (int) round(y_ * Color::D50z);
+                    xa[j - cx] = float2uint16range(y_ * Color::D50x);
+                    za[j - cx] = float2uint16range(y_ * Color::D50z);
                 }
-
             }
         }
 
@@ -345,7 +356,7 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
 // for gamma options (BT709...sRGB linear...)
 Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile, RenderingIntent intent, Glib::ustring profi, Glib::ustring gam,  bool freegamma, double gampos, double slpos, double &ga0, double &ga1, double &ga2, double &ga3, double &ga4, double &ga5, double &ga6, bool bw)
 {
-
+BENCHFUN
     //gamutmap(lab);
 
     if (cx < 0) {
@@ -539,7 +550,7 @@ Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int
 // 7 parameters for smoother curves
     cmsWhitePointFromTemp(&xyD, t50);
     GammaTRC[0] = GammaTRC[1] = GammaTRC[2] =   cmsBuildParametricToneCurve(NULL, 5, Parameters);//5 = more smoother than 4
-    cmsHPROFILE oprofdef = cmsCreateRGBProfileTHR(NULL, &xyD, &Primaries, GammaTRC); //oprofdef  become Outputprofile
+    cmsHPROFILE oprofdef = cmsCreateRGBProfileTHR(NULL, &xyD, &Primaries, GammaTRC); //oprofdef  becomes Outputprofile
 
     cmsFreeToneCurve(GammaTRC[0]);
 
@@ -567,13 +578,13 @@ Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int
                 float z_ = 65535.0f * (float)Color::f2xyz(fz) * Color::D50z;
                 float y_ = (LL > Color::epskap) ? (float) 65535.0 * fy * fy * fy : 65535.0f * LL / Color::kappa;
 
-                xa[j - cx] = CLIP((int) round(x_)) ;
-                ya[j - cx] = CLIP((int) round(y_));
-                za[j - cx] = CLIP((int) round(z_));
+                xa[j - cx] = float2uint16range(x_);
+                ya[j - cx] = float2uint16range(y_);
+                za[j - cx] = float2uint16range(z_);
 
                 if(bw && y_ < 65535.f) { //force Bw value and take highlight into account
-                    xa[j - cx] = (int) round(y_ * Color::D50x);
-                    za[j - cx] = (int) round(y_ * Color::D50z);
+                    xa[j - cx] = float2uint16range(y_ * Color::D50x);
+                    za[j - cx] = float2uint16range(y_ * Color::D50z);
                 }
 
             }
@@ -581,7 +592,7 @@ Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int
 
         cmsHPROFILE iprof = iccStore->getXYZProfile ();
         lcmsMutex->lock ();
-        cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16, oprofdef, TYPE_RGB_16, intent,  cmsFLAGS_NOOPTIMIZE | cmsFLAGS_NOCACHE);
+        cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16, oprofdef, TYPE_RGB_16, intent, cmsFLAGS_NOOPTIMIZE | cmsFLAGS_NOCACHE);
         lcmsMutex->unlock ();
 
         image->ExecCMSTransform(hTransform);

From 29c4d936aa3c7f4801fadc1d15e45c15e7a7e5ba Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 21 Sep 2016 00:35:02 +0200
Subject: [PATCH 3/5] remove leftover local float2intx helper from iplab2rgb.cc

---
 rtengine/iplab2rgb.cc | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/rtengine/iplab2rgb.cc b/rtengine/iplab2rgb.cc
index d37ba362d..c5eeb1a2a 100644
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -29,17 +29,6 @@
 #define BENCHMARK
 #include "StopWatch.h"
 
-namespace
-{
-
-int float2intx(float d)
-{
-    d += 12582912.f;
-    return reinterpret_cast<int&>(d);
-}
-}
-
-
 namespace rtengine
 {
 

From ab2be87333407451ae090f6bc3f86b6910ea0056 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 21 Sep 2016 14:05:30 +0200
Subject: [PATCH 4/5] Simplified float2uint16range(..), removed StopWatches

---
 rtengine/iplab2rgb.cc | 26 +++++++++-----------------
 rtengine/rt_math.h    |  7 +------
 2 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/rtengine/iplab2rgb.cc b/rtengine/iplab2rgb.cc
index c5eeb1a2a..576af00c3 100644
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -26,8 +26,6 @@
 #include "curves.h"
 #include "alignedbuffer.h"
 #include "color.h"
-#define BENCHMARK
-#include "StopWatch.h"
 
 namespace rtengine
 {
@@ -79,9 +77,7 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
                     cmsDoTransform (monitorTransform, buffer, data + ix, W);
                 }
             }
-
         } // End of parallelization
-
     } else {
 
         int W = lab->W;
@@ -129,7 +125,6 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image)
 
 Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile, RenderingIntent intent, bool standard_gamma)
 {
-    //gamutmap(lab);
 
     if (cx < 0) {
         cx = 0;
@@ -243,7 +238,6 @@ Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch,
 // for default (not gamma)
 Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile, RenderingIntent intent, bool bw)
 {
-    BENCHFUN
 
     if (cx < 0) {
         cx = 0;
@@ -264,10 +258,10 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
     Image16* image = new Image16 (cw, ch);
     cmsHPROFILE oprof = iccStore->getProfile (profile);
 
-
-
     if (oprof) {
+#ifdef _OPENMP
         #pragma omp parallel for if (multiThread)
+#endif
 
         for (int i = cy; i < cy + ch; i++) {
             float* rL = lab->L[i];
@@ -309,7 +303,9 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
 
         cmsDeleteTransform(hTransform);
     } else {
+#ifdef _OPENMP
         #pragma omp parallel for if (multiThread)
+#endif
 
         for (int i = cy; i < cy + ch; i++) {
             float R, G, B;
@@ -345,8 +341,6 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
 // for gamma options (BT709...sRGB linear...)
 Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile, RenderingIntent intent, Glib::ustring profi, Glib::ustring gam,  bool freegamma, double gampos, double slpos, double &ga0, double &ga1, double &ga2, double &ga3, double &ga4, double &ga5, double &ga6, bool bw)
 {
-BENCHFUN
-    //gamutmap(lab);
 
     if (cx < 0) {
         cx = 0;
@@ -506,13 +500,11 @@ BENCHFUN
 
         Color::calcGamma(pwr, ts, mode, imax, g_a0, g_a1, g_a2, g_a3, g_a4, g_a5); // call to calcGamma with selected gamma and slope : return parameters for LCMS2
         ga4 = g_a3 * ts;
-        //printf("g_a0=%f g_a1=%f g_a2=%f g_a3=%f g_a4=%f\n", g_a0,g_a1,g_a2,g_a3,g_a4);
         ga0 = gampos;
         ga1 = 1. / (1.0 + g_a4);
         ga2 = g_a4 / (1.0 + g_a4);
         ga3 = 1. / slpos;
         ga5 = 0.0;
-        //printf("ga0=%f ga1=%f ga2=%f ga3=%f ga4=%f\n", ga0,ga1,ga2,ga3,ga4);
 
     }
 
@@ -543,9 +535,10 @@ BENCHFUN
 
     cmsFreeToneCurve(GammaTRC[0]);
 
-
     if (oprofdef) {
+#ifdef _OPENMP
         #pragma omp parallel for if (multiThread)
+#endif
 
         for (int i = cy; i < cy + ch; i++) {
             float* rL = lab->L[i];
@@ -575,7 +568,6 @@ BENCHFUN
                     xa[j - cx] = float2uint16range(y_ * Color::D50x);
                     za[j - cx] = float2uint16range(y_ * Color::D50z);
                 }
-
             }
         }
 
@@ -587,8 +579,10 @@ BENCHFUN
         image->ExecCMSTransform(hTransform);
         cmsDeleteTransform(hTransform);
     } else {
-        //
+#ifdef _OPENMP
         #pragma omp parallel for if (multiThread)
+#endif
+
         for (int i = cy; i < cy + ch; i++) {
             float R, G, B;
             float* rL = lab->L[i];
@@ -619,6 +613,4 @@ BENCHFUN
     return image;
 }
 
-//#include "sRGBgamutbdy.cc"
-
 }
diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index b5c93d127..f55f7c1b2 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -104,12 +104,7 @@ inline T norminf(const T& x, const T& y)
 inline int float2uint16range(float d) // clips input to [0;65535] and rounds
 {
     d = CLIP(d); // clip to [0;65535]
-#ifdef __SSE2__ // this only works in IEEE 754 maths. For simplicity I restricted it to SSE2. We can enhance it later, but we have to take care of endianness then.
-    d += 12582912.f;
-    return reinterpret_cast<int&>(d);
-#else // fall back to slow std::round()
-    return std::round(d);
-#endif
+    return d + 0.5f; // float -> int conversion truncates; with d >= 0 (relies on CLIP above) adding 0.5 first rounds to nearest
 }
 
 }

From 1e268105dbcb3d8653166907cb87589b87839a89 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 21 Sep 2016 21:01:51 +0200
Subject: [PATCH 5/5] replaced code to convert from Lab to XYZ by calling
 Color::Lab2XYZ(..)

---
 rtengine/iplab2rgb.cc | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/rtengine/iplab2rgb.cc b/rtengine/iplab2rgb.cc
index 576af00c3..c12e3463e 100644
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -272,16 +272,8 @@ Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int
             short* za = (short*)image->b(i - cy);
 
             for (int j = cx; j < cx + cw; j++) {
-
-                float fy = (0.0086206897f * rL[j]) / 327.68f + 0.1379310345f; // (L+16)/116
-                float fx = (0.002f * ra[j]) / 327.68f + fy;
-                float fz = fy - (0.005f * rb[j]) / 327.68f;
-                float LL = rL[j] / 327.68f;
-
-                float x_ = 65535.0f * (float) Color::f2xyz(fx) * Color::D50x;
-                //float y_ = 65535.0 * Color::f2xyz(fy);
-                float z_ = 65535.0f * (float) Color::f2xyz(fz) * Color::D50z;
-                float y_ = (LL > Color::epskap) ? 65535.0f * fy * fy * fy : 65535.0f * LL / Color::kappa;
+                float x_, y_, z_;
+                Color::Lab2XYZ(rL[j], ra[j], rb[j], x_, y_, z_);
 
                 xa[j - cx] = float2uint16range(x_);
                 ya[j - cx] = float2uint16range(y_);
@@ -549,16 +541,8 @@ Image16* ImProcFunctions::lab2rgb16b (LabImage* lab, int cx, int cy, int cw, int
             short* za = (short*)image->b(i - cy);
 
             for (int j = cx; j < cx + cw; j++) {
-
-                float fy = (0.0086206897f * rL[j]) / 327.68f + 0.1379310345f; // (L+16)/116
-                float fx = (0.002f * ra[j]) / 327.68f + fy;
-                float fz = fy - (0.005f * rb[j]) / 327.68f;
-                float LL = rL[j] / 327.68f;
-
-                float x_ = 65535.0f * (float)Color::f2xyz(fx) * Color::D50x;
-                //  float y_ = 65535.0 * Color::f2xyz(fy);
-                float z_ = 65535.0f * (float)Color::f2xyz(fz) * Color::D50z;
-                float y_ = (LL > Color::epskap) ? (float) 65535.0 * fy * fy * fy : 65535.0f * LL / Color::kappa;
+                float x_, y_, z_;
+                Color::Lab2XYZ(rL[j], ra[j], rb[j], x_, y_, z_);
 
                 xa[j - cx] = float2uint16range(x_);
                 ya[j - cx] = float2uint16range(y_);