From 94129861f5366a620de1ab3d99855656793f23e5 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 19 Feb 2017 20:36:45 +0100
Subject: [PATCH] Speedup for lcp vignetting correction

---
 rtengine/lcp.cc            | 66 ++++++++++++++++++++++++++------------
 rtengine/lcp.h             | 15 ++++++---
 rtengine/rawimagesource.cc | 24 +++++++++++---
 3 files changed, 77 insertions(+), 28 deletions(-)
diff --git a/rtengine/lcp.cc b/rtengine/lcp.cc
index ce1a6c67e..8579efcbd 100644
--- a/rtengine/lcp.cc
+++ b/rtengine/lcp.cc
@@ -19,32 +19,28 @@
 #include <cstring>
 
 #include "lcp.h"
-#include "iccmatrices.h"
-#include "iccstore.h"
-#include "rawimagesource.h"
-#include "improcfun.h"
-#include "rt_math.h"
+#include <glib/gstdio.h>
 
 #ifdef WIN32
 #include <windows.h>
-// for GCC32
-#ifndef _WIN32_IE
-#define _WIN32_IE 0x0600
-#endif
 #include <shlobj.h>
 #endif
 
 
 using namespace std;
 using namespace rtengine;
-using namespace rtexif;
 
 
 LCPModelCommon::LCPModelCommon()
 {
     focLenX = focLenY = -1;
     imgXCenter = imgYCenter = 0.5;
-    x0 = y0 = fx = fy = meanErr = 0;
+    x0 = y0 = fx = fy = rfx = rfy = meanErr = 0;
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    x0v = y0v = rfxv = rfyv = ZEROV;
+#endif
+
     badErr = false;
 
     for (int i = 0; i < 5; i++) {
@@ -66,7 +62,7 @@ void LCPModelCommon::print() const
     printf("param: %g/%g/%g/%g/%g\n", param[0], param[1], param[2], param[3], param[4]);
 }
 
-// weightened merge two parameters
+// weighted merge two parameters
 void LCPModelCommon::merge(const LCPModelCommon& a, const LCPModelCommon& b, float facA)
 {
     float facB = 1 - facA;
@@ -81,6 +77,20 @@ void LCPModelCommon::merge(const LCPModelCommon& a, const LCPModelCommon& b, flo
     for (int i = 0; i < 5; i++) {
         param[i] = facA * a.param[i] + facB * b.param[i];
     }
+
+    double param0Sqr = param[0] * param[0];
+
+    vignParam[0] = - param[0];
+    vignParam[1] = param0Sqr - param[1];
+    vignParam[2] = param0Sqr * param[0] - 2. * param[0] * param[1] + param[2];
+    vignParam[3] = param0Sqr * param0Sqr + param[1] * param[1] + 2. * param[0] * param[2] - 3. * param0Sqr * param[1];
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    vignParamv[0] = F2V(vignParam[0]);
+    vignParamv[1] = F2V(vignParam[1]);
+    vignParamv[2] = F2V(vignParam[2]);
+    vignParamv[3] = F2V(vignParam[3]);
+#endif // __SSE2__
 }
 
 void LCPModelCommon::prepareParams(int fullWidth, int fullHeight, float focalLength, float focalLength35mm, float sensorFormatFactor, bool swapXY, bool mirrorX, bool mirrorY)
@@ -113,6 +123,15 @@ void LCPModelCommon::prepareParams(int fullWidth, int fullHeight, float focalLen
         fx = focLenX * Dmax;
         fy = focLenY * Dmax;
     }
+    rfx = 1.0 / fx;
+    rfy = 1.0 / fy;
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    x0v = F2V(x0);
+    y0v = F2V(y0);
+    rfxv = F2V(rfx);
+    rfyv = F2V(rfy);
+#endif
 
     //printf("FW %i /X0 %g   FH %i /Y0 %g  %g\n",fullWidth,x0,fullHeight,y0, imgYCenter);
 }
@@ -264,18 +283,25 @@ void LCPMapper::correctCA(double& x, double& y, int channel) const
 float LCPMapper::calcVignetteFac(int x, int y) const
 {
     // No need for swapXY, since vignette is in RAW and always before rotation
-    double xd = ((double)x - mc.x0) / mc.fx, yd = ((double)y - mc.y0) / mc.fy;
+    float xd = ((float)x - mc.x0) * mc.rfx, yd = ((float)y - mc.y0) * mc.rfy;
 
-    const float* aVig = mc.param;
-    double rsqr      = xd * xd + yd * yd;
-    double param0Sqr = aVig[0] * aVig[0];
+    const float* vignParam = mc.vignParam;
+    float rsqr = xd * xd + yd * yd;
 
-    return 1. + rsqr * (-aVig[0] + rsqr * ((param0Sqr - aVig[1])
-                                           - (param0Sqr * aVig[0] - 2.*aVig[0] * aVig[1] + aVig[2]) * rsqr
-                                           + (param0Sqr * param0Sqr + aVig[1] * aVig[1]
-                                                   + 2.*aVig[0] * aVig[2] - 3.*param0Sqr * aVig[1]) * rsqr * rsqr));
+    return 1.f + rsqr * (vignParam[0] + rsqr * ((vignParam[1]) - (vignParam[2]) * rsqr + (vignParam[3]) * rsqr * rsqr));
 }
 
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+vfloat LCPMapper::calcVignetteFac(vfloat x, vfloat y) const
+{
+    // No need for swapXY, since vignette is in RAW and always before rotation
+    vfloat xd = (x - mc.x0v) * mc.rfxv, yd = (y - mc.y0v) * mc.rfyv;
+    vfloat rsqr = xd * xd + yd * yd;
+
+    return F2V(1.f) + rsqr * (mc.vignParamv[0] + rsqr * (mc.vignParamv[1] - mc.vignParamv[2] * rsqr + mc.vignParamv[3] * rsqr * rsqr));
+}
+#endif
+
 LCPProfile::LCPProfile(const Glib::ustring &fname)
 {
     const int BufferSize = 8192;
diff --git a/rtengine/lcp.h b/rtengine/lcp.h
index 908bacf56..74d06c958 100644
--- a/rtengine/lcp.h
+++ b/rtengine/lcp.h
@@ -21,10 +21,9 @@
 #define _LCP_
 
 #include "imagefloat.h"
-#include "../rtgui/threadutils.h"
+#include "opthelper.h"
 #include <glibmm.h>
 #include <map>
-#include <list>
 #include <string>
 #include <expat.h>
 
@@ -40,8 +39,13 @@ public:
     double meanErr;
     bool badErr;
 
-    double x0, y0, fx, fy; // prepared params
-
+    float x0, y0, fx, fy; // prepared params
+    float rfx, rfy;
+    float vignParam[4];
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    vfloat vignParamv[4] ALIGNED16;
+    vfloat x0v, y0v, rfxv, rfyv;
+#endif
     LCPModelCommon();
     bool empty() const;  // is it empty
     void print() const;  // printf all values
@@ -133,6 +137,9 @@ public:
     void  correctDistortion(double& x, double& y) const;  // MUST be the first stage
     void  correctCA(double& x, double& y, int channel) const;
     float calcVignetteFac  (int x, int y) const;  // MUST be in RAW
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+    vfloat calcVignetteFac(vfloat x, vfloat y) const;
+#endif
 };
 }
 #endif
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 9c8f9788b..47613cf1a 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -1767,15 +1767,31 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         LCPProfile *pLCPProf = lcpStore->getProfile(lensProf.lcpFile);
 
         if (pLCPProf) { // don't check focal length to allow distortion correction for lenses without chip, also pass dummy focal length 1 in case of 0
+            StopWatch Stop1("lcp vignette correction");
             LCPMapper map(pLCPProf, max(idata->getFocalLen(), 1.0), idata->getFocalLen35mm(), idata->getFocusDist(), idata->getFNumber(), true, false, W, H, coarse, -1);
 
-if (ri->getSensorType() == ST_BAYER || ri->getSensorType() == ST_FUJI_XTRANS || ri->get_colors() == 1) {
+            if (ri->getSensorType() == ST_BAYER || ri->getSensorType() == ST_FUJI_XTRANS || ri->get_colors() == 1) {
 #ifdef _OPENMP
-                #pragma omp parallel for
+                #pragma omp parallel for schedule(dynamic,16)
 #endif
 
                 for (int y = 0; y < H; y++) {
-                    for (int x = 0; x < W; x++) {
+                    int x = 0;
+
+#if defined( __SSE2__ ) && defined( __x86_64__ )
+                    vfloat yv = F2V(y);
+                    vfloat fourv = F2V(4.f);
+                    vfloat onev = F2V(1.f);
+                    vfloat xv = _mm_set_ps(3,2,1,0);
+                    for (; x < W-3; x+=4) {
+                        vfloat vignFactorv = map.calcVignetteFac(xv, yv);
+                        vfloat rawValv = LVFU(rawData[y][x]);
+                        rawValv *= vself(vmaskf_gt(rawValv, ZEROV), vignFactorv, onev);
+                        STVFU(rawData[y][x], rawValv);
+                        xv += fourv;
+                    }
+#endif
+                    for (; x < W; x++) {
                         if (rawData[y][x] > 0) {
                             rawData[y][x] *= map.calcVignetteFac(x, y);
                         }
@@ -1783,7 +1799,7 @@ if (ri->getSensorType() == ST_BAYER || ri->getSensorType() == ST_FUJI_XTRANS ||
                 }
             } else if(ri->get_colors() == 3) {
 #ifdef _OPENMP
-                #pragma omp parallel for
+                #pragma omp parallel for schedule(dynamic,16)
 #endif
 
                 for (int y = 0; y < H; y++) {