Small speedup for epd, also reduces size of executable

2020-11-17 15:40:49 +01:00
parent f7f527d952
commit ba78b24fa5
3 changed files with 26 additions and 33 deletions
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@@ -4,6 +4,7 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include "rt_algo.h"
 #include "sleef.h"

 #define DIAGONALS 5
@@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl

    //s is preconditionment of r. Without, direct to r.
    float *s = r;
-    double rs = 0.0; // use double precision for large summations

    if(Preconditioner != nullptr) {
        s = new float[n];
-
        Preconditioner(s, r, Pass);
    }

-#ifdef _OPENMP
-    #pragma omp parallel for reduction(+:rs)  // removed schedule(dynamic,10)
-#endif
-
-    for(int ii = 0; ii < n; ii++) {
-        rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
-    }
+    double rs = rtengine::accumulateProduct(r, s, n);

    //Search direction d.
    float *d = (buffer + n + 32);
@@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl

    for(iterate = 0; iterate < MaximumIterates; iterate++) {
        //Get step size alpha, store ax while at it.
-        double ab = 0.0; // use double precision for large summations
        Ax(ax, d, Pass);
-#ifdef _OPENMP
-        #pragma omp parallel for reduction(+:ab)
-#endif
-
-        for(int ii = 0; ii < n; ii++) {
-            ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
-        }

+        double ab = rtengine::accumulateProduct(d, ax, n);
        if(ab == 0.0) {
            break;    //So unlikely. It means perfectly converged or singular, stop either way.
        }
@@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl

        //Get beta.
        ab = rs;
-        rs = 0.0f;
-
-#ifdef _OPENMP
-        #pragma omp parallel
-#endif
-        {
-#ifdef _OPENMP
-            #pragma omp for reduction(+:rs)
-#endif
-
-            for(int ii = 0; ii < n; ii++) {
-                rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
-            }
-
-        }
-
+        rs = rtengine::accumulateProduct(r, s, n);
        ab = rs / ab;
        abf = ab;
        //Update search direction p.
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
    }
 }

+double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
+    if (n == 0) {
+        return 0.0;
+    }
+
+    // use two accumulators to reduce dependencies (improves speed) and increase accuracy
+    double acc1 = 0.0;
+    double acc2 = 0.0;
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
+#endif
+    for (size_t i = 0; i < n - 1; i += 2) {
+        acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
+        acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
+    }
+
+    if (n & 1) {
+        acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
+    }
+    return acc1 + acc2;
+}
 }
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
 void buildGradientsMask(int W, int H, float **luminance, float **out, 
                        float amount, int nlevels, int detail_level,
                        float alfa, float beta, bool multithread);
+double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
 }