Small speedup for epd, also reduces size of executable

2020-11-17 15:40:49 +01:00 · 2020-11-17 15:40:49 +01:00 · ba78b24fa5
commit ba78b24fa5
parent f7f527d952
3 changed files with 26 additions and 33 deletions
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@ -4,6 +4,7 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #include "rt_algo.h"
 #include "sleef.h"
 #define DIAGONALS 5
@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
    //s is preconditionment of r. Without, direct to r.
    float *s = r;
    double rs = 0.0; // use double precision for large summations
    if(Preconditioner != nullptr) {
        s = new float[n];
        Preconditioner(s, r, Pass);
    }
-#ifdef _OPENMP
+    double rs = rtengine::accumulateProduct(r, s, n);
    #pragma omp parallel for reduction(+:rs)  // removed schedule(dynamic,10)
 #endif
    for(int ii = 0; ii < n; ii++) {
        rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
    }
    //Search direction d.
    float *d = (buffer + n + 32);
@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
    for(iterate = 0; iterate < MaximumIterates; iterate++) {
        //Get step size alpha, store ax while at it.
        double ab = 0.0; // use double precision for large summations
        Ax(ax, d, Pass);
 #ifdef _OPENMP
        #pragma omp parallel for reduction(+:ab)
 #endif
        for(int ii = 0; ii < n; ii++) {
            ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
        }
        double ab = rtengine::accumulateProduct(d, ax, n);
        if(ab == 0.0) {
            break;    //So unlikely. It means perfectly converged or singular, stop either way.
        }
@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
        //Get beta.
        ab = rs;
-        rs = 0.0f;
+        rs = rtengine::accumulateProduct(r, s, n);
 #ifdef _OPENMP
        #pragma omp parallel
 #endif
        {
 #ifdef _OPENMP
            #pragma omp for reduction(+:rs)
 #endif
            for(int ii = 0; ii < n; ii++) {
                rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
            }
        }
        ab = rs / ab;
        abf = ab;
        //Update search direction p.
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
    }
 }
 double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
    if (n == 0) {
        return 0.0;
    }
    // use two accumulators to reduce dependencies (improves speed) and increase accuracy
    double acc1 = 0.0;
    double acc2 = 0.0;
 #ifdef _OPENMP
    #pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
 #endif
    for (size_t i = 0; i < n - 1; i += 2) {
        acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
        acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
    }
    if (n & 1) {
        acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
    }
    return acc1 + acc2;
 }
 }
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
 void buildGradientsMask(int W, int H, float **luminance, float **out, 
                        float amount, int nlevels, int detail_level,
                        float alfa, float beta, bool multithread);
 double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
 }