diff --git a/rtengine/EdgePreservingDecomposition.cc b/rtengine/EdgePreservingDecomposition.cc index 34ac2440b..5ae023122 100644 --- a/rtengine/EdgePreservingDecomposition.cc +++ b/rtengine/EdgePreservingDecomposition.cc @@ -4,6 +4,7 @@ #ifdef _OPENMP #include #endif +#include "rt_algo.h" #include "sleef.h" #define DIAGONALS 5 @@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl //s is preconditionment of r. Without, direct to r. float *s = r; - double rs = 0.0; // use double precision for large summations if(Preconditioner != nullptr) { s = new float[n]; - Preconditioner(s, r, Pass); } -#ifdef _OPENMP - #pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10) -#endif - - for(int ii = 0; ii < n; ii++) { - rs += static_cast(r[ii]) * static_cast(s[ii]); - } + double rs = rtengine::accumulateProduct(r, s, n); //Search direction d. float *d = (buffer + n + 32); @@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl for(iterate = 0; iterate < MaximumIterates; iterate++) { //Get step size alpha, store ax while at it. - double ab = 0.0; // use double precision for large summations Ax(ax, d, Pass); -#ifdef _OPENMP - #pragma omp parallel for reduction(+:ab) -#endif - - for(int ii = 0; ii < n; ii++) { - ab += static_cast(d[ii]) * static_cast(ax[ii]); - } + double ab = rtengine::accumulateProduct(d, ax, n); if(ab == 0.0) { break; //So unlikely. It means perfectly converged or singular, stop either way. } @@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl //Get beta. ab = rs; - rs = 0.0f; - -#ifdef _OPENMP - #pragma omp parallel -#endif - { -#ifdef _OPENMP - #pragma omp for reduction(+:rs) -#endif - - for(int ii = 0; ii < n; ii++) { - rs += static_cast(r[ii]) * static_cast(s[ii]); - } - - } - + rs = rtengine::accumulateProduct(r, s, n); ab = rs / ab; abf = ab; //Update search direction p. diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc index 81121f58c..e4ae84a46 100644 --- a/rtengine/rt_algo.cc +++ b/rtengine/rt_algo.cc @@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H, } } +double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) { + if (n == 0) { + return 0.0; + } + + // use two accumulators to reduce dependencies (improves speed) and increase accuracy + double acc1 = 0.0; + double acc2 = 0.0; +#ifdef _OPENMP + #pragma omp parallel for reduction(+:acc1,acc2) if(multiThread) +#endif + for (size_t i = 0; i < n - 1; i += 2) { + acc1 += static_cast(data1[i]) * static_cast(data2[i]); + acc2 += static_cast(data1[i + 1]) * static_cast(data2[i + 1]); + } + + if (n & 1) { + acc1 += static_cast(data1[n -1]) * static_cast(data2[n -1]); + } + return acc1 + acc2; +} } diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h index a72a7c56b..bffe17b57 100644 --- a/rtengine/rt_algo.h +++ b/rtengine/rt_algo.h @@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H, void buildGradientsMask(int W, int H, float **luminance, float **out, float amount, int nlevels, int detail_level, float alfa, float beta, bool multithread); +double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true); }