diff --git a/rtengine/EdgePreservingDecomposition.cc b/rtengine/EdgePreservingDecomposition.cc
index 34ac2440b..5ae023122 100644
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@@ -4,6 +4,7 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include "rt_algo.h"
 #include "sleef.h"
 
 #define DIAGONALS 5
@@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 
     //s is preconditionment of r. Without, direct to r.
     float *s = r;
-    double rs = 0.0; // use double precision for large summations
 
     if(Preconditioner != nullptr) {
         s = new float[n];
-
         Preconditioner(s, r, Pass);
     }
 
-#ifdef _OPENMP
-    #pragma omp parallel for reduction(+:rs)  // removed schedule(dynamic,10)
-#endif
-
-    for(int ii = 0; ii < n; ii++) {
-        rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
-    }
+    double rs = rtengine::accumulateProduct(r, s, n);
 
     //Search direction d.
     float *d = (buffer + n + 32);
@@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 
     for(iterate = 0; iterate < MaximumIterates; iterate++) {
         //Get step size alpha, store ax while at it.
-        double ab = 0.0; // use double precision for large summations
         Ax(ax, d, Pass);
-#ifdef _OPENMP
-        #pragma omp parallel for reduction(+:ab)
-#endif
-
-        for(int ii = 0; ii < n; ii++) {
-            ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
-        }
 
+        double ab = rtengine::accumulateProduct(d, ax, n);
         if(ab == 0.0) {
             break;    //So unlikely. It means perfectly converged or singular, stop either way.
         }
@@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 
         //Get beta.
         ab = rs;
-        rs = 0.0f;
-
-#ifdef _OPENMP
-        #pragma omp parallel
-#endif
-        {
-#ifdef _OPENMP
-            #pragma omp for reduction(+:rs)
-#endif
-
-            for(int ii = 0; ii < n; ii++) {
-                rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
-            }
-
-        }
-
+        rs = rtengine::accumulateProduct(r, s, n);
         ab = rs / ab;
         abf = ab;
         //Update search direction p.
diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index 81121f58c..e4ae84a46 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
     }
 }
 
+double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
+    if (n == 0) {
+        return 0.0;
+    }
+
+    // use two accumulators to reduce dependencies (improves speed) and increase accuracy
+    double acc1 = 0.0;
+    double acc2 = 0.0;
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
+#endif
+    for (size_t i = 0; i < n - 1; i += 2) {
+        acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
+        acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
+    }
+
+    if (n & 1) {
+        acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
+    }
+    return acc1 + acc2;
+}
 }
diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h
index a72a7c56b..bffe17b57 100644
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
 void buildGradientsMask(int W, int H, float **luminance, float **out, 
                         float amount, int nlevels, int detail_level,
                         float alfa, float beta, bool multithread);
+double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
 }