Small speedup for epd, also reduces size of executable

This commit is contained in:
Ingo Weyrich 2020-11-17 15:40:49 +01:00
parent f7f527d952
commit ba78b24fa5
3 changed files with 26 additions and 33 deletions

View File

@ -4,6 +4,7 @@
#ifdef _OPENMP #ifdef _OPENMP
#include <omp.h> #include <omp.h>
#endif #endif
#include "rt_algo.h"
#include "sleef.h" #include "sleef.h"
#define DIAGONALS 5 #define DIAGONALS 5
@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//s is preconditionment of r. Without, direct to r. //s is preconditionment of r. Without, direct to r.
float *s = r; float *s = r;
double rs = 0.0; // use double precision for large summations
if(Preconditioner != nullptr) { if(Preconditioner != nullptr) {
s = new float[n]; s = new float[n];
Preconditioner(s, r, Pass); Preconditioner(s, r, Pass);
} }
#ifdef _OPENMP double rs = rtengine::accumulateProduct(r, s, n);
#pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++) {
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
}
//Search direction d. //Search direction d.
float *d = (buffer + n + 32); float *d = (buffer + n + 32);
@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
for(iterate = 0; iterate < MaximumIterates; iterate++) { for(iterate = 0; iterate < MaximumIterates; iterate++) {
//Get step size alpha, store ax while at it. //Get step size alpha, store ax while at it.
double ab = 0.0; // use double precision for large summations
Ax(ax, d, Pass); Ax(ax, d, Pass);
#ifdef _OPENMP
#pragma omp parallel for reduction(+:ab)
#endif
for(int ii = 0; ii < n; ii++) {
ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
}
double ab = rtengine::accumulateProduct(d, ax, n);
if(ab == 0.0) { if(ab == 0.0) {
break; //So unlikely. It means perfectly converged or singular, stop either way. break; //So unlikely. It means perfectly converged or singular, stop either way.
} }
@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get beta. //Get beta.
ab = rs; ab = rs;
rs = 0.0f; rs = rtengine::accumulateProduct(r, s, n);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef _OPENMP
#pragma omp for reduction(+:rs)
#endif
for(int ii = 0; ii < n; ii++) {
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
}
}
ab = rs / ab; ab = rs / ab;
abf = ab; abf = ab;
//Update search direction p. //Update search direction p.

View File

@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
} }
} }
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
if (n == 0) {
return 0.0;
}
// use two accumulators to reduce dependencies (improves speed) and increase accuracy
double acc1 = 0.0;
double acc2 = 0.0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
#endif
for (size_t i = 0; i < n - 1; i += 2) {
acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
}
if (n & 1) {
acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
}
return acc1 + acc2;
}
} }

View File

@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
void buildGradientsMask(int W, int H, float **luminance, float **out, void buildGradientsMask(int W, int H, float **luminance, float **out,
float amount, int nlevels, int detail_level, float amount, int nlevels, int detail_level,
float alfa, float beta, bool multithread); float alfa, float beta, bool multithread);
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
} }