Small speedup for epd, also reduces size of executable
This commit is contained in:
parent
f7f527d952
commit
ba78b24fa5
@ -4,6 +4,7 @@
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include "rt_algo.h"
|
||||
#include "sleef.h"
|
||||
|
||||
#define DIAGONALS 5
|
||||
@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
||||
|
||||
//s is preconditionment of r. Without, direct to r.
|
||||
float *s = r;
|
||||
double rs = 0.0; // use double precision for large summations
|
||||
|
||||
if(Preconditioner != nullptr) {
|
||||
s = new float[n];
|
||||
|
||||
Preconditioner(s, r, Pass);
|
||||
}
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10)
|
||||
#endif
|
||||
|
||||
for(int ii = 0; ii < n; ii++) {
|
||||
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
|
||||
}
|
||||
double rs = rtengine::accumulateProduct(r, s, n);
|
||||
|
||||
//Search direction d.
|
||||
float *d = (buffer + n + 32);
|
||||
@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
||||
|
||||
for(iterate = 0; iterate < MaximumIterates; iterate++) {
|
||||
//Get step size alpha, store ax while at it.
|
||||
double ab = 0.0; // use double precision for large summations
|
||||
Ax(ax, d, Pass);
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+:ab)
|
||||
#endif
|
||||
|
||||
for(int ii = 0; ii < n; ii++) {
|
||||
ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
|
||||
}
|
||||
|
||||
double ab = rtengine::accumulateProduct(d, ax, n);
|
||||
if(ab == 0.0) {
|
||||
break; //So unlikely. It means perfectly converged or singular, stop either way.
|
||||
}
|
||||
@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
||||
|
||||
//Get beta.
|
||||
ab = rs;
|
||||
rs = 0.0f;
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for reduction(+:rs)
|
||||
#endif
|
||||
|
||||
for(int ii = 0; ii < n; ii++) {
|
||||
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
rs = rtengine::accumulateProduct(r, s, n);
|
||||
ab = rs / ab;
|
||||
abf = ab;
|
||||
//Update search direction p.
|
||||
|
@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
|
||||
}
|
||||
}
|
||||
|
||||
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
|
||||
if (n == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// use two accumulators to reduce dependencies (improves speed) and increase accuracy
|
||||
double acc1 = 0.0;
|
||||
double acc2 = 0.0;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
|
||||
#endif
|
||||
for (size_t i = 0; i < n - 1; i += 2) {
|
||||
acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
|
||||
acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
|
||||
}
|
||||
return acc1 + acc2;
|
||||
}
|
||||
}
|
||||
|
@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
|
||||
void buildGradientsMask(int W, int H, float **luminance, float **out,
|
||||
float amount, int nlevels, int detail_level,
|
||||
float alfa, float beta, bool multithread);
|
||||
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user