Small speedup for epd, also reduces size of executable
This commit is contained in:
parent
f7f527d952
commit
ba78b24fa5
@ -4,6 +4,7 @@
|
|||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#endif
|
#endif
|
||||||
|
#include "rt_algo.h"
|
||||||
#include "sleef.h"
|
#include "sleef.h"
|
||||||
|
|
||||||
#define DIAGONALS 5
|
#define DIAGONALS 5
|
||||||
@ -42,21 +43,13 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
|
|
||||||
//s is preconditionment of r. Without, direct to r.
|
//s is preconditionment of r. Without, direct to r.
|
||||||
float *s = r;
|
float *s = r;
|
||||||
double rs = 0.0; // use double precision for large summations
|
|
||||||
|
|
||||||
if(Preconditioner != nullptr) {
|
if(Preconditioner != nullptr) {
|
||||||
s = new float[n];
|
s = new float[n];
|
||||||
|
|
||||||
Preconditioner(s, r, Pass);
|
Preconditioner(s, r, Pass);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _OPENMP
|
double rs = rtengine::accumulateProduct(r, s, n);
|
||||||
#pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int ii = 0; ii < n; ii++) {
|
|
||||||
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
|
|
||||||
}
|
|
||||||
|
|
||||||
//Search direction d.
|
//Search direction d.
|
||||||
float *d = (buffer + n + 32);
|
float *d = (buffer + n + 32);
|
||||||
@ -77,16 +70,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
|
|
||||||
for(iterate = 0; iterate < MaximumIterates; iterate++) {
|
for(iterate = 0; iterate < MaximumIterates; iterate++) {
|
||||||
//Get step size alpha, store ax while at it.
|
//Get step size alpha, store ax while at it.
|
||||||
double ab = 0.0; // use double precision for large summations
|
|
||||||
Ax(ax, d, Pass);
|
Ax(ax, d, Pass);
|
||||||
#ifdef _OPENMP
|
|
||||||
#pragma omp parallel for reduction(+:ab)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int ii = 0; ii < n; ii++) {
|
|
||||||
ab += static_cast<double>(d[ii]) * static_cast<double>(ax[ii]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
double ab = rtengine::accumulateProduct(d, ax, n);
|
||||||
if(ab == 0.0) {
|
if(ab == 0.0) {
|
||||||
break; //So unlikely. It means perfectly converged or singular, stop either way.
|
break; //So unlikely. It means perfectly converged or singular, stop either way.
|
||||||
}
|
}
|
||||||
@ -118,22 +104,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
|
|
||||||
//Get beta.
|
//Get beta.
|
||||||
ab = rs;
|
ab = rs;
|
||||||
rs = 0.0f;
|
rs = rtengine::accumulateProduct(r, s, n);
|
||||||
|
|
||||||
#ifdef _OPENMP
|
|
||||||
#pragma omp parallel
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#ifdef _OPENMP
|
|
||||||
#pragma omp for reduction(+:rs)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int ii = 0; ii < n; ii++) {
|
|
||||||
rs += static_cast<double>(r[ii]) * static_cast<double>(s[ii]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
ab = rs / ab;
|
ab = rs / ab;
|
||||||
abf = ab;
|
abf = ab;
|
||||||
//Update search direction p.
|
//Update search direction p.
|
||||||
|
@ -479,4 +479,25 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread) {
|
||||||
|
if (n == 0) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// use two accumulators to reduce dependencies (improves speed) and increase accuracy
|
||||||
|
double acc1 = 0.0;
|
||||||
|
double acc2 = 0.0;
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel for reduction(+:acc1,acc2) if(multiThread)
|
||||||
|
#endif
|
||||||
|
for (size_t i = 0; i < n - 1; i += 2) {
|
||||||
|
acc1 += static_cast<double>(data1[i]) * static_cast<double>(data2[i]);
|
||||||
|
acc2 += static_cast<double>(data1[i + 1]) * static_cast<double>(data2[i + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n & 1) {
|
||||||
|
acc1 += static_cast<double>(data1[n -1]) * static_cast<double>(data2[n -1]);
|
||||||
|
}
|
||||||
|
return acc1 + acc2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,4 +29,5 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
|
|||||||
void buildGradientsMask(int W, int H, float **luminance, float **out,
|
void buildGradientsMask(int W, int H, float **luminance, float **out,
|
||||||
float amount, int nlevels, int detail_level,
|
float amount, int nlevels, int detail_level,
|
||||||
float alfa, float beta, bool multithread);
|
float alfa, float beta, bool multithread);
|
||||||
|
double accumulateProduct(const float* data1, const float* data2, size_t n, bool multiThread = true);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user