Optimization for Tonemapping, issue 1670

This commit is contained in:
Ingo 2013-02-05 21:08:36 +01:00
parent b1e67f4a67
commit caf24d842d
2 changed files with 390 additions and 348 deletions

View File

@ -11,10 +11,10 @@ Stops at n iterates if MaximumIterates = 0 since that many iterates gives exact
definite problems only, which is what unconstrained smooth optimization pretty much always is.
Parameter pass can be passed through, containing whatever info you like it to contain (matrix info?).
Takes less memory with OkToModify_b = true, and Preconditioner = NULL. */
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, unsigned int n, bool OkToModify_b,
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, unsigned int n, bool OkToModify_b,
float *x, float RMSResidual, void *Pass, unsigned int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass)){
unsigned int iterate, i;
float *r = new float[n];
//Start r and x.
@ -25,7 +25,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
}else{
Ax(r, x, Pass);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii]; //r = b - A x.
}
@ -35,9 +35,9 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
s = new float[n];
Preconditioner(s, r, Pass);
}
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:rs)
#endif
#ifdef _OPENMP
#pragma omp parallel for firstprivate(fp) reduction(+:rs) // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++) {
fp = r[ii]*s[ii];
rs=rs+fp;
@ -56,6 +56,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get step size alpha, store ax while at it.
float ab = 0.0f;
Ax(ax, d, Pass);
#pragma omp parallel for reduction(+:ab)
for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii];
if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way.
@ -63,6 +64,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Update x and r with this step size.
float rms = 0.0;
//#pragma omp parallel for reduction(+:rms) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++){
x[ii] += ab*d[ii];
r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
@ -78,6 +80,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get beta.
ab = rs;
rs = 0.0f;
//#pragma omp parallel for reduction(+:rs) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii];
ab = rs/ab;
@ -173,12 +176,20 @@ void MultiDiagonalSymmetricMatrix::VectorProduct(float *Product, float *x){
unsigned int j, l = DiagonalLength(sr);
if(sr == 0)
for(j = 0; j != l; j++)
#pragma omp parallel for
for(j = 0; j < l; j++)
Product[j] += a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal.
else
for(j = 0; j != l; j++)
Product[j + sr] += a[j]*x[j], //Contribution from lower...
else {
// Split the loop in 2 parts, so now it can be parallelized without race conditions
#pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j + sr] += a[j]*x[j]; //Contribution from lower...
}
#pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j] += a[j]*x[j + sr]; //...and upper triangle.
}
}
}
}
@ -198,8 +209,8 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
mic=1;
fp=1;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:mic)
#endif
#pragma omp parallel for firstprivate(fp) reduction(+:mic) // removed schedule(dynamic,10)
#endif
for(int ii = 1; ii < m; ii++) {
fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order.
mic=mic+fp;
@ -300,23 +311,34 @@ void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float *x, float *b){
unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
unsigned int i, j;
for(j = 0; j != N; j++){
y[j] = b[j];
float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){ //Start at 1 because zero is D.
int c = (int)j - (int)s[i];
if(c < 0) break; //Due to ordering of StartRows, no further contributions.
y[j] -= d[i][c]*y[c];
if(c==j) {
sub += d[i][c]*b[c]; //Because y is not filled yet, we have to access b
}
else {
sub += d[i][c]*y[c];
}
}
y[j] = b[j] - sub; // only one memory-write per j
}
//Now, solve x from D Lt x = y -> Lt x = D^-1 y
while(j-- != 0){
// Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
#pragma omp parallel for
for(j = 0; j < N; j++)
x[j] = y[j]/d[0][j];
while(j-- != 0){
float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){
if(j + s[i] >= N) break;
x[j] -= d[i][j]*x[j + s[i]];
sub += d[i][j]*x[j + s[i]];
}
x[j] -= sub; // only one memory-write per j
}
delete[] y;
@ -371,9 +393,11 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
//unsigned int x, y;
unsigned int i;
unsigned int w1 = w - 1, h1 = h - 1;
float eps = 0.02f;
// float eps = 0.02f;
const float sqreps = 0.0004f; // removed eps*eps from inner loop
// float ScaleConstant = Scale * powf(0.5f,-EdgeStopping);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int y = 0; y < h1; y++){
float *rg = &g[w*y];
@ -383,7 +407,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
//Apply power to the magnitude of the gradient to get the edge stopping function.
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + eps*eps), -EdgeStopping);
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
}
}
//unsigned int x,y;
@ -401,9 +425,20 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
unsigned int x, y;
for(i = y = 0; y != h; y++){
for(x = 0; x != w; x++, i++){
// unsigned int x, y;
// checked for race condition here
// a0[] is read and write but adressed by i only
// a[] is read only
// a_w_1 is write only
// a_w is write only
// a_w1 is write only
// a_1 is write only
// So, there should be no race conditions
#pragma omp parallel for
for(int y = 0; y < h; y++){
unsigned int i = y*w;
for(int x = 0; x != w; x++, i++){
float ac;
a0[i] = 1.0;
@ -426,7 +461,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
a0[i] += 4.0f*a[i]/6.0f;
}
}
if(UseBlurForEdgeStop) delete[] a;
//Solve & return.
@ -465,8 +500,8 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//We're working with luminance, which does better logarithmic.
unsigned int i;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#endif
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++)
Source[ii] = logf(Source[ii] + eps);
@ -476,7 +511,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int i = 0; i < n; i++){
float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps;
@ -484,7 +519,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
Source[i] = expf(Source[i]) - eps;
Compressed[i] = ce + DetailBoost*(Source[i] - ue);
}
if(Compressed != u) delete[] u;
return Compressed;

File diff suppressed because it is too large Load Diff