Optimization for Tonemapping, issue 1670

This commit is contained in:
Ingo 2013-02-05 21:08:36 +01:00
parent b1e67f4a67
commit caf24d842d
2 changed files with 390 additions and 348 deletions

View File

@ -25,7 +25,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
}else{ }else{
Ax(r, x, Pass); Ax(r, x, Pass);
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) #pragma omp parallel for // removed schedule(dynamic,10)
#endif #endif
for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii]; //r = b - A x. for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii]; //r = b - A x.
} }
@ -36,7 +36,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
Preconditioner(s, r, Pass); Preconditioner(s, r, Pass);
} }
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:rs) #pragma omp parallel for firstprivate(fp) reduction(+:rs) // removed schedule(dynamic,10)
#endif #endif
for(int ii = 0; ii < n; ii++) { for(int ii = 0; ii < n; ii++) {
fp = r[ii]*s[ii]; fp = r[ii]*s[ii];
@ -56,6 +56,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get step size alpha, store ax while at it. //Get step size alpha, store ax while at it.
float ab = 0.0f; float ab = 0.0f;
Ax(ax, d, Pass); Ax(ax, d, Pass);
#pragma omp parallel for reduction(+:ab)
for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii]; for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii];
if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way. if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way.
@ -63,6 +64,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Update x and r with this step size. //Update x and r with this step size.
float rms = 0.0; float rms = 0.0;
//#pragma omp parallel for reduction(+:rms) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++){ for(int ii = 0; ii < n; ii++){
x[ii] += ab*d[ii]; x[ii] += ab*d[ii];
r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally? r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
@ -78,6 +80,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get beta. //Get beta.
ab = rs; ab = rs;
rs = 0.0f; rs = 0.0f;
//#pragma omp parallel for reduction(+:rs) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii]; for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii];
ab = rs/ab; ab = rs/ab;
@ -173,13 +176,21 @@ void MultiDiagonalSymmetricMatrix::VectorProduct(float *Product, float *x){
unsigned int j, l = DiagonalLength(sr); unsigned int j, l = DiagonalLength(sr);
if(sr == 0) if(sr == 0)
for(j = 0; j != l; j++) #pragma omp parallel for
for(j = 0; j < l; j++)
Product[j] += a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal. Product[j] += a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal.
else else {
for(j = 0; j != l; j++) // Split the loop in 2 parts, so now it can be parallelized without race conditions
Product[j + sr] += a[j]*x[j], //Contribution from lower... #pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j + sr] += a[j]*x[j]; //Contribution from lower...
}
#pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j] += a[j]*x[j + sr]; //...and upper triangle. Product[j] += a[j]*x[j + sr]; //...and upper triangle.
} }
}
}
} }
bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove){ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove){
@ -198,7 +209,7 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
mic=1; mic=1;
fp=1; fp=1;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:mic) #pragma omp parallel for firstprivate(fp) reduction(+:mic) // removed schedule(dynamic,10)
#endif #endif
for(int ii = 1; ii < m; ii++) { for(int ii = 1; ii < m; ii++) {
fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order. fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order.
@ -300,23 +311,34 @@ void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float *x, float *b){
unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n; unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
unsigned int i, j; unsigned int i, j;
for(j = 0; j != N; j++){ for(j = 0; j != N; j++){
y[j] = b[j]; float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){ //Start at 1 because zero is D. for(i = 1; i != M; i++){ //Start at 1 because zero is D.
int c = (int)j - (int)s[i]; int c = (int)j - (int)s[i];
if(c < 0) break; //Due to ordering of StartRows, no further contributions. if(c < 0) break; //Due to ordering of StartRows, no further contributions.
y[j] -= d[i][c]*y[c]; if(c==j) {
sub += d[i][c]*b[c]; //Because y is not filled yet, we have to access b
} }
else {
sub += d[i][c]*y[c];
}
}
y[j] = b[j] - sub; // only one memory-write per j
} }
//Now, solve x from D Lt x = y -> Lt x = D^-1 y //Now, solve x from D Lt x = y -> Lt x = D^-1 y
while(j-- != 0){ // Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
#pragma omp parallel for
for(j = 0; j < N; j++)
x[j] = y[j]/d[0][j]; x[j] = y[j]/d[0][j];
while(j-- != 0){
float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){ for(i = 1; i != M; i++){
if(j + s[i] >= N) break; if(j + s[i] >= N) break;
x[j] -= d[i][j]*x[j + s[i]]; sub += d[i][j]*x[j + s[i]];
} }
x[j] -= sub; // only one memory-write per j
} }
delete[] y; delete[] y;
@ -371,9 +393,11 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
//unsigned int x, y; //unsigned int x, y;
unsigned int i; unsigned int i;
unsigned int w1 = w - 1, h1 = h - 1; unsigned int w1 = w - 1, h1 = h - 1;
float eps = 0.02f; // float eps = 0.02f;
const float sqreps = 0.0004f; // removed eps*eps from inner loop
// float ScaleConstant = Scale * powf(0.5f,-EdgeStopping);
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) #pragma omp parallel for // removed schedule(dynamic,10)
#endif #endif
for(int y = 0; y < h1; y++){ for(int y = 0; y < h1; y++){
float *rg = &g[w*y]; float *rg = &g[w*y];
@ -383,7 +407,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]); float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
//Apply power to the magnitude of the gradient to get the edge stopping function. //Apply power to the magnitude of the gradient to get the edge stopping function.
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + eps*eps), -EdgeStopping); a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
} }
} }
//unsigned int x,y; //unsigned int x,y;
@ -401,9 +425,20 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float)); memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float)); memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float)); memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
unsigned int x, y; // unsigned int x, y;
for(i = y = 0; y != h; y++){
for(x = 0; x != w; x++, i++){ // checked for race condition here
// a0[] is read and written, but addressed by i only
// a[] is read only
// a_w_1 is write only
// a_w is write only
// a_w1 is write only
// a_1 is write only
// So, there should be no race conditions
#pragma omp parallel for
for(int y = 0; y < h; y++){
unsigned int i = y*w;
for(int x = 0; x != w; x++, i++){
float ac; float ac;
a0[i] = 1.0; a0[i] = 1.0;
@ -465,7 +500,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//We're working with luminance, which does better logarithmic. //We're working with luminance, which does better logarithmic.
unsigned int i; unsigned int i;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) #pragma omp parallel for // removed schedule(dynamic,10)
#endif #endif
for(int ii = 0; ii < n; ii++) for(int ii = 0; ii < n; ii++)
Source[ii] = logf(Source[ii] + eps); Source[ii] = logf(Source[ii] + eps);
@ -476,7 +511,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged. //Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) #pragma omp parallel for // removed schedule(dynamic,10)
#endif #endif
for(int i = 0; i < n; i++){ for(int i = 0; i < n; i++){
float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps; float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps;

View File

@ -2767,11 +2767,18 @@ float rew=params->edgePreservingDecompositionUI.ReweightingIterates;
//Due to the taking of logarithms, L must be nonnegative. Further, scale to 0 to 1 using nominal range of L, 0 to 15 bit. //Due to the taking of logarithms, L must be nonnegative. Further, scale to 0 to 1 using nominal range of L, 0 to 15 bit.
float minL = FLT_MAX; float minL = FLT_MAX;
for(i = 0; i != N; i++) #pragma omp parallel
if(L[i] < minL) minL = L[i]; {
float lminL = FLT_MAX;
#pragma omp for
for(i = 0; i < N; i++)
if(L[i] < lminL) lminL = L[i];
#pragma omp critical
if(lminL < minL) minL = lminL;
}
if(minL > 0.0f) minL = 0.0f; //Disable the shift if there are no negative numbers. I wish there were just no negative numbers to begin with. if(minL > 0.0f) minL = 0.0f; //Disable the shift if there are no negative numbers. I wish there were just no negative numbers to begin with.
#pragma omp parallel for
for(i = 0; i != N; i++) for(i = 0; i < N; i++)
L[i] = (L[i] - minL)/32767.0f; L[i] = (L[i] - minL)/32767.0f;
//Some interpretations. //Some interpretations.
@ -2794,7 +2801,7 @@ fclose(f);*/
//Restore past range, also desaturate a bit per Mantiuk's Color correction for tone mapping. //Restore past range, also desaturate a bit per Mantiuk's Color correction for tone mapping.
float s = (1.0f + 38.7889f)*powf(Compression, 1.5856f)/(1.0f + 38.7889f*powf(Compression, 1.5856f)); float s = (1.0f + 38.7889f)*powf(Compression, 1.5856f)/(1.0f + 38.7889f*powf(Compression, 1.5856f));
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) #pragma omp parallel for // removed schedule(dynamic,10)
#endif #endif
for(int ii = 0; ii < N; ii++) for(int ii = 0; ii < N; ii++)
a[ii] *= s, a[ii] *= s,