Optimization for Tonemapping, issue 1670

This commit is contained in:
Ingo 2013-02-05 21:08:36 +01:00
parent b1e67f4a67
commit caf24d842d
2 changed files with 390 additions and 348 deletions

View File

@ -25,7 +25,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
}else{
Ax(r, x, Pass);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii]; //r = b - A x.
}
@ -36,7 +36,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
Preconditioner(s, r, Pass);
}
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:rs)
#pragma omp parallel for firstprivate(fp) reduction(+:rs) // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++) {
fp = r[ii]*s[ii];
@ -56,6 +56,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get step size alpha, store ax while at it.
float ab = 0.0f;
Ax(ax, d, Pass);
#pragma omp parallel for reduction(+:ab)
for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii];
if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way.
@ -63,6 +64,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Update x and r with this step size.
float rms = 0.0;
//#pragma omp parallel for reduction(+:rms) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++){
x[ii] += ab*d[ii];
r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
@ -78,6 +80,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
//Get beta.
ab = rs;
rs = 0.0f;
//#pragma omp parallel for reduction(+:rs) // Omp makes it slower here. Don't know why
for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii];
ab = rs/ab;
@ -173,13 +176,21 @@ void MultiDiagonalSymmetricMatrix::VectorProduct(float *Product, float *x){
unsigned int j, l = DiagonalLength(sr);
if(sr == 0)
for(j = 0; j != l; j++)
#pragma omp parallel for
for(j = 0; j < l; j++)
Product[j] += a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal.
else
for(j = 0; j != l; j++)
Product[j + sr] += a[j]*x[j], //Contribution from lower...
else {
// Split the loop into two parts so that each can be parallelized without race conditions
#pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j + sr] += a[j]*x[j]; //Contribution from lower...
}
#pragma omp parallel for
for(j = 0; j < l; j++) {
Product[j] += a[j]*x[j + sr]; //...and upper triangle.
}
}
}
}
bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove){
@ -198,7 +209,7 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
mic=1;
fp=1;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:mic)
#pragma omp parallel for firstprivate(fp) reduction(+:mic) // removed schedule(dynamic,10)
#endif
for(int ii = 1; ii < m; ii++) {
fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order.
@ -300,23 +311,34 @@ void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float *x, float *b){
unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
unsigned int i, j;
for(j = 0; j != N; j++){
y[j] = b[j];
float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){ //Start at 1 because zero is D.
int c = (int)j - (int)s[i];
if(c < 0) break; //Due to ordering of StartRows, no further contributions.
y[j] -= d[i][c]*y[c];
if(c==j) {
sub += d[i][c]*b[c]; //Because y is not filled yet, we have to access b
}
else {
sub += d[i][c]*y[c];
}
}
y[j] = b[j] - sub; // only one memory-write per j
}
//Now, solve x from D Lt x = y -> Lt x = D^-1 y
while(j-- != 0){
// Moved this division out of the while loop so it can be parallelized; this is faster because division is expensive
#pragma omp parallel for
for(j = 0; j < N; j++)
x[j] = y[j]/d[0][j];
while(j-- != 0){
float sub = 0; // using local var to reduce memory writes, gave a big speedup
for(i = 1; i != M; i++){
if(j + s[i] >= N) break;
x[j] -= d[i][j]*x[j + s[i]];
sub += d[i][j]*x[j + s[i]];
}
x[j] -= sub; // only one memory-write per j
}
delete[] y;
@ -371,9 +393,11 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
//unsigned int x, y;
unsigned int i;
unsigned int w1 = w - 1, h1 = h - 1;
float eps = 0.02f;
// float eps = 0.02f;
const float sqreps = 0.0004f; // removed eps*eps from inner loop
// float ScaleConstant = Scale * powf(0.5f,-EdgeStopping);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int y = 0; y < h1; y++){
float *rg = &g[w*y];
@ -383,7 +407,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
//Apply power to the magnitude of the gradient to get the edge stopping function.
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + eps*eps), -EdgeStopping);
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
}
}
//unsigned int x,y;
@ -401,9 +425,20 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
unsigned int x, y;
for(i = y = 0; y != h; y++){
for(x = 0; x != w; x++, i++){
// unsigned int x, y;
// checked for race condition here
// a0[] is read and written, but addressed by i only
// a[] is read only
// a_w_1 is write only
// a_w is write only
// a_w1 is write only
// a_1 is write only
// So, there should be no race conditions
#pragma omp parallel for
for(int y = 0; y < h; y++){
unsigned int i = y*w;
for(int x = 0; x != w; x++, i++){
float ac;
a0[i] = 1.0;
@ -465,7 +500,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//We're working with luminance, which does better logarithmic.
unsigned int i;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < n; ii++)
Source[ii] = logf(Source[ii] + eps);
@ -476,7 +511,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int i = 0; i < n; i++){
float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps;

View File

@ -2767,11 +2767,18 @@ float rew=params->edgePreservingDecompositionUI.ReweightingIterates;
//Due to the taking of logarithms, L must be nonnegative. Further, scale to 0 to 1 using nominal range of L, 0 to 15 bit.
float minL = FLT_MAX;
for(i = 0; i != N; i++)
if(L[i] < minL) minL = L[i];
#pragma omp parallel
{
float lminL = FLT_MAX;
#pragma omp for
for(i = 0; i < N; i++)
if(L[i] < lminL) lminL = L[i];
#pragma omp critical
if(lminL < minL) minL = lminL;
}
if(minL > 0.0f) minL = 0.0f; //Disable the shift if there are no negative numbers. I wish there were just no negative numbers to begin with.
for(i = 0; i != N; i++)
#pragma omp parallel for
for(i = 0; i < N; i++)
L[i] = (L[i] - minL)/32767.0f;
//Some interpretations.
@ -2794,7 +2801,7 @@ fclose(f);*/
//Restore past range, also desaturate a bit per Mantiuk's Color correction for tone mapping.
float s = (1.0f + 38.7889f)*powf(Compression, 1.5856f)/(1.0f + 38.7889f*powf(Compression, 1.5856f));
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,10)
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < N; ii++)
a[ii] *= s,