Optimization for Tonemapping, issue 1670

2013-02-05 21:08:36 +01:00 · 2013-02-05 21:08:36 +01:00 · caf24d842d
commit caf24d842d
parent b1e67f4a67
2 changed files with 390 additions and 348 deletions
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@@ -25,7 +25,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 	}else{
 		Ax(r, x, Pass);
 		#ifdef _OPENMP
-		#pragma omp  parallel for schedule(dynamic,10)
+		#pragma omp  parallel for           // removed schedule(dynamic,10)
 		#endif
 		for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii];		//r = b - A x.
 	}
@@ -36,7 +36,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 		Preconditioner(s, r, Pass);
 	}
 	#ifdef _OPENMP
-	#pragma omp  parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:rs)
+	#pragma omp  parallel for firstprivate(fp) reduction(+:rs)  // removed schedule(dynamic,10)
 	#endif
 	for(int ii = 0; ii < n; ii++) {
 		fp = r[ii]*s[ii];
@@ -56,6 +56,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 		//Get step size alpha, store ax while at it.
 		float ab = 0.0f;
 		Ax(ax, d, Pass);
+#pragma omp parallel for reduction(+:ab)
 		for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii];

 		if(ab == 0.0f) break;	//So unlikely. It means perfectly converged or singular, stop either way.
@@ -63,6 +64,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl

 		//Update x and r with this step size.
 		float rms = 0.0;
+//#pragma omp parallel for reduction(+:rms)                        // Omp makes it slower here. Don't know why
 		for(int ii = 0; ii < n; ii++){
 			x[ii] += ab*d[ii];
 			r[ii] -= ab*ax[ii];	//"Fast recursive formula", use explicit r = b - Ax occasionally?
@@ -78,6 +80,7 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
 		//Get beta.
 		ab = rs;
 		rs = 0.0f;
+//#pragma omp parallel for reduction(+:rs)                            // Omp makes it slower here. Don't know why
 		for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii];
 		ab = rs/ab;

@@ -173,13 +176,21 @@ void MultiDiagonalSymmetricMatrix::VectorProduct(float *Product, float *x){
 		unsigned int j, l = DiagonalLength(sr);

 		if(sr == 0)
-			for(j = 0; j != l; j++)
+#pragma omp parallel for
+			for(j = 0; j < l; j++)
 				Product[j] += a[j]*x[j];		//Separate, fairly simple treatment for the main diagonal.
-		else
-			for(j = 0; j != l; j++)
-				Product[j + sr] += a[j]*x[j],	//Contribution from lower...
+		else {
+// Split the loop in 2 parts, so now it can be parallelized without race conditions
+#pragma omp parallel for
+			for(j = 0; j < l; j++) {
+				Product[j + sr] += a[j]*x[j];	//Contribution from lower...
+			}
+#pragma omp parallel for
+			for(j = 0; j < l; j++) {
 				Product[j] += a[j]*x[j + sr];	//...and upper triangle.
 			}
+		}
+	}
 }

 bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove){
@@ -198,7 +209,7 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
 	mic=1;
 	fp=1;
 	#ifdef _OPENMP
-	#pragma omp parallel for schedule(dynamic,10) firstprivate(fp) reduction(+:mic)
+	#pragma omp parallel for firstprivate(fp) reduction(+:mic)                  // removed schedule(dynamic,10)
 	#endif
 	for(int ii = 1; ii < m; ii++) {
 		fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);	//Guarunteed positive since StartRows must be created in increasing order.
@@ -300,23 +311,34 @@ void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float *x, float *b){
 	unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
 	unsigned int i, j;
 	for(j = 0; j != N; j++){
-		y[j] = b[j];
-
+        float sub = 0;              // using local var to reduce memory writes, gave a big speedup
 		for(i = 1; i != M; i++){	//Start at 1 because zero is D.
+
 			int c = (int)j - (int)s[i];
 			if(c < 0) break;		//Due to ordering of StartRows, no further contributions.
-			y[j] -= d[i][c]*y[c];
+			if(c==j) {
+                sub += d[i][c]*b[c];    //Because y is not filled yet, we have to access b
 			}
+			else {
+                sub += d[i][c]*y[c];
+			}
+		}
+		y[j] = b[j] - sub;          // only one memory-write per j
 	}

 	//Now, solve x from D Lt x = y -> Lt x = D^-1 y
-	while(j-- != 0){
+// Took this one out of the while loop so it can be parallelized now, which speeds things up because division is expensive
+#pragma omp parallel for
+    for(j = 0; j < N; j++)
 		x[j] = y[j]/d[0][j];

+	while(j-- != 0){
+        float sub = 0;                      // using local var to reduce memory writes, gave a big speedup
 		for(i = 1; i != M; i++){
 			if(j + s[i] >= N) break;
-			x[j] -= d[i][j]*x[j + s[i]];
+			sub += d[i][j]*x[j + s[i]];
 		}
+		x[j] -= sub;                        // only one memory-write per j
 	}

 	delete[] y;
@@ -371,9 +393,11 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
 	//unsigned int x, y;
 	unsigned int i;
 	unsigned int w1 = w - 1, h1 = h - 1;
-	float eps = 0.02f;
+//	float eps = 0.02f;
+	const float sqreps = 0.0004f;                           // removed eps*eps from inner loop
+//	float ScaleConstant = Scale * powf(0.5f,-EdgeStopping);
 	#ifdef _OPENMP
-	#pragma omp parallel for schedule(dynamic,10)
+	#pragma omp parallel for                // removed schedule(dynamic,10)
 	#endif
 	for(int y = 0; y < h1; y++){
 		float *rg = &g[w*y];
@@ -383,7 +407,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
 			float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);

 			//Apply power to the magnitude of the gradient to get the edge stopping function.
-			a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + eps*eps), -EdgeStopping);
+			a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
 		}
 	}
 //unsigned int x,y;
@@ -401,9 +425,20 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
 	memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
 	memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
 	memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
-	unsigned int x, y;
-	for(i = y = 0; y != h; y++){
-		for(x = 0; x != w; x++, i++){
+//	unsigned int x, y;
+
+// checked for race condition here
+// a0[] is read and written, but addressed by i only
+// a[] is read only
+// a_w_1 is write only
+// a_w is write only
+// a_w1 is write only
+// a_1 is write only
+// So, there should be no race conditions
+#pragma omp parallel for
+	for(int y = 0; y < h; y++){
+        unsigned int i = y*w;
+		for(int x = 0; x != w; x++, i++){
 			float ac;
 			a0[i] = 1.0;

@@ -465,7 +500,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc
 	//We're working with luminance, which does better logarithmic.
 	unsigned int i;
 	#ifdef _OPENMP
-	#pragma omp parallel for schedule(dynamic,10)
+	#pragma omp parallel for                        // removed schedule(dynamic,10)
 	#endif
 	for(int ii = 0; ii < n; ii++)
 		Source[ii] = logf(Source[ii] + eps);
@@ -476,7 +511,7 @@ float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Sc

 	//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
 	#ifdef _OPENMP
-	#pragma omp parallel for schedule(dynamic,10)	
+	#pragma omp parallel for                        // removed schedule(dynamic,10)
 	#endif
 	for(int i = 0; i < n; i++){
 		float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps;
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -2767,11 +2767,18 @@ float rew=params->edgePreservingDecompositionUI.ReweightingIterates;

 	//Due to the taking of logarithms, L must be nonnegative. Further, scale to 0 to 1 using nominal range of L, 0 to 15 bit.
    float minL = FLT_MAX;
-	for(i = 0; i != N; i++)
-		if(L[i] < minL) minL = L[i];
+#pragma omp parallel
+{
+	float lminL = FLT_MAX;
+#pragma omp for
+	for(i = 0; i < N; i++)
+		if(L[i] < lminL) lminL = L[i];
+#pragma omp critical
+    if(lminL < minL) minL = lminL;
+}
 	if(minL > 0.0f) minL = 0.0f;		//Disable the shift if there are no negative numbers. I wish there were just no negative numbers to begin with.
-
-	for(i = 0; i != N; i++)
+#pragma omp parallel for
+	for(i = 0; i < N; i++)
 		L[i] = (L[i] - minL)/32767.0f;

 	//Some interpretations.
@@ -2794,7 +2801,7 @@ fclose(f);*/
 	//Restore past range, also desaturate a bit per Mantiuk's Color correction for tone mapping.
 	float s = (1.0f + 38.7889f)*powf(Compression, 1.5856f)/(1.0f + 38.7889f*powf(Compression, 1.5856f));
 	#ifdef _OPENMP
-	#pragma omp parallel for schedule(dynamic,10)
+	#pragma omp parallel for            // removed schedule(dynamic,10)
 	#endif
 	for(int ii = 0; ii < N; ii++)
 		a[ii] *= s,