Issue 2134: removed obsolete raw highlight preservation setting from GUI (still left in procparams for backwards compatilibility)

2015-07-10 12:00:36 +02:00
commit d5ca351c20
1751 changed files with 441944 additions and 0 deletions
--- a/rtengine/EdgePreservingDecomposition.cc
+++ b/rtengine/EdgePreservingDecomposition.cc
@@ -0,0 +1,814 @@
+#include <cmath>
+#include "rt_math.h"
+#include "EdgePreservingDecomposition.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "sleef.c"
+#include "opthelper.h"
+
+#define pow_F(a,b) (xexpf(b*xlogf(a)))
+
+#define DIAGONALS 5
+#define DIAGONALSP1 6
+
+/* Solves A x = b by the conjugate gradient method, where instead of feeding it the matrix A you feed it a function which
+calculates A x where x is some vector. Stops when rms residual < RMSResidual or when maximum iterates is reached.
+Stops at n iterates if MaximumIterates = 0 since that many iterates gives exact solution. Applicable to symmetric positive
+definite problems only, which is what unconstrained smooth optimization pretty much always is.
+Parameter pass can be passed through, containing whatever info you like it to contain (matrix info?).
+Takes less memory with OkToModify_b = true, and Preconditioner = NULL. */
+float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, int n, bool OkToModify_b,
+	float *x, float RMSResidual, void *Pass, int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass)){
+	int iterate, i;
+
+	char* buffer = (char*)malloc(2*n*sizeof(float)+128);
+	float *r = (float*)(buffer+64);
+	//Start r and x.
+	if(x == NULL){
+		x = new float[n];
+
+		memset(x, 0, sizeof(float)*n);		//Zero initial guess if x == NULL.
+		memcpy(r, b, sizeof(float)*n);
+	}else{
+		Ax(r, x, Pass);
+#ifdef _OPENMP
+#pragma omp parallel for           // removed schedule(dynamic,10)
+#endif
+		for(int ii = 0; ii < n; ii++)
+			r[ii] = b[ii] - r[ii];		//r = b - A x.
+	}
+	//s is preconditionment of r. Without, direct to r.
+	float *s = r, rs = 0.0f;
+	if(Preconditioner != NULL){
+		s = new float[n];
+
+		Preconditioner(s, r, Pass);
+	}
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+:rs)  // removed schedule(dynamic,10)
+#endif
+	for(int ii = 0; ii < n; ii++) {
+		rs += r[ii]*s[ii];
+		}
+	//Search direction d.
+	float *d = (float*)(buffer + n*sizeof(float) + 128);
+	
+	memcpy(d, s, sizeof(float)*n);
+
+	//Store calculations of Ax in this.
+	float *ax = b;
+	if(!OkToModify_b) ax = new float[n];
+
+	//Start iterating!
+	if(MaximumIterates == 0) MaximumIterates = n;
+	for(iterate = 0; iterate < MaximumIterates; iterate++){
+		//Get step size alpha, store ax while at it.
+		float ab = 0.0f;
+		Ax(ax, d, Pass);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+:ab)
+#endif
+		for(int ii = 0; ii < n; ii++)
+			ab += d[ii]*ax[ii];
+
+		if(ab == 0.0f) break;	//So unlikely. It means perfectly converged or singular, stop either way.
+		ab = rs/ab;
+
+		//Update x and r with this step size.
+		float rms = 0.0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+:rms)
+#endif
+		for(int ii = 0; ii < n; ii++){
+			x[ii] += ab*d[ii];
+			r[ii] -= ab*ax[ii];	//"Fast recursive formula", use explicit r = b - Ax occasionally?
+			rms += r[ii]*r[ii];
+		}
+		rms = sqrtf(rms/n);
+		//Quit? This probably isn't the best stopping condition, but ok.
+		if(rms < RMSResidual) break;
+
+		if(Preconditioner != NULL) Preconditioner(s, r, Pass);
+
+		//Get beta.
+		ab = rs;
+		rs = 0.0f;
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+		float c = 0.0f;
+		float t;
+		float temp;
+#ifdef _OPENMP
+#pragma omp for reduction(+:rs)                            // Summation with error correction
+#endif
+		for(int ii = 0; ii < n; ii++) {
+			temp = r[ii]*s[ii];
+			t = rs + temp;
+			if( fabsf(rs) >= fabsf(temp) )
+				c += ((rs-t) + temp);
+			else
+				c += ((temp-t)+rs);
+			rs = t;
+		}
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+		rs += c;
+}
+
+		ab = rs/ab;
+
+		//Update search direction p.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+		for(int ii = 0; ii < n; ii++)
+			d[ii] = s[ii] + ab*d[ii];
+
+
+	}
+
+	if(iterate == MaximumIterates)
+		if(iterate != n && RMSResidual != 0.0f)
+			printf("Warning: MaximumIterates (%u) reached in SparseConjugateGradient.\n", MaximumIterates);
+
+	if(ax != b) delete[] ax;
+	if(s != r) delete[] s;
+	free(buffer);
+	return x;
+}
+
+MultiDiagonalSymmetricMatrix::MultiDiagonalSymmetricMatrix(int Dimension, int NumberOfDiagonalsInLowerTriangle){
+	n = Dimension;
+	m = NumberOfDiagonalsInLowerTriangle;
+	IncompleteCholeskyFactorization = NULL;
+
+	Diagonals = new float *[m];
+	StartRows = new int [m+1];
+	memset(Diagonals, 0, sizeof(float *)*m);
+	memset(StartRows, 0, sizeof(int)*(m+1));
+	StartRows[m] = n+1;
+}
+
+MultiDiagonalSymmetricMatrix::~MultiDiagonalSymmetricMatrix(){
+	if(DiagBuffer != NULL)
+		free(buffer);
+	else
+		for(int i=0;i<m;i++)
+			delete[] Diagonals[i];
+	
+	delete[] Diagonals;
+	delete[] StartRows;
+}
+
+bool MultiDiagonalSymmetricMatrix::CreateDiagonal(int index, int StartRow){
+	// Changed memory allocation for diagonals to avoid L1 conflict misses
+	// Falls back to original version if big block could not be allocated
+	int padding = 4096 - ((n*m*sizeof(float)) % 4096);
+	if(index == 0){
+		buffer = (char*)calloc( (n+padding) * m * sizeof(float)+ (m+16)*64 + 63,1);
+		if(buffer == NULL)
+			// no big memory block available => try to allocate smaller blocks
+			DiagBuffer = NULL;
+		else {
+			DiagBuffer = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
+		}
+	}
+	if(index >= m){
+		printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: invalid index.\n");
+		return false;
+	}
+	if(index > 0)
+		if(StartRow <= StartRows[index - 1]){
+			printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: each StartRow must exceed the previous.\n");
+			return false;
+		}
+
+	if(DiagBuffer != NULL)
+		Diagonals[index] = (float*)(DiagBuffer+(index*(n+padding)*sizeof(float))+((index+16)*64));
+	else {
+		Diagonals[index] = new float[DiagonalLength(StartRow)];
+		if(Diagonals[index] == NULL) {
+			printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: memory allocation failed. Out of memory?\n");
+			return false;
+		}
+		memset(Diagonals[index], 0, sizeof(float)*DiagonalLength(StartRow));
+	}
+		
+	StartRows[index] = StartRow;
+	return true;
+}
+
+inline int MultiDiagonalSymmetricMatrix::FindIndex(int StartRow) {
+	//There's GOT to be a better way to do this. "Bidirectional map?"
+	// Issue 1895 : Changed start of loop from zero to one
+	// m is small (5 or 6)
+	for(int i = 1; i < m; i++)
+		if(StartRows[i] == StartRow)
+			return i;
+	return -1;
+}
+
+bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, int row, int column){
+	//On the strict upper triangle? Swap, this is ok due to symmetry.
+	int i, sr;
+	if(column > row)
+		i = column,
+		column = row,
+		row = i;
+	if(row >= n) return false;
+	sr = row - column;
+
+	//Locate the relevant diagonal.
+	i = FindIndex(sr);
+	if(i < 0) return false;
+
+	Diagonals[i][column] = value;
+	return true;
+}
+
+SSEFUNCTION void MultiDiagonalSymmetricMatrix::VectorProduct(float* RESTRICT Product, float* RESTRICT x){
+
+	int srm = StartRows[m-1];
+	int lm = DiagonalLength(srm);
+#ifdef _OPENMP
+#ifdef __SSE2__
+	const int chunkSize = (lm-srm)/(omp_get_num_procs()*32);
+#else
+	const int chunkSize = (lm-srm)/(omp_get_num_procs()*8);
+#endif
+#endif
+#pragma omp parallel
+{
+	// First fill the big part in the middle
+	// This can be done without intermediate stores to memory and it can be parallelized too
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic,chunkSize) nowait
+#endif
+#ifdef __SSE2__
+	for(int j=srm;j<lm-3;j+=4) {
+		__m128 prodv = LVFU(Diagonals[0][j]) * LVFU(x[j]);
+		for(int i=m-1;i>0;i--) {
+			int s = StartRows[i];
+			prodv += (LVFU(Diagonals[i][j - s])*LVFU(x[j - s])) + (LVFU(Diagonals[i][j])*LVFU(x[j + s]));
+		}
+		_mm_storeu_ps(&Product[j],prodv);
+	}
+#else
+	for(int j=srm;j<lm;j++) {
+		float prod = Diagonals[0][j]*x[j];
+		for(int i=m-1;i>0;i--) {
+			int s = StartRows[i];
+			prod += (Diagonals[i][j - s]*x[j - s]) + (Diagonals[i][j]*x[j + s]);
+		}
+		Product[j] = prod;
+	}
+
+#endif
+#pragma omp single
+{
+#ifdef __SSE2__
+		for(int j=lm-((lm-srm)%4);j<lm;j++) {
+		float prod = Diagonals[0][j]*x[j];
+		for(int i=m-1;i>0;i--) {
+			int s = StartRows[i];
+			prod += (Diagonals[i][j - s]*x[j - s]) + (Diagonals[i][j]*x[j + s]);
+		}
+		Product[j] = prod;
+	}
+#endif
+	// Fill remaining area
+	// Loop over the stored diagonals.
+	for(int i = 0; i < m; i++){
+		int sr = StartRows[i];
+		float *a = Diagonals[i];	//One fewer dereference.
+		int l = DiagonalLength(sr);
+		if(sr == 0) {
+			for(int j = 0; j < srm; j++)
+				Product[j] = a[j]*x[j];		//Separate, fairly simple treatment for the main diagonal.
+			for(int j = lm; j < l; j++)
+				Product[j] = a[j]*x[j];		//Separate, fairly simple treatment for the main diagonal.
+		} else {
+// Split the loop in 3 parts, so now the big one in the middle can be parallelized without race conditions
+			// updates 0 to sr - 1. Because sr is small (in the range of image-width) no benefit by omp
+			for(int j=0;j<sr;j++) {
+				Product[j] += a[j]*x[j + sr];		//Contribution from upper triangle
+			}
+			// Updates sr to l - 1. Because sr is small and l is big, this loop is parallelized
+			for(int j = sr; j < srm; j++) {
+				Product[j] += a[j - sr]*x[j - sr] + a[j]*x[j + sr];		//Contribution from lower and upper triangle
+			}
+			for(int j = lm; j < l; j++) {
+				Product[j] += a[j - sr]*x[j - sr] + a[j]*x[j + sr];		//Contribution from lower and upper triangle
+			}
+			// Updates l to l + sr - 1. Because sr is small (in the range of image-width) no benefit by omp
+			for(int j = l; j < l + sr; j++) {
+				Product[j] += a[j-sr]*x[j - sr];	//Contribution from lower triangle
+			}
+		}
+	}
+}
+}
+}
+
+bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(int MaxFillAbove){
+	if(m == 1){
+		printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: just one diagonal? Can you divide?\n");
+		return false;
+	}
+	if(StartRows[0] != 0){
+		printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: main diagonal required to exist for this math.\n");
+		return false;
+	}
+	//How many diagonals in the decomposition?
+	MaxFillAbove++;	//Conceptually, now "fill" includes an existing diagonal. Simpler in the math that follows.
+	int i, j, mic, fp;
+	mic=1;
+	fp=1;
+	for(int ii = 1; ii < m; ii++) {
+		fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);	//Guarunteed positive since StartRows must be created in increasing order.
+		mic=mic+fp;
+		}
+	//Initialize the decomposition - setup memory, start rows, etc.
+
+	MultiDiagonalSymmetricMatrix *ic = new MultiDiagonalSymmetricMatrix(n, mic);
+	ic->CreateDiagonal(0, 0);	//There's always a main diagonal in this type of decomposition.
+	mic=1;
+	for(int ii = 1; ii < m; ii++){
+		//Set j to the number of diagonals to be created corresponding to a diagonal on this source matrix...
+		j = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);
+
+		//...and create those diagonals. I want to take a moment to tell you about how much I love minimalistic loops: very much.
+		while(j-- != 0)
+			if(!ic->CreateDiagonal(mic++, StartRows[ii] - j)){
+				//Beware of out of memory, possible for large, sparse problems if you ask for too much fill.
+				printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: Out of memory. Ask for less fill?\n");
+				delete ic;
+				return false;
+			}
+	}
+
+	//It's all initialized? Uhkay. Do the actual math then.
+	int sss, ss, s;
+	int k, MaxStartRow = StartRows[m - 1];	//Handy number.
+	float **l = ic->Diagonals;
+	float  *d = ic->Diagonals[0];		//Describes D in LDLt.
+	int icm = ic->m;
+	int icn = ic->n;
+	int* RESTRICT icStartRows = ic->StartRows;
+
+	//Loop over the columns.
+
+	// create array for quicker access to ic->StartRows
+	struct s_diagmap {
+		int sss;
+		int ss;
+		int k;
+	};
+
+
+	// Pass one: count number of needed entries
+	int	entrycount = 0;
+	for(int i=1;i<icm;i++) {
+		for(int j=1;j<icm;j++) {
+			if(ic->FindIndex( icStartRows[i] + icStartRows[j]) > 0)
+				entrycount ++;
+		}
+	}
+
+	// now we can create the array
+	struct s_diagmap* RESTRICT DiagMap = new s_diagmap[entrycount];
+	// we also need the maxvalues
+	int	entrynumber = 0;
+	int index;
+	int* RESTRICT MaxIndizes = new int[icm];
+
+	for(int i=1;i<icm;i++) {
+		for(int j=1;j<icm;j++){
+			index = ic->FindIndex( icStartRows[i] + icStartRows[j]);
+			if(index > 0) {
+				DiagMap[entrynumber].ss = j;
+				DiagMap[entrynumber].sss = index;
+				DiagMap[entrynumber].k = icStartRows[j];
+				entrynumber ++;
+			}
+		}
+		MaxIndizes[i] = entrynumber - 1;
+	}
+
+	int* RESTRICT findmap = new int[icm];
+		for(int j=0;j<icm;j++)
+			findmap[j] = FindIndex( icStartRows[j]);
+	
+	for(j = 0; j < n; j++){
+		//Calculate d for this column.
+		d[j] = Diagonals[0][j];
+
+		//This is a loop over k from 1 to j, inclusive. We'll cover that by looping over the index of the diagonals (s), and get k from it.
+		//The first diagonal is d (k = 0), so skip that and have s start at 1. Cover all available s but stop if k exceeds j.
+		s=1;
+		k=icStartRows[s];
+		while(k<=j) {
+			d[j] -= l[s][j - k]*l[s][j - k]*d[j - k];
+			s++;
+			k=icStartRows[s];
+		}
+		if(UNLIKELY(d[j] == 0.0f)){
+			printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: division by zero. Matrix not decomposable.\n");
+			delete ic;
+			delete[] DiagMap;
+			delete[] MaxIndizes;
+			delete[] findmap;
+			return false;
+		}
+		float id = 1.0f/d[j];
+		//Now, calculate l from top down along this column.
+
+		int mapindex = 0;
+		int jMax = icn - j;
+		for(s = 1; s < icm; s++){
+			if(icStartRows[s] >= jMax)
+				break; //Possible values of j are limited
+				
+			float temp = 0.0f;
+			while(mapindex <= MaxIndizes[s] && ( k = DiagMap[mapindex].k) <= j) {
+				temp -= l[DiagMap[mapindex].sss][j - k]*l[DiagMap[mapindex].ss][j - k]*d[j - k];
+				mapindex ++;
+			}
+			sss = findmap[s];
+			l[s][j] = id * (sss < 0 ? temp : (Diagonals[sss][j] + temp));
+		}
+	}
+	delete[] DiagMap;
+	delete[] MaxIndizes;
+	delete[] findmap;
+	IncompleteCholeskyFactorization = ic;
+	return true;
+}
+
+void MultiDiagonalSymmetricMatrix::KillIncompleteCholeskyFactorization(void){
+	delete IncompleteCholeskyFactorization;
+}
+
+void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float* RESTRICT x, float* RESTRICT b){
+	//We want to solve L D Lt x = b where D is a diagonal matrix described by Diagonals[0] and L is a unit lower triagular matrix described by the rest of the diagonals.
+	//Let D Lt x = y. Then, first solve L y = b.
+	float* RESTRICT  *d = IncompleteCholeskyFactorization->Diagonals;
+	int* RESTRICT s = IncompleteCholeskyFactorization->StartRows;
+	int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
+	int i, j;
+	
+	if(M != DIAGONALSP1){					// can happen in theory
+		for(j = 0; j < N; j++){
+			float sub = b[j];					// using local var to reduce memory writes, gave a big speedup
+			i = 1;
+			int c = j - s[i];
+			while(c >= 0) {
+				sub -= d[i][c]*x[c];
+				i++;
+				c = j - s[i];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+	} else {							   // that's the case almost every time
+		for(j = 0; j <= s[M-1]; j++){
+			float sub = b[j];					// using local var to reduce memory writes, gave a big speedup
+			i = 1;
+			int c = j - s[1];
+			while(c >= 0) {
+				sub -= d[i][c]*x[c];
+				i++;
+				c = j - s[i];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+		for(j = s[M-1]+1; j<N; j++){
+			float sub = b[j];              		// using local var to reduce memory writes, gave a big speedup
+			for(int i=DIAGONALSP1-1;i>0;i--){		// using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
+				sub -= d[i][j-s[i]]*x[j-s[i]];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+	}
+
+	//Now, solve x from D Lt x = y -> Lt x = D^-1 y
+// Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for(j = 0; j < N; j++)
+		x[j] = x[j]/d[0][j];
+
+	if(M != DIAGONALSP1){					// can happen in theory
+		while(j-- > 0){
+			float sub = x[j];					// using local var to reduce memory writes, gave a big speedup
+			i=1;
+			int c = j+s[1];
+			while(c < N) {
+				sub -= d[i][j]*x[c];
+				i++;
+				c = j+s[i];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+	} else {								// that's the case almost every time
+		for(j=N-1;j>=(N-1)-s[M-1];j--) {
+			float sub = x[j];					// using local var to reduce memory writes, gave a big speedup
+			i=1;
+			int c = j+s[1];
+			while(c < N) {
+				sub -= d[i][j]*x[j+s[i]];
+				i++;
+				c = j+s[i];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+		for(j=(N-2)-s[M-1];j>=0;j--) {
+			float sub = x[j];					// using local var to reduce memory writes, gave a big speedup
+			for(int i=DIAGONALSP1-1;i>0;i--){		// using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
+				sub -= d[i][j]*x[j + s[i]];
+			}
+			x[j] = sub;							// only one memory-write per j
+		}
+	}
+}
+
+EdgePreservingDecomposition::EdgePreservingDecomposition(int width, int height){
+	w = width;
+	h = height;
+	n = w*h;
+
+	//Initialize the matrix just once at construction.
+	A = new MultiDiagonalSymmetricMatrix(n, DIAGONALS);
+	if(!(
+		A->CreateDiagonal(0, 0) &&
+		A->CreateDiagonal(1, 1) &&
+		A->CreateDiagonal(2, w - 1) &&
+		A->CreateDiagonal(3, w) &&
+		A->CreateDiagonal(4, w + 1))){
+		delete A;
+		A = NULL;
+		printf("Error in EdgePreservingDecomposition construction: out of memory.\n");
+	}else{
+		a0    = A->Diagonals[0];
+		a_1   = A->Diagonals[1];
+		a_w1  = A->Diagonals[2];
+		a_w   = A->Diagonals[3];
+		a_w_1 = A->Diagonals[4];
+	}
+}
+
+EdgePreservingDecomposition::~EdgePreservingDecomposition(){
+	delete A;
+}
+
+SSEFUNCTION float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float EdgeStopping, int Iterates, float *Blur, bool UseBlurForEdgeStop){
+
+	if(Blur == NULL)
+		UseBlurForEdgeStop = false,	//Use source if there's no supplied Blur.
+		Blur = new float[n];
+
+	if(Scale == 0.0f){
+		memcpy(Blur, Source, n*sizeof(float));
+		return Blur;
+	}
+
+	//Create the edge stopping function a, rotationally symmetric and just one instead of (ax, ay). Maybe don't need Blur yet, so use its memory.
+	float* RESTRICT a;
+	float* RESTRICT g;
+	if(UseBlurForEdgeStop) a = new float[n], g = Blur;
+	else a = Blur, g = Source;
+
+	int i;
+	int w1 = w - 1, h1 = h - 1;
+//	float eps = 0.02f;
+	const float sqreps = 0.0004f;                           // removed eps*eps from inner loop
+
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+#ifdef __SSE2__
+	int x;
+	__m128 gxv,gyv;
+	__m128 Scalev = _mm_set1_ps( Scale );
+	__m128 sqrepsv = _mm_set1_ps( sqreps );
+	__m128 EdgeStoppingv = _mm_set1_ps( -EdgeStopping );
+	__m128 zd5v = _mm_set1_ps( 0.5f );
+#endif
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int y = 0; y < h1; y++){
+		float *rg = &g[w*y];
+#ifdef __SSE2__
+		for(x = 0; x < w1-3; x+=4){
+			//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
+			gxv = (LVFU(rg[x + 1]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + w]));
+			gyv = (LVFU(rg[x + w]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + 1]));
+			//Apply power to the magnitude of the gradient to get the edge stopping function.
+			_mm_storeu_ps( &a[x + w*y], Scalev * pow_F((zd5v*_mm_sqrt_ps(gxv*gxv + gyv*gyv + sqrepsv)), EdgeStoppingv) );
+		}
+		for(; x < w1; x++){
+			//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
+			float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
+			float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
+			//Apply power to the magnitude of the gradient to get the edge stopping function.
+			a[x + w*y] = Scale*pow_F(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
+		}
+#else
+		for(int x = 0; x < w1; x++){
+			//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
+			float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
+			float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
+
+			//Apply power to the magnitude of the gradient to get the edge stopping function.
+			a[x + w*y] = Scale*pow_F(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
+		}
+#endif
+	}
+}
+
+
+	/* Now setup the linear problem. I use the Maxima CAS, here's code for making an FEM formulation for the smoothness term:
+		p(x, y) := (1 - x)*(1 - y);
+		P(m, n) := A[m][n]*p(x, y) + A[m + 1][n]*p(1 - x, y) + A[m + 1][n + 1]*p(1 - x, 1 - y) + A[m][n + 1]*p(x, 1 - y);
+		Integrate(f) := integrate(integrate(f, x, 0, 1), y, 0, 1);
+
+		Integrate(diff(P(u, v), x)*diff(p(x, y), x) + diff(P(u, v), y)*diff(p(x, y), y));
+		Integrate(diff(P(u - 1, v), x)*diff(p(1 - x, y), x) + diff(P(u - 1, v), y)*diff(p(1 - x, y), y));
+		Integrate(diff(P(u - 1, v - 1), x)*diff(p(1 - x, 1 - y), x) + diff(P(u - 1, v - 1), y)*diff(p(1 - x, 1 - y), y));
+		Integrate(diff(P(u, v - 1), x)*diff(p(x, 1 - y), x) + diff(P(u, v - 1), y)*diff(p(x, 1 - y), y));
+	So yeah. Use the numeric results of that to fill the matrix A.*/
+
+	memset(a_1, 0, A->DiagonalLength(1)*sizeof(float));
+	memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
+	memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
+	memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
+
+
+// checked for race condition here
+// a0[] is read and write but adressed by i only
+// a[] is read only
+// a_w_1 is write only
+// a_w is write only
+// a_w1 is write only
+// a_1 is write only
+// So, there should be no race conditions
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	for(int y = 0; y < h; y++){
+        int i = y*w;
+		for(int x = 0; x < w; x++, i++){
+			float ac,a0temp;
+			a0temp = 0.25f;
+
+			//Remember, only fill the lower triangle. Memory for upper is never made. It's symmetric. Trust.
+			if(x > 0 && y > 0) {						
+				ac = a[i - w - 1]/6.0f;
+				a_w_1[i - w - 1] -= 2.0f*ac;
+				a_w[i - w] -= ac;
+				a_1[i - 1] -= ac;
+				a0temp += ac;
+			}
+			if(x < w1 && y > 0) {
+				ac = a[i - w]/6.0f;
+				a_w[i - w] -= ac;
+				a_w1[i - w + 1] -= 2.0f*ac;
+				a0temp += ac;
+			}
+			if(x > 0 && y < h1) {
+				ac = a[i - 1]/6.0f;
+				a_1[i - 1] -= ac;
+				a0temp += ac;
+			}
+			if(x < w1 && y < h1)
+				a0temp += a[i]/6.0f;
+			a0[i] = 4.0f*a0temp;
+		}
+	}
+
+  if(UseBlurForEdgeStop) delete[] a;
+  //Solve & return.
+  bool success=A->CreateIncompleteCholeskyFactorization(1); //Fill-in of 1 seems to work really good. More doesn't really help and less hurts (slightly).
+  if(!success) {
+    fprintf(stderr,"Error: Tonemapping has failed.\n");
+    memset(Blur, 0, sizeof(float)*n);  // On failure, set the blur to zero.  This is subsequently exponentiated in CompressDynamicRange.
+    return Blur;
+  }
+  if(!UseBlurForEdgeStop) memcpy(Blur, Source, n*sizeof(float));
+  SparseConjugateGradient(A->PassThroughVectorProduct, Source, n, false, Blur, 0.0f, (void *)A, Iterates, A->PassThroughCholeskyBackSolve);
+  A->KillIncompleteCholeskyFactorization();
+  return Blur;
+}
+
+float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, int Iterates, int Reweightings, float *Blur){
+	//Simpler outcome?
+	if(Reweightings == 0) return CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur);
+
+	//Create a blur here, initialize.
+	if(Blur == NULL) Blur = new float[n];
+	memcpy(Blur, Source, n*sizeof(float));
+
+	//Iteratively improve the blur.
+	Reweightings++;
+	for(int i = 0; i < Reweightings; i++)
+		CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur, true);
+
+	return Blur;
+}
+
+SSEFUNCTION float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Scale, float EdgeStopping, float CompressionExponent, float DetailBoost, int Iterates, int Reweightings, float *Compressed){
+	if(w<300 && h<300) // set number of Reweightings to zero for small images (thumbnails). We could try to find a better solution here.
+		Reweightings = 0;
+		
+	//Small number intended to prevent division by zero. This is different from the eps in CreateBlur.
+	const float eps = 0.0001f;
+
+	//We're working with luminance, which does better logarithmic.
+#ifdef __SSE2__
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	__m128 epsv = _mm_set1_ps( eps );
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int ii = 0; ii < n-3; ii+=4)
+		_mm_storeu_ps( &Source[ii], xlogf(LVFU(Source[ii]) + epsv));
+}
+	for(int ii = n-(n%4); ii < n; ii++)
+		Source[ii] = xlogf(Source[ii] + eps);
+
+#else
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	for(int ii = 0; ii < n; ii++)
+		Source[ii] = xlogf(Source[ii] + eps);
+#endif
+	
+	//Blur. Also setup memory for Compressed (we can just use u since each element of u is used in one calculation).
+	float *u = CreateIteratedBlur(Source, Scale, EdgeStopping, Iterates, Reweightings);
+	if(Compressed == NULL) Compressed = u;
+
+	//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
+//	float temp = CompressionExponent - 1.0f;
+	float temp;
+	if(DetailBoost>0.f) {
+	float betemp=expf(-(2.f-DetailBoost+0.694f))-1.f;//0.694 = log(2)
+	temp = 1.2f*xlogf( -betemp);
+	}
+	else temp= CompressionExponent - 1.0f;
+#ifdef __SSE2__
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	__m128 cev, uev, sourcev;
+	__m128 epsv = _mm_set1_ps( eps );
+	__m128 DetailBoostv = _mm_set1_ps( DetailBoost );
+	__m128 tempv = _mm_set1_ps( temp );
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int i = 0; i < n-3; i+=4){
+		cev = xexpf(LVFU(Source[i]) + LVFU(u[i])*(tempv)) - epsv;
+		uev = xexpf(LVFU(u[i])) - epsv;
+		sourcev = xexpf(LVFU(Source[i])) - epsv;
+		_mm_storeu_ps( &Source[i], sourcev);
+		_mm_storeu_ps( &Compressed[i], cev + DetailBoostv * (sourcev - uev) );
+	}
+}
+	for(int i=n-(n%4); i < n; i++){
+		float ce = xexpf(Source[i] + u[i]*(temp)) - eps;
+		float ue = xexpf(u[i]) - eps;
+		Source[i] = xexpf(Source[i]) - eps;
+		Compressed[i] = ce + DetailBoost*(Source[i] - ue);
+	}
+	
+#else
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	for(int i = 0; i < n; i++){
+		float ce = xexpf(Source[i] + u[i]*(temp)) - eps;
+		float ue = xexpf(u[i]) - eps;
+		Source[i] = xexpf(Source[i]) - eps;
+		Compressed[i] = ce + DetailBoost*(Source[i] - ue);
+	}
+#endif
+
+	if(Compressed != u) delete[] u;
+	return Compressed;
+
+}
+