Tonemapping optimization, Issue 1895
This commit is contained in:
@@ -4,6 +4,11 @@
|
|||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#endif
|
#endif
|
||||||
|
#include "sleef.c"
|
||||||
|
#define pow_F(a,b) (xexpf(b*xlogf(a)))
|
||||||
|
|
||||||
|
#define DIAGONALS 5
|
||||||
|
#define DIAGONALSP1 6
|
||||||
|
|
||||||
/* Solves A x = b by the conjugate gradient method, where instead of feeding it the matrix A you feed it a function which
|
/* Solves A x = b by the conjugate gradient method, where instead of feeding it the matrix A you feed it a function which
|
||||||
calculates A x where x is some vector. Stops when rms residual < RMSResidual or when maximum iterates is reached.
|
calculates A x where x is some vector. Stops when rms residual < RMSResidual or when maximum iterates is reached.
|
||||||
@@ -11,15 +16,16 @@ Stops at n iterates if MaximumIterates = 0 since that many iterates gives exact
|
|||||||
definite problems only, which is what unconstrained smooth optimization pretty much always is.
|
definite problems only, which is what unconstrained smooth optimization pretty much always is.
|
||||||
Parameter pass can be passed through, containing whatever info you like it to contain (matrix info?).
|
Parameter pass can be passed through, containing whatever info you like it to contain (matrix info?).
|
||||||
Takes less memory with OkToModify_b = true, and Preconditioner = NULL. */
|
Takes less memory with OkToModify_b = true, and Preconditioner = NULL. */
|
||||||
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, unsigned int n, bool OkToModify_b,
|
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, int n, bool OkToModify_b,
|
||||||
float *x, float RMSResidual, void *Pass, unsigned int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass)){
|
float *x, float RMSResidual, void *Pass, int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass)){
|
||||||
unsigned int iterate, i;
|
int iterate, i;
|
||||||
|
|
||||||
float *r = new float[n];
|
|
||||||
|
|
||||||
|
char* buffer = (char*)malloc(2*n*sizeof(float)+128);
|
||||||
|
float *r = (float*)(buffer+64);
|
||||||
//Start r and x.
|
//Start r and x.
|
||||||
if(x == NULL){
|
if(x == NULL){
|
||||||
x = new float[n];
|
x = new float[n];
|
||||||
|
|
||||||
memset(x, 0, sizeof(float)*n); //Zero initial guess if x == NULL.
|
memset(x, 0, sizeof(float)*n); //Zero initial guess if x == NULL.
|
||||||
memcpy(r, b, sizeof(float)*n);
|
memcpy(r, b, sizeof(float)*n);
|
||||||
}else{
|
}else{
|
||||||
@@ -27,23 +33,25 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for // removed schedule(dynamic,10)
|
#pragma omp parallel for // removed schedule(dynamic,10)
|
||||||
#endif
|
#endif
|
||||||
for(int ii = 0; ii < n; ii++) r[ii] = b[ii] - r[ii]; //r = b - A x.
|
for(int ii = 0; ii < n; ii++)
|
||||||
|
r[ii] = b[ii] - r[ii]; //r = b - A x.
|
||||||
}
|
}
|
||||||
//s is preconditionment of r. Without, direct to r.
|
//s is preconditionment of r. Without, direct to r.
|
||||||
float *s = r, rs = 0.0f, fp=0.f;
|
float *s = r, rs = 0.0f;
|
||||||
if(Preconditioner != NULL){
|
if(Preconditioner != NULL){
|
||||||
s = new float[n];
|
s = new float[n];
|
||||||
|
|
||||||
Preconditioner(s, r, Pass);
|
Preconditioner(s, r, Pass);
|
||||||
}
|
}
|
||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for firstprivate(fp) reduction(+:rs) // removed schedule(dynamic,10)
|
#pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10)
|
||||||
#endif
|
#endif
|
||||||
for(int ii = 0; ii < n; ii++) {
|
for(int ii = 0; ii < n; ii++) {
|
||||||
fp = r[ii]*s[ii];
|
rs += r[ii]*s[ii];
|
||||||
rs=rs+fp;
|
|
||||||
}
|
}
|
||||||
//Search direction d.
|
//Search direction d.
|
||||||
float *d = new float[n];
|
float *d = (float*)(buffer + n*sizeof(float) + 128);
|
||||||
|
|
||||||
memcpy(d, s, sizeof(float)*n);
|
memcpy(d, s, sizeof(float)*n);
|
||||||
|
|
||||||
//Store calculations of Ax in this.
|
//Store calculations of Ax in this.
|
||||||
@@ -56,22 +64,26 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
//Get step size alpha, store ax while at it.
|
//Get step size alpha, store ax while at it.
|
||||||
float ab = 0.0f;
|
float ab = 0.0f;
|
||||||
Ax(ax, d, Pass);
|
Ax(ax, d, Pass);
|
||||||
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for reduction(+:ab)
|
#pragma omp parallel for reduction(+:ab)
|
||||||
for(int ii = 0; ii < n; ii++) ab += d[ii]*ax[ii];
|
#endif
|
||||||
|
for(int ii = 0; ii < n; ii++)
|
||||||
|
ab += d[ii]*ax[ii];
|
||||||
|
|
||||||
if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way.
|
if(ab == 0.0f) break; //So unlikely. It means perfectly converged or singular, stop either way.
|
||||||
ab = rs/ab;
|
ab = rs/ab;
|
||||||
|
|
||||||
//Update x and r with this step size.
|
//Update x and r with this step size.
|
||||||
float rms = 0.0;
|
float rms = 0.0;
|
||||||
//#pragma omp parallel for reduction(+:rms) // Omp makes it slower here. Don't know why
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel for reduction(+:rms)
|
||||||
|
#endif
|
||||||
for(int ii = 0; ii < n; ii++){
|
for(int ii = 0; ii < n; ii++){
|
||||||
x[ii] += ab*d[ii];
|
x[ii] += ab*d[ii];
|
||||||
r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
|
r[ii] -= ab*ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
|
||||||
rms += r[ii]*r[ii];
|
rms += r[ii]*r[ii];
|
||||||
}
|
}
|
||||||
rms = sqrtf(rms/n);
|
rms = sqrtf(rms/n);
|
||||||
//printf("%f\n", rms);
|
|
||||||
//Quit? This probably isn't the best stopping condition, but ok.
|
//Quit? This probably isn't the best stopping condition, but ok.
|
||||||
if(rms < RMSResidual) break;
|
if(rms < RMSResidual) break;
|
||||||
|
|
||||||
@@ -80,12 +92,39 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
//Get beta.
|
//Get beta.
|
||||||
ab = rs;
|
ab = rs;
|
||||||
rs = 0.0f;
|
rs = 0.0f;
|
||||||
//#pragma omp parallel for reduction(+:rs) // Omp makes it slower here. Don't know why
|
|
||||||
for(int ii = 0; ii < n; ii++) rs += r[ii]*s[ii];
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
float c = 0.0f;
|
||||||
|
float t;
|
||||||
|
float temp;
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp for reduction(+:rs) // Summation with error correction
|
||||||
|
#endif
|
||||||
|
for(int ii = 0; ii < n; ii++) {
|
||||||
|
temp = r[ii]*s[ii];
|
||||||
|
t = rs + temp;
|
||||||
|
if( fabsf(rs) >= fabsf(temp) )
|
||||||
|
c += ((rs-t) + temp);
|
||||||
|
else
|
||||||
|
c += ((temp-t)+rs);
|
||||||
|
rs = t;
|
||||||
|
}
|
||||||
|
rs += c;
|
||||||
|
}
|
||||||
|
|
||||||
ab = rs/ab;
|
ab = rs/ab;
|
||||||
|
|
||||||
//Update search direction p.
|
//Update search direction p.
|
||||||
for(int ii = 0; ii < n; ii++) d[ii] = s[ii] + ab*d[ii];
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
|
for(int ii = 0; ii < n; ii++)
|
||||||
|
d[ii] = s[ii] + ab*d[ii];
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(iterate == MaximumIterates)
|
if(iterate == MaximumIterates)
|
||||||
@@ -94,29 +133,46 @@ float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), fl
|
|||||||
|
|
||||||
if(ax != b) delete[] ax;
|
if(ax != b) delete[] ax;
|
||||||
if(s != r) delete[] s;
|
if(s != r) delete[] s;
|
||||||
delete[] r;
|
free(buffer);
|
||||||
delete[] d;
|
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
MultiDiagonalSymmetricMatrix::MultiDiagonalSymmetricMatrix(unsigned int Dimension, unsigned int NumberOfDiagonalsInLowerTriangle){
|
MultiDiagonalSymmetricMatrix::MultiDiagonalSymmetricMatrix(int Dimension, int NumberOfDiagonalsInLowerTriangle){
|
||||||
n = Dimension;
|
n = Dimension;
|
||||||
m = NumberOfDiagonalsInLowerTriangle;
|
m = NumberOfDiagonalsInLowerTriangle;
|
||||||
IncompleteCholeskyFactorization = NULL;
|
IncompleteCholeskyFactorization = NULL;
|
||||||
|
|
||||||
Diagonals = new float *[m];
|
Diagonals = new float *[m];
|
||||||
StartRows = new unsigned int [m];
|
StartRows = new int [m+1];
|
||||||
memset(Diagonals, 0, sizeof(float *)*m);
|
memset(Diagonals, 0, sizeof(float *)*m);
|
||||||
memset(StartRows, 0, sizeof(unsigned int)*m);
|
memset(StartRows, 0, sizeof(int)*(m+1));
|
||||||
|
StartRows[m] = n+1;
|
||||||
}
|
}
|
||||||
|
|
||||||
MultiDiagonalSymmetricMatrix::~MultiDiagonalSymmetricMatrix(){
|
MultiDiagonalSymmetricMatrix::~MultiDiagonalSymmetricMatrix(){
|
||||||
for(unsigned int i = 0; i != m; i++) delete[] Diagonals[i];
|
if(DiagBuffer != NULL)
|
||||||
|
free(buffer);
|
||||||
|
else
|
||||||
|
for(int i=0;i<m;i++)
|
||||||
|
delete[] Diagonals[i];
|
||||||
|
|
||||||
delete[] Diagonals;
|
delete[] Diagonals;
|
||||||
delete[] StartRows;
|
delete[] StartRows;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MultiDiagonalSymmetricMatrix::CreateDiagonal(unsigned int index, unsigned int StartRow){
|
bool MultiDiagonalSymmetricMatrix::CreateDiagonal(int index, int StartRow){
|
||||||
|
// Changed memory allocation for diagonals to avoid L1 conflict misses
|
||||||
|
// Falls back to original version if big block could not be allocated
|
||||||
|
int padding = 4096 - ((n*m*sizeof(float)) % 4096);
|
||||||
|
if(index == 0){
|
||||||
|
buffer = (char*)malloc( (n+padding) * m * sizeof(float)+ (m+16)*64 + 63);
|
||||||
|
if(buffer == NULL)
|
||||||
|
// no big memory block available => try to allocate smaller blocks
|
||||||
|
DiagBuffer = NULL;
|
||||||
|
else {
|
||||||
|
DiagBuffer = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
|
||||||
|
}
|
||||||
|
}
|
||||||
if(index >= m){
|
if(index >= m){
|
||||||
printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: invalid index.\n");
|
printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: invalid index.\n");
|
||||||
return false;
|
return false;
|
||||||
@@ -127,27 +183,32 @@ bool MultiDiagonalSymmetricMatrix::CreateDiagonal(unsigned int index, unsigned i
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] Diagonals[index];
|
if(DiagBuffer != NULL)
|
||||||
|
Diagonals[index] = (float*)(DiagBuffer+(index*(n+padding)*sizeof(float))+((index+16)*64));
|
||||||
|
else {
|
||||||
Diagonals[index] = new float[DiagonalLength(StartRow)];
|
Diagonals[index] = new float[DiagonalLength(StartRow)];
|
||||||
if(Diagonals[index] == NULL) {
|
if(Diagonals[index] == NULL) {
|
||||||
printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: memory allocation failed. Out of memory?\n");
|
printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: memory allocation failed. Out of memory?\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
StartRows[index] = StartRow;
|
StartRows[index] = StartRow;
|
||||||
memset(Diagonals[index], 0, sizeof(float)*DiagonalLength(StartRow));
|
memset(Diagonals[index], 0, sizeof(float)*DiagonalLength(StartRow));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int MultiDiagonalSymmetricMatrix::FindIndex(unsigned int StartRow){
|
inline int MultiDiagonalSymmetricMatrix::FindIndex(int StartRow) {
|
||||||
//There's GOT to be a better way to do this. "Bidirectional map?"
|
//There's GOT to be a better way to do this. "Bidirectional map?"
|
||||||
for(unsigned int i = 0; i != m; i++)
|
// Issue 1895 : Changed start of loop from zero to one
|
||||||
|
// m is small (5 or 6)
|
||||||
|
for(int i = 1; i < m; i++)
|
||||||
if(StartRows[i] == StartRow)
|
if(StartRows[i] == StartRow)
|
||||||
return i;
|
return i;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, unsigned int row, unsigned int column){
|
bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, int row, int column){
|
||||||
//On the strict upper triangle? Swap, this is ok due to symmetry.
|
//On the strict upper triangle? Swap, this is ok due to symmetry.
|
||||||
int i, sr;
|
int i, sr;
|
||||||
if(column > row)
|
if(column > row)
|
||||||
@@ -165,35 +226,55 @@ bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, unsigned int row, u
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MultiDiagonalSymmetricMatrix::VectorProduct(float *Product, float *x){
|
void MultiDiagonalSymmetricMatrix::VectorProduct(float* RESTRICT Product, float* RESTRICT x){
|
||||||
//Initialize to zero.
|
|
||||||
memset(Product, 0, n*sizeof(float));
|
|
||||||
|
|
||||||
//Loop over the stored diagonals.
|
//Loop over the stored diagonals.
|
||||||
for(unsigned int i = 0; i != m; i++){
|
for(int i = 0; i < m; i++){
|
||||||
unsigned int sr = StartRows[i];
|
int sr = StartRows[i];
|
||||||
float *a = Diagonals[i]; //One fewer dereference.
|
float *a = Diagonals[i]; //One fewer dereference.
|
||||||
unsigned int j, l = DiagonalLength(sr);
|
int l = DiagonalLength(sr);
|
||||||
|
|
||||||
if(sr == 0)
|
if(sr == 0)
|
||||||
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(j = 0; j < l; j++)
|
#endif
|
||||||
Product[j] += a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal.
|
for(int j = 0; j < l; j++)
|
||||||
|
Product[j] = a[j]*x[j]; //Separate, fairly simple treatment for the main diagonal.
|
||||||
else {
|
else {
|
||||||
// Split the loop in 2 parts, so now it can be parallelized without race conditions
|
// Split the loop in 3 parts, so now the big one in the middle can be parallelized without race conditions
|
||||||
#pragma omp parallel for
|
|
||||||
for(j = 0; j < l; j++) {
|
// updates 0 to sr - 1. Because sr is small (in the range of image-width) no benefit by omp
|
||||||
Product[j + sr] += a[j]*x[j]; //Contribution from lower...
|
for(int j=0;j<sr;j++) {
|
||||||
|
Product[j] += a[j]*x[j + sr]; //Contribution from upper triangle
|
||||||
|
}
|
||||||
|
|
||||||
|
// Updates sr to l - 1. Because sr is small and l is big, this loop is parallelized
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp for nowait
|
||||||
|
#endif
|
||||||
|
for(int j = sr; j < l; j++) {
|
||||||
|
Product[j] += a[j - sr]*x[j - sr] + a[j]*x[j + sr]; //Contribution from lower and upper triangle
|
||||||
|
}
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp single
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
// Updates l to l + sr - 1. Because sr is small (in the range of image-width) no benefit by omp
|
||||||
|
for(int j = l; j < l + sr; j++) {
|
||||||
|
Product[j] += a[j-sr]*x[j - sr]; //Contribution from lower triangle
|
||||||
}
|
}
|
||||||
#pragma omp parallel for
|
|
||||||
for(j = 0; j < l; j++) {
|
|
||||||
Product[j] += a[j]*x[j + sr]; //...and upper triangle.
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove){
|
}
|
||||||
|
|
||||||
|
bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(int MaxFillAbove){
|
||||||
if(m == 1){
|
if(m == 1){
|
||||||
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: just one diagonal? Can you divide?\n");
|
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: just one diagonal? Can you divide?\n");
|
||||||
return false;
|
return false;
|
||||||
@@ -202,20 +283,17 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
|
|||||||
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: main diagonal required to exist for this math.\n");
|
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: main diagonal required to exist for this math.\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//How many diagonals in the decomposition?
|
//How many diagonals in the decomposition?
|
||||||
MaxFillAbove++; //Conceptually, now "fill" includes an existing diagonal. Simpler in the math that follows.
|
MaxFillAbove++; //Conceptually, now "fill" includes an existing diagonal. Simpler in the math that follows.
|
||||||
unsigned int i, j, mic, fp;
|
int i, j, mic, fp;
|
||||||
mic=1;
|
mic=1;
|
||||||
fp=1;
|
fp=1;
|
||||||
#ifdef _OPENMP
|
|
||||||
#pragma omp parallel for firstprivate(fp) reduction(+:mic) // removed schedule(dynamic,10)
|
|
||||||
#endif
|
|
||||||
for(int ii = 1; ii < m; ii++) {
|
for(int ii = 1; ii < m; ii++) {
|
||||||
fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order.
|
fp = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove); //Guarunteed positive since StartRows must be created in increasing order.
|
||||||
mic=mic+fp;
|
mic=mic+fp;
|
||||||
}
|
}
|
||||||
//Initialize the decomposition - setup memory, start rows, etc.
|
//Initialize the decomposition - setup memory, start rows, etc.
|
||||||
|
|
||||||
MultiDiagonalSymmetricMatrix *ic = new MultiDiagonalSymmetricMatrix(n, mic);
|
MultiDiagonalSymmetricMatrix *ic = new MultiDiagonalSymmetricMatrix(n, mic);
|
||||||
ic->CreateDiagonal(0, 0); //There's always a main diagonal in this type of decomposition.
|
ic->CreateDiagonal(0, 0); //There's always a main diagonal in this type of decomposition.
|
||||||
mic=1;
|
mic=1;
|
||||||
@@ -233,67 +311,96 @@ bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(unsigne
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//It's all initialized? Uhkay. Do the actual math then.
|
//It's all initialized? Uhkay. Do the actual math then.
|
||||||
int sss, ss, s;
|
int sss, ss, s;
|
||||||
unsigned int k, MaxStartRow = StartRows[m - 1]; //Handy number.
|
int k, MaxStartRow = StartRows[m - 1]; //Handy number.
|
||||||
float **l = ic->Diagonals;
|
float **l = ic->Diagonals;
|
||||||
float *d = ic->Diagonals[0]; //Describes D in LDLt.
|
float *d = ic->Diagonals[0]; //Describes D in LDLt.
|
||||||
|
int icm = ic->m;
|
||||||
|
int icn = ic->n;
|
||||||
|
int* RESTRICT icStartRows = ic->StartRows;
|
||||||
|
|
||||||
//Loop over the columns.
|
//Loop over the columns.
|
||||||
for(j = 0; j != n; j++){
|
|
||||||
|
// create array for quicker access to ic->StartRows
|
||||||
|
struct s_diagmap {
|
||||||
|
int sss;
|
||||||
|
int ss;
|
||||||
|
int k;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Pass one: count number of needed entries
|
||||||
|
int entrycount = 0;
|
||||||
|
for(int i=1;i<icm;i++) {
|
||||||
|
for(int j=1;j<icm;j++) {
|
||||||
|
if(ic->FindIndex( icStartRows[i] + icStartRows[j]) > 0)
|
||||||
|
entrycount ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now we can create the array
|
||||||
|
struct s_diagmap* RESTRICT DiagMap = new s_diagmap[entrycount];
|
||||||
|
// we also need the maxvalues
|
||||||
|
int entrynumber = 0;
|
||||||
|
int index;
|
||||||
|
int* RESTRICT MaxIndizes = new int[icm];
|
||||||
|
|
||||||
|
for(int i=1;i<icm;i++) {
|
||||||
|
for(int j=1;j<icm;j++){
|
||||||
|
index = ic->FindIndex( icStartRows[i] + icStartRows[j]);
|
||||||
|
if(index > 0) {
|
||||||
|
DiagMap[entrynumber].ss = j;
|
||||||
|
DiagMap[entrynumber].sss = index;
|
||||||
|
DiagMap[entrynumber].k = icStartRows[j];
|
||||||
|
entrynumber ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MaxIndizes[i] = entrynumber - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int* RESTRICT findmap = new int[icm];
|
||||||
|
for(int j=0;j<icm;j++)
|
||||||
|
findmap[j] = FindIndex( icStartRows[j]);
|
||||||
|
|
||||||
|
for(j = 0; j < n; j++){
|
||||||
//Calculate d for this column.
|
//Calculate d for this column.
|
||||||
d[j] = Diagonals[0][j];
|
d[j] = Diagonals[0][j];
|
||||||
|
|
||||||
//This is a loop over k from 1 to j, inclusive. We'll cover that by looping over the index of the diagonals (s), and get k from it.
|
//This is a loop over k from 1 to j, inclusive. We'll cover that by looping over the index of the diagonals (s), and get k from it.
|
||||||
//The first diagonal is d (k = 0), so skip that and have s start at 1. Cover all available s but stop if k exceeds j.
|
//The first diagonal is d (k = 0), so skip that and have s start at 1. Cover all available s but stop if k exceeds j.
|
||||||
for(s = 1; s != ic->m; s++){
|
s=1;
|
||||||
k = ic->StartRows[s];
|
k=icStartRows[s];
|
||||||
if(k > j) break;
|
while(k<=j) {
|
||||||
d[j] -= l[s][j - k]*l[s][j - k]*d[j - k];
|
d[j] -= l[s][j - k]*l[s][j - k]*d[j - k];
|
||||||
|
s++;
|
||||||
|
k=icStartRows[s];
|
||||||
}
|
}
|
||||||
|
if(UNLIKELY(d[j] == 0.0f)){
|
||||||
if(d[j] == 0.0f){
|
|
||||||
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: division by zero. Matrix not decomposable.\n");
|
printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: division by zero. Matrix not decomposable.\n");
|
||||||
delete ic;
|
delete ic;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
float id = 1.0f/d[j];
|
float id = 1.0f/d[j];
|
||||||
|
|
||||||
//Now, calculate l from top down along this column.
|
//Now, calculate l from top down along this column.
|
||||||
for(s = 1; s != ic->m; s++){
|
|
||||||
i = ic->StartRows[s]; //Start row for this entry.
|
|
||||||
if(j >= ic->n - i) break; //Possible values of j are limited.
|
|
||||||
|
|
||||||
//Quicker access for an element of l.
|
int mapindex = 0;
|
||||||
float *lij = &l[s][j];
|
for(s = 1; s < icm; s++){
|
||||||
sss = FindIndex(i); //Find element in same spot in the source matrix. It might be a zero.
|
if(j >= icn - icStartRows[s]) break; //Possible values of j are limited.
|
||||||
*lij = sss < 0 ? 0.0f : Diagonals[sss][j];
|
|
||||||
|
|
||||||
//Similar to the loop involving d, convoluted by the fact that two l are involved.
|
float temp = 0.0f;
|
||||||
for(ss = 1; ss != ic->m; ss++){
|
|
||||||
k = ic->StartRows[ss];
|
|
||||||
if(k > j) break;
|
|
||||||
if(i + k > MaxStartRow) break; //Quick exit once k to big.
|
|
||||||
|
|
||||||
int sss = ic->FindIndex(i + k);
|
while(mapindex <= MaxIndizes[s] && ( k = DiagMap[mapindex].k) <= j) {
|
||||||
if(sss < 0) continue; //Asked for diagonal nonexistant. But there may be something later, so don't break.
|
temp -= l[DiagMap[mapindex].sss][j - k]*l[DiagMap[mapindex].ss][j - k]*d[j - k];
|
||||||
|
mapindex ++;
|
||||||
/* Let's think about the factors in the term below for a moment.
|
|
||||||
j varies from 0 to n - 1, so j - k is bounded inclusive by 0 and j - 1. So d[j - k] is always in the matrix.
|
|
||||||
|
|
||||||
l[sss] and l[ss] are diagonals with corresponding start rows i + k and k.
|
|
||||||
For l[sss][j - k] to exist, we must have j - k < n - (i + k) -> j < n - i, which was checked outside this loop and true at this point.
|
|
||||||
For l[ ss][j - k] to exist, we must have j - k < n - k -> j < n, which is true straight from definition.
|
|
||||||
|
|
||||||
So, no additional checks, all is good and within bounds at this point.*/
|
|
||||||
*lij -= l[sss][j - k]*l[ss][j - k]*d[j - k];
|
|
||||||
}
|
}
|
||||||
|
sss = findmap[s];
|
||||||
*lij *= id;
|
l[s][j] = sss < 0 ? temp * id : (Diagonals[sss][j] + temp) * id;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
delete[] DiagMap;
|
||||||
|
delete[] MaxIndizes;
|
||||||
|
delete[] findmap;
|
||||||
IncompleteCholeskyFactorization = ic;
|
IncompleteCholeskyFactorization = ic;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -302,58 +409,96 @@ void MultiDiagonalSymmetricMatrix::KillIncompleteCholeskyFactorization(void){
|
|||||||
delete IncompleteCholeskyFactorization;
|
delete IncompleteCholeskyFactorization;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float *x, float *b){
|
void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float* RESTRICT x, float* RESTRICT b){
|
||||||
//We want to solve L D Lt x = b where D is a diagonal matrix described by Diagonals[0] and L is a unit lower triagular matrix described by the rest of the diagonals.
|
//We want to solve L D Lt x = b where D is a diagonal matrix described by Diagonals[0] and L is a unit lower triagular matrix described by the rest of the diagonals.
|
||||||
//Let D Lt x = y. Then, first solve L y = b.
|
//Let D Lt x = y. Then, first solve L y = b.
|
||||||
float *y = new float[n];
|
float* RESTRICT *d = IncompleteCholeskyFactorization->Diagonals;
|
||||||
float **d = IncompleteCholeskyFactorization->Diagonals;
|
int* RESTRICT s = IncompleteCholeskyFactorization->StartRows;
|
||||||
unsigned int *s = IncompleteCholeskyFactorization->StartRows;
|
int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
|
||||||
unsigned int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
|
int i, j;
|
||||||
unsigned int i, j;
|
|
||||||
for(j = 0; j != N; j++){
|
|
||||||
float sub = 0; // using local var to reduce memory writes, gave a big speedup
|
|
||||||
for(i = 1; i != M; i++){ //Start at 1 because zero is D.
|
|
||||||
|
|
||||||
int c = (int)j - (int)s[i];
|
if(M != DIAGONALSP1){ // can happen in theory
|
||||||
if(c < 0) break; //Due to ordering of StartRows, no further contributions.
|
for(j = 0; j < N; j++){
|
||||||
if(c==j) {
|
float sub = b[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
sub += d[i][c]*b[c]; //Because y is not filled yet, we have to access b
|
i = 1;
|
||||||
|
int c = j - s[i];
|
||||||
|
while(c >= 0) {
|
||||||
|
sub -= d[i][c]*x[c];
|
||||||
|
i++;
|
||||||
|
c = j - s[i];
|
||||||
}
|
}
|
||||||
else {
|
x[j] = sub; // only one memory-write per j
|
||||||
sub += d[i][c]*y[c];
|
|
||||||
}
|
}
|
||||||
|
} else { // that's the case almost every time
|
||||||
|
for(j = 0; j <= s[M-1]; j++){
|
||||||
|
float sub = b[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
|
i = 1;
|
||||||
|
int c = j - s[1];
|
||||||
|
while(c >= 0) {
|
||||||
|
sub -= d[i][c]*x[c];
|
||||||
|
i++;
|
||||||
|
c = j - s[i];
|
||||||
|
}
|
||||||
|
x[j] = sub; // only one memory-write per j
|
||||||
|
}
|
||||||
|
for(j = s[M-1]+1; j<N; j++){
|
||||||
|
float sub = b[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
|
for(int i=1;i<DIAGONALSP1;i++){ // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
|
||||||
|
sub -= d[i][j-s[i]]*x[j-s[i]];
|
||||||
|
}
|
||||||
|
x[j] = sub; // only one memory-write per j
|
||||||
}
|
}
|
||||||
y[j] = b[j] - sub; // only one memory-write per j
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Now, solve x from D Lt x = y -> Lt x = D^-1 y
|
//Now, solve x from D Lt x = y -> Lt x = D^-1 y
|
||||||
// Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
|
// Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
|
||||||
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
for(j = 0; j < N; j++)
|
for(j = 0; j < N; j++)
|
||||||
x[j] = y[j]/d[0][j];
|
x[j] = x[j]/d[0][j];
|
||||||
|
|
||||||
while(j-- != 0){
|
if(M != DIAGONALSP1){ // can happen in theory
|
||||||
float sub = 0; // using local var to reduce memory writes, gave a big speedup
|
while(j-- > 0){
|
||||||
for(i = 1; i != M; i++){
|
float sub = x[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
if(j + s[i] >= N) break;
|
i=1;
|
||||||
sub += d[i][j]*x[j + s[i]];
|
int c = j+s[1];
|
||||||
|
while(c < N) {
|
||||||
|
sub -= d[i][j]*x[c];
|
||||||
|
i++;
|
||||||
|
c = j+s[i];
|
||||||
|
}
|
||||||
|
x[j] = sub; // only one memory-write per j
|
||||||
|
}
|
||||||
|
} else { // that's the case almost every time
|
||||||
|
for(j=N-1;j>=(N-1)-s[M-1];j--) {
|
||||||
|
float sub = x[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
|
i=1;
|
||||||
|
int c = j+s[1];
|
||||||
|
while(c < N) {
|
||||||
|
sub -= d[i][j]*x[j+s[i]];
|
||||||
|
i++;
|
||||||
|
c = j+s[i];
|
||||||
|
}
|
||||||
|
x[j] = sub; // only one memory-write per j
|
||||||
|
}
|
||||||
|
for(j=(N-2)-s[M-1];j>=0;j--) {
|
||||||
|
float sub = x[j]; // using local var to reduce memory writes, gave a big speedup
|
||||||
|
for(int i=1;i<DIAGONALSP1;i++){ // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
|
||||||
|
sub -= d[i][j]*x[j + s[i]];
|
||||||
|
}
|
||||||
|
x[j] = sub; // only one memory-write per j
|
||||||
|
}
|
||||||
}
|
}
|
||||||
x[j] -= sub; // only one memory-write per j
|
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] y;
|
EdgePreservingDecomposition::EdgePreservingDecomposition(int width, int height){
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
EdgePreservingDecomposition::EdgePreservingDecomposition(unsigned int width, unsigned int height){
|
|
||||||
w = width;
|
w = width;
|
||||||
h = height;
|
h = height;
|
||||||
n = w*h;
|
n = w*h;
|
||||||
|
|
||||||
//Initialize the matrix just once at construction.
|
//Initialize the matrix just once at construction.
|
||||||
A = new MultiDiagonalSymmetricMatrix(n, 5);
|
A = new MultiDiagonalSymmetricMatrix(n, DIAGONALS);
|
||||||
if(!(
|
if(!(
|
||||||
A->CreateDiagonal(0, 0) &&
|
A->CreateDiagonal(0, 0) &&
|
||||||
A->CreateDiagonal(1, 1) &&
|
A->CreateDiagonal(1, 1) &&
|
||||||
@@ -376,41 +521,76 @@ EdgePreservingDecomposition::~EdgePreservingDecomposition(){
|
|||||||
delete A;
|
delete A;
|
||||||
}
|
}
|
||||||
|
|
||||||
float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float EdgeStopping, unsigned int Iterates, float *Blur, bool UseBlurForEdgeStop){
|
SSEFUNCTION float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float EdgeStopping, int Iterates, float *Blur, bool UseBlurForEdgeStop){
|
||||||
|
|
||||||
if(Blur == NULL)
|
if(Blur == NULL)
|
||||||
UseBlurForEdgeStop = false, //Use source if there's no supplied Blur.
|
UseBlurForEdgeStop = false, //Use source if there's no supplied Blur.
|
||||||
Blur = new float[n];
|
Blur = new float[n];
|
||||||
|
|
||||||
if(Scale == 0.0f){
|
if(Scale == 0.0f){
|
||||||
memcpy(Blur, Source, n*sizeof(float));
|
memcpy(Blur, Source, n*sizeof(float));
|
||||||
return Blur;
|
return Blur;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Create the edge stopping function a, rotationally symmetric and just one instead of (ax, ay). Maybe don't need Blur yet, so use its memory.
|
//Create the edge stopping function a, rotationally symmetric and just one instead of (ax, ay). Maybe don't need Blur yet, so use its memory.
|
||||||
float *a, *g;
|
float* RESTRICT a;
|
||||||
|
float* RESTRICT g;
|
||||||
if(UseBlurForEdgeStop) a = new float[n], g = Blur;
|
if(UseBlurForEdgeStop) a = new float[n], g = Blur;
|
||||||
else a = Blur, g = Source;
|
else a = Blur, g = Source;
|
||||||
|
|
||||||
//unsigned int x, y;
|
int i;
|
||||||
unsigned int i;
|
int w1 = w - 1, h1 = h - 1;
|
||||||
unsigned int w1 = w - 1, h1 = h - 1;
|
|
||||||
// float eps = 0.02f;
|
// float eps = 0.02f;
|
||||||
const float sqreps = 0.0004f; // removed eps*eps from inner loop
|
const float sqreps = 0.0004f; // removed eps*eps from inner loop
|
||||||
// float ScaleConstant = Scale * powf(0.5f,-EdgeStopping);
|
|
||||||
|
|
||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for // removed schedule(dynamic,10)
|
#pragma omp parallel
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
#ifdef __SSE2__
|
||||||
|
int x;
|
||||||
|
__m128 gxv,gyv;
|
||||||
|
__m128 Scalev = _mm_set1_ps( Scale );
|
||||||
|
__m128 sqrepsv = _mm_set1_ps( sqreps );
|
||||||
|
__m128 EdgeStoppingv = _mm_set1_ps( -EdgeStopping );
|
||||||
|
__m128 zd5v = _mm_set1_ps( 0.5f );
|
||||||
|
__m128 temp1v, temp2v;
|
||||||
|
#endif
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp for
|
||||||
#endif
|
#endif
|
||||||
for(int y = 0; y < h1; y++){
|
for(int y = 0; y < h1; y++){
|
||||||
float *rg = &g[w*y];
|
float *rg = &g[w*y];
|
||||||
|
#ifdef __SSE2__
|
||||||
|
for(x = 0; x < w1-3; x+=4){
|
||||||
|
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
|
||||||
|
gxv = (LVFU(rg[x + 1]) - LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + w]));
|
||||||
|
gyv = (LVFU(rg[x + w]) - LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + 1]));
|
||||||
|
//Apply power to the magnitude of the gradient to get the edge stopping function.
|
||||||
|
_mm_storeu_ps( &a[x + w*y], Scalev * pow_F((zd5v*_mm_sqrt_ps(gxv*gxv + gyv*gyv + sqrepsv)), EdgeStoppingv) );
|
||||||
|
}
|
||||||
|
for(; x < w1; x++){
|
||||||
|
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
|
||||||
|
float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
|
||||||
|
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
|
||||||
|
//Apply power to the magnitude of the gradient to get the edge stopping function.
|
||||||
|
a[x + w*y] = Scale*pow_F(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
|
||||||
|
}
|
||||||
|
#else
|
||||||
for(int x = 0; x < w1; x++){
|
for(int x = 0; x < w1; x++){
|
||||||
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
|
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
|
||||||
float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
|
float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
|
||||||
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
|
float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
|
||||||
|
|
||||||
//Apply power to the magnitude of the gradient to get the edge stopping function.
|
//Apply power to the magnitude of the gradient to get the edge stopping function.
|
||||||
a[x + w*y] = Scale*powf(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
|
a[x + w*y] = Scale*pow_F(0.5f*sqrtf(gx*gx + gy*gy + sqreps), -EdgeStopping);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//unsigned int x,y;
|
|
||||||
|
|
||||||
/* Now setup the linear problem. I use the Maxima CAS, here's code for making an FEM formulation for the smoothness term:
|
/* Now setup the linear problem. I use the Maxima CAS, here's code for making an FEM formulation for the smoothness term:
|
||||||
p(x, y) := (1 - x)*(1 - y);
|
p(x, y) := (1 - x)*(1 - y);
|
||||||
P(m, n) := A[m][n]*p(x, y) + A[m + 1][n]*p(1 - x, y) + A[m + 1][n + 1]*p(1 - x, 1 - y) + A[m][n + 1]*p(x, 1 - y);
|
P(m, n) := A[m][n]*p(x, y) + A[m + 1][n]*p(1 - x, y) + A[m + 1][n + 1]*p(1 - x, 1 - y) + A[m][n + 1]*p(x, 1 - y);
|
||||||
@@ -421,11 +601,12 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
|
|||||||
Integrate(diff(P(u - 1, v - 1), x)*diff(p(1 - x, 1 - y), x) + diff(P(u - 1, v - 1), y)*diff(p(1 - x, 1 - y), y));
|
Integrate(diff(P(u - 1, v - 1), x)*diff(p(1 - x, 1 - y), x) + diff(P(u - 1, v - 1), y)*diff(p(1 - x, 1 - y), y));
|
||||||
Integrate(diff(P(u, v - 1), x)*diff(p(x, 1 - y), x) + diff(P(u, v - 1), y)*diff(p(x, 1 - y), y));
|
Integrate(diff(P(u, v - 1), x)*diff(p(x, 1 - y), x) + diff(P(u, v - 1), y)*diff(p(x, 1 - y), y));
|
||||||
So yeah. Use the numeric results of that to fill the matrix A.*/
|
So yeah. Use the numeric results of that to fill the matrix A.*/
|
||||||
|
|
||||||
memset(a_1, 0, A->DiagonalLength(1)*sizeof(float));
|
memset(a_1, 0, A->DiagonalLength(1)*sizeof(float));
|
||||||
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
|
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
|
||||||
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
|
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
|
||||||
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
|
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
|
||||||
// unsigned int x, y;
|
|
||||||
|
|
||||||
// checked for race condition here
|
// checked for race condition here
|
||||||
// a0[] is read and write but adressed by i only
|
// a0[] is read and write but adressed by i only
|
||||||
@@ -435,35 +616,42 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
|
|||||||
// a_w1 is write only
|
// a_w1 is write only
|
||||||
// a_1 is write only
|
// a_1 is write only
|
||||||
// So, there should be no race conditions
|
// So, there should be no race conditions
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
for(int y = 0; y < h; y++){
|
for(int y = 0; y < h; y++){
|
||||||
unsigned int i = y*w;
|
int i = y*w;
|
||||||
for(int x = 0; x != w; x++, i++){
|
for(int x = 0; x < w; x++, i++){
|
||||||
float ac;
|
float ac,a0temp;
|
||||||
a0[i] = 1.0;
|
a0temp = 0.25f;
|
||||||
|
|
||||||
//Remember, only fill the lower triangle. Memory for upper is never made. It's symmetric. Trust.
|
//Remember, only fill the lower triangle. Memory for upper is never made. It's symmetric. Trust.
|
||||||
if(x > 0 && y > 0)
|
if(x > 0 && y > 0) {
|
||||||
ac = a[i - w - 1]/6.0f,
|
ac = a[i - w - 1]/6.0f;
|
||||||
a_w_1[i - w - 1] -= 2.0f*ac, a_w[i - w] -= ac,
|
a_w_1[i - w - 1] -= 2.0f*ac;
|
||||||
a_1[i - 1] -= ac, a0[i] += 4.0f*ac;
|
a_w[i - w] -= ac;
|
||||||
|
a_1[i - 1] -= ac;
|
||||||
if(x < w1 && y > 0)
|
a0temp += ac;
|
||||||
ac = a[i - w]/6.0f,
|
}
|
||||||
a_w[i - w] -= ac, a_w1[i - w + 1] -= 2.0f*ac,
|
if(x < w1 && y > 0) {
|
||||||
a0[i] += 4.0f*ac;
|
ac = a[i - w]/6.0f;
|
||||||
|
a_w[i - w] -= ac;
|
||||||
if(x > 0 && y < h1)
|
a_w1[i - w + 1] -= 2.0f*ac;
|
||||||
ac = a[i - 1]/6.0f,
|
a0temp += ac;
|
||||||
a_1[i - 1] -= ac, a0[i] += 4.0f*ac;
|
}
|
||||||
|
if(x > 0 && y < h1) {
|
||||||
|
ac = a[i - 1]/6.0f;
|
||||||
|
a_1[i - 1] -= ac;
|
||||||
|
a0temp += ac;
|
||||||
|
}
|
||||||
if(x < w1 && y < h1)
|
if(x < w1 && y < h1)
|
||||||
a0[i] += 4.0f*a[i]/6.0f;
|
a0temp += a[i]/6.0f;
|
||||||
|
a0[i] = 4.0f*a0temp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(UseBlurForEdgeStop) delete[] a;
|
if(UseBlurForEdgeStop) delete[] a;
|
||||||
|
|
||||||
//Solve & return.
|
//Solve & return.
|
||||||
bool success=A->CreateIncompleteCholeskyFactorization(1); //Fill-in of 1 seems to work really good. More doesn't really help and less hurts (slightly).
|
bool success=A->CreateIncompleteCholeskyFactorization(1); //Fill-in of 1 seems to work really good. More doesn't really help and less hurts (slightly).
|
||||||
if(!success) {
|
if(!success) {
|
||||||
@@ -477,7 +665,7 @@ float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float
|
|||||||
return Blur;
|
return Blur;
|
||||||
}
|
}
|
||||||
|
|
||||||
float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, unsigned int Iterates, unsigned int Reweightings, float *Blur){
|
float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, int Iterates, int Reweightings, float *Blur){
|
||||||
//Simpler outcome?
|
//Simpler outcome?
|
||||||
if(Reweightings == 0) return CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur);
|
if(Reweightings == 0) return CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur);
|
||||||
|
|
||||||
@@ -487,39 +675,86 @@ float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scal
|
|||||||
|
|
||||||
//Iteratively improve the blur.
|
//Iteratively improve the blur.
|
||||||
Reweightings++;
|
Reweightings++;
|
||||||
for(unsigned int i = 0; i < Reweightings; i++)
|
for(int i = 0; i < Reweightings; i++)
|
||||||
CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur, true);
|
CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur, true);
|
||||||
|
|
||||||
return Blur;
|
return Blur;
|
||||||
}
|
}
|
||||||
|
|
||||||
float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Scale, float EdgeStopping, float CompressionExponent, float DetailBoost, unsigned int Iterates, unsigned int Reweightings, float *Compressed){
|
SSEFUNCTION float *EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Scale, float EdgeStopping, float CompressionExponent, float DetailBoost, int Iterates, int Reweightings, float *Compressed){
|
||||||
//Small number intended to prevent division by zero. This is different from the eps in CreateBlur.
|
//Small number intended to prevent division by zero. This is different from the eps in CreateBlur.
|
||||||
const float eps = 0.0001f;
|
const float eps = 0.0001f;
|
||||||
|
|
||||||
//We're working with luminance, which does better logarithmic.
|
//We're working with luminance, which does better logarithmic.
|
||||||
unsigned int i;
|
#ifdef __SSE2__
|
||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for // removed schedule(dynamic,10)
|
#pragma omp parallel
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
__m128 epsv = _mm_set1_ps( eps );
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp for
|
||||||
|
#endif
|
||||||
|
for(int ii = 0; ii < n-3; ii+=4)
|
||||||
|
_mm_storeu_ps( &Source[ii], xlogf(LVFU(Source[ii]) + epsv));
|
||||||
|
}
|
||||||
|
for(int ii = n-(n%4); ii < n; ii++)
|
||||||
|
Source[ii] = xlogf(Source[ii] + eps);
|
||||||
|
|
||||||
|
#else
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(int ii = 0; ii < n; ii++)
|
for(int ii = 0; ii < n; ii++)
|
||||||
Source[ii] = logf(Source[ii] + eps);
|
Source[ii] = xlogf(Source[ii] + eps);
|
||||||
|
#endif
|
||||||
|
|
||||||
//Blur. Also setup memory for Compressed (we can just use u since each element of u is used in one calculation).
|
//Blur. Also setup memory for Compressed (we can just use u since each element of u is used in one calculation).
|
||||||
float *u = CreateIteratedBlur(Source, Scale, EdgeStopping, Iterates, Reweightings);
|
float *u = CreateIteratedBlur(Source, Scale, EdgeStopping, Iterates, Reweightings);
|
||||||
if(Compressed == NULL) Compressed = u;
|
if(Compressed == NULL) Compressed = u;
|
||||||
|
|
||||||
//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
|
//Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
|
||||||
|
float temp = CompressionExponent - 1.0f;
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
#pragma omp parallel for // removed schedule(dynamic,10)
|
#pragma omp parallel
|
||||||
#endif
|
#endif
|
||||||
for(int i = 0; i < n; i++){
|
{
|
||||||
float ce = expf(Source[i] + u[i]*(CompressionExponent - 1.0f)) - eps;
|
__m128 cev, uev, sourcev;
|
||||||
float ue = expf(u[i]) - eps;
|
__m128 epsv = _mm_set1_ps( eps );
|
||||||
Source[i] = expf(Source[i]) - eps;
|
__m128 DetailBoostv = _mm_set1_ps( DetailBoost );
|
||||||
|
__m128 tempv = _mm_set1_ps( temp );
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp for
|
||||||
|
#endif
|
||||||
|
for(int i = 0; i < n-3; i+=4){
|
||||||
|
cev = xexpf(LVFU(Source[i]) + LVFU(u[i])*(tempv)) - epsv;
|
||||||
|
uev = xexpf(LVFU(u[i])) - epsv;
|
||||||
|
sourcev = xexpf(LVFU(Source[i])) - epsv;
|
||||||
|
_mm_storeu_ps( &Source[i], sourcev);
|
||||||
|
_mm_storeu_ps( &Compressed[i], cev + DetailBoostv * (sourcev - uev) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int i=n-(n%4); i < n; i++){
|
||||||
|
float ce = xexpf(Source[i] + u[i]*(temp)) - eps;
|
||||||
|
float ue = xexpf(u[i]) - eps;
|
||||||
|
Source[i] = xexpf(Source[i]) - eps;
|
||||||
Compressed[i] = ce + DetailBoost*(Source[i] - ue);
|
Compressed[i] = ce + DetailBoost*(Source[i] - ue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
|
for(int i = 0; i < n; i++){
|
||||||
|
float ce = xexpf(Source[i] + u[i]*(temp)) - eps;
|
||||||
|
float ue = xexpf(u[i]) - eps;
|
||||||
|
Source[i] = xexpf(Source[i]) - eps;
|
||||||
|
Compressed[i] = ce + DetailBoost*(Source[i] - ue);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if(Compressed != u) delete[] u;
|
if(Compressed != u) delete[] u;
|
||||||
return Compressed;
|
return Compressed;
|
||||||
|
|
||||||
|
@@ -49,15 +49,15 @@ My email address is my screen name followed by @yahoo.com. I'm also known as ben
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include "opthelper.h"
|
||||||
|
|
||||||
//This is for solving big symmetric positive definite linear problems.
|
//This is for solving big symmetric positive definite linear problems.
|
||||||
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, unsigned int n, bool OkToModify_b = true, float *x = NULL, float RMSResidual = 0.0f, void *Pass = NULL, unsigned int MaximumIterates = 0, void Preconditioner(float *Product, float *x, void *Pass) = NULL);
|
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, int n, bool OkToModify_b = true, float *x = NULL, float RMSResidual = 0.0f, void *Pass = NULL, int MaximumIterates = 0, void Preconditioner(float *Product, float *x, void *Pass) = NULL);
|
||||||
|
|
||||||
//Storage and use class for symmetric matrices, the nonzero contents of which are confined to diagonals.
|
//Storage and use class for symmetric matrices, the nonzero contents of which are confined to diagonals.
|
||||||
class MultiDiagonalSymmetricMatrix{
|
class MultiDiagonalSymmetricMatrix{
|
||||||
public:
|
public:
|
||||||
MultiDiagonalSymmetricMatrix(unsigned int Dimension, unsigned int NumberOfDiagonalsInLowerTriangle);
|
MultiDiagonalSymmetricMatrix(int Dimension, int NumberOfDiagonalsInLowerTriangle);
|
||||||
~MultiDiagonalSymmetricMatrix();
|
~MultiDiagonalSymmetricMatrix();
|
||||||
|
|
||||||
/* Storage of matrix data, and a function to create memory for Diagonals[index].
|
/* Storage of matrix data, and a function to create memory for Diagonals[index].
|
||||||
@@ -70,21 +70,23 @@ public:
|
|||||||
only every worry or think about the lower trianglular (including main diagonal) part of the matrix.
|
only every worry or think about the lower trianglular (including main diagonal) part of the matrix.
|
||||||
*/
|
*/
|
||||||
float **Diagonals;
|
float **Diagonals;
|
||||||
unsigned int *StartRows;
|
char *buffer;
|
||||||
bool CreateDiagonal(unsigned int index, unsigned int StartRow);
|
char *DiagBuffer;
|
||||||
unsigned int n, m; //The matrix is n x n, with m diagonals on the lower triangle. Don't change these. They should be private but aren't for convenience.
|
int *StartRows;
|
||||||
inline unsigned int DiagonalLength(unsigned int StartRow){ //Gives number of elements in a diagonal.
|
bool CreateDiagonal(int index, int StartRow);
|
||||||
|
int n, m; //The matrix is n x n, with m diagonals on the lower triangle. Don't change these. They should be private but aren't for convenience.
|
||||||
|
inline int DiagonalLength(int StartRow){ //Gives number of elements in a diagonal.
|
||||||
return n - StartRow;
|
return n - StartRow;
|
||||||
};
|
};
|
||||||
|
|
||||||
//Not efficient, but you can use it if you're lazy, or for early tests. Returns false if the row + column falls on no loaded diagonal, true otherwise.
|
//Not efficient, but you can use it if you're lazy, or for early tests. Returns false if the row + column falls on no loaded diagonal, true otherwise.
|
||||||
bool LazySetEntry(float value, unsigned int row, unsigned int column);
|
bool LazySetEntry(float value, int row, int column);
|
||||||
|
|
||||||
//Calculates the matrix-vector product of the matrix represented by this class onto the vector x.
|
//Calculates the matrix-vector product of the matrix represented by this class onto the vector x.
|
||||||
void VectorProduct(float *Product, float *x);
|
void VectorProduct(float *Product, float *x);
|
||||||
|
|
||||||
//Given the start row, attempts to find the corresponding index, or -1 if the StartRow doesn't exist.
|
//Given the start row, attempts to find the corresponding index, or -1 if the StartRow doesn't exist.
|
||||||
int FindIndex(unsigned int StartRow);
|
inline int FindIndex(int StartRow) __attribute__((always_inline));
|
||||||
|
|
||||||
//This is the same as above, but designed to take this class as a pass through variable. By this way you can feed
|
//This is the same as above, but designed to take this class as a pass through variable. By this way you can feed
|
||||||
//the meat of this class into an independent function, such as SparseConjugateGradient.
|
//the meat of this class into an independent function, such as SparseConjugateGradient.
|
||||||
@@ -96,7 +98,7 @@ public:
|
|||||||
LDLt factorization of this matrix. Storage is like this: the first diagonal is the diagonal matrix D and the remaining diagonals
|
LDLt factorization of this matrix. Storage is like this: the first diagonal is the diagonal matrix D and the remaining diagonals
|
||||||
describe all of L except its main diagonal, which is a bunch of ones. Read up on the LDLt Cholesky factorization for what all this means.
|
describe all of L except its main diagonal, which is a bunch of ones. Read up on the LDLt Cholesky factorization for what all this means.
|
||||||
Note that VectorProduct is nonsense. More useful to you is CholeskyBackSolve which fills x, where LDLt x = b. */
|
Note that VectorProduct is nonsense. More useful to you is CholeskyBackSolve which fills x, where LDLt x = b. */
|
||||||
bool CreateIncompleteCholeskyFactorization(unsigned int MaxFillAbove = 0);
|
bool CreateIncompleteCholeskyFactorization(int MaxFillAbove = 0);
|
||||||
void KillIncompleteCholeskyFactorization(void);
|
void KillIncompleteCholeskyFactorization(void);
|
||||||
void CholeskyBackSolve(float *x, float *b);
|
void CholeskyBackSolve(float *x, float *b);
|
||||||
MultiDiagonalSymmetricMatrix *IncompleteCholeskyFactorization;
|
MultiDiagonalSymmetricMatrix *IncompleteCholeskyFactorization;
|
||||||
@@ -109,27 +111,27 @@ public:
|
|||||||
|
|
||||||
class EdgePreservingDecomposition{
|
class EdgePreservingDecomposition{
|
||||||
public:
|
public:
|
||||||
EdgePreservingDecomposition(unsigned int width, unsigned int height);
|
EdgePreservingDecomposition(int width, int height);
|
||||||
~EdgePreservingDecomposition();
|
~EdgePreservingDecomposition();
|
||||||
|
|
||||||
//Create an edge preserving blur of Source. Will create and return, or fill into Blur if not NULL. In place not ok.
|
//Create an edge preserving blur of Source. Will create and return, or fill into Blur if not NULL. In place not ok.
|
||||||
//If UseBlurForEdgeStop is true, supplied not NULL Blur is used to calculate the edge stopping function instead of Source.
|
//If UseBlurForEdgeStop is true, supplied not NULL Blur is used to calculate the edge stopping function instead of Source.
|
||||||
float *CreateBlur(float *Source, float Scale, float EdgeStopping, unsigned int Iterates, float *Blur = NULL, bool UseBlurForEdgeStop = false);
|
float *CreateBlur(float *Source, float Scale, float EdgeStopping, int Iterates, float *Blur = NULL, bool UseBlurForEdgeStop = false);
|
||||||
|
|
||||||
//Iterates CreateBlur such that the smoothness term approaches a specific norm via iteratively reweighted least squares. In place not ok.
|
//Iterates CreateBlur such that the smoothness term approaches a specific norm via iteratively reweighted least squares. In place not ok.
|
||||||
float *CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, unsigned int Iterates, unsigned int Reweightings, float *Blur = NULL);
|
float *CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, int Iterates, int Reweightings, float *Blur = NULL);
|
||||||
|
|
||||||
/*Lowers global contrast while preserving or boosting local contrast. Can fill into Compressed. The smaller Compression
|
/*Lowers global contrast while preserving or boosting local contrast. Can fill into Compressed. The smaller Compression
|
||||||
the more compression is applied, with Compression = 1 giving no effect and above 1 the opposite effect. You can totally
|
the more compression is applied, with Compression = 1 giving no effect and above 1 the opposite effect. You can totally
|
||||||
use Compression = 1 and play with DetailBoost for some really sweet unsharp masking. If working on luma/grey, consider giving it a logarithm.
|
use Compression = 1 and play with DetailBoost for some really sweet unsharp masking. If working on luma/grey, consider giving it a logarithm.
|
||||||
In place calculation to save memory (Source == Compressed) is totally ok. Reweightings > 0 invokes CreateIteratedBlur instead of CreateBlur. */
|
In place calculation to save memory (Source == Compressed) is totally ok. Reweightings > 0 invokes CreateIteratedBlur instead of CreateBlur. */
|
||||||
float *CompressDynamicRange(float *Source, float Scale = 1.0f, float EdgeStopping = 1.4f, float CompressionExponent = 0.8f, float DetailBoost = 0.1f, unsigned int Iterates = 20, unsigned int Reweightings = 0, float *Compressed = NULL);
|
float *CompressDynamicRange(float *Source, float Scale = 1.0f, float EdgeStopping = 1.4f, float CompressionExponent = 0.8f, float DetailBoost = 0.1f, int Iterates = 20, int Reweightings = 0, float *Compressed = NULL);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
MultiDiagonalSymmetricMatrix *A; //The equations are simple enough to not mandate a matrix class, but fast solution NEEDS a complicated preconditioner.
|
MultiDiagonalSymmetricMatrix *A; //The equations are simple enough to not mandate a matrix class, but fast solution NEEDS a complicated preconditioner.
|
||||||
unsigned int w, h, n;
|
int w, h, n;
|
||||||
|
|
||||||
//Convenient access to the data in A.
|
//Convenient access to the data in A.
|
||||||
float *a0, *a_1, *a_w, *a_w_1, *a_w1;
|
float * RESTRICT a0, * RESTRICT a_1, * RESTRICT a_w, * RESTRICT a_w_1, * RESTRICT a_w1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
63
rtengine/opthelper.h
Normal file
63
rtengine/opthelper.h
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// opthelper.h includes some #defines which help to make optimizations easier and better readable
|
||||||
|
//
|
||||||
|
// copyright (c) 2013 Ingo Weyrich
|
||||||
|
//
|
||||||
|
// this is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
//
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#ifndef OPTHELPER_H
|
||||||
|
#define OPTHELPER_H
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#include "sleefsseavx.c"
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#ifdef WIN32
|
||||||
|
// needed for actual versions of GCC with 32-Bit Windows
|
||||||
|
#define SSEFUNCTION __attribute__((force_align_arg_pointer))
|
||||||
|
#else
|
||||||
|
#define SSEFUNCTION
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define SSEFUNCTION
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef __SSE__
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#ifdef WIN32
|
||||||
|
// needed for actual versions of GCC with 32-Bit Windows
|
||||||
|
#define SSEFUNCTION __attribute__((force_align_arg_pointer))
|
||||||
|
#else
|
||||||
|
#define SSEFUNCTION
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define SSEFUNCTION
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define SSEFUNCTION
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#define RESTRICT __restrict__
|
||||||
|
#define LIKELY(x) __builtin_expect (!!(x), 1)
|
||||||
|
#define UNLIKELY(x) __builtin_expect (!!(x), 0)
|
||||||
|
#else
|
||||||
|
#define RESTRICT
|
||||||
|
#define LIKELY(x) (x)
|
||||||
|
#define UNLIKELY(x) (x)
|
||||||
|
#endif
|
||||||
|
#endif
|
Reference in New Issue
Block a user