diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index d537cce3f..6e4b45ccb 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -3,7 +3,7 @@
  *  This file is part of RawTherapee.
  *
  *  Ported from LuminanceHDR by Alberto Griggio <alberto.griggio@gmail.com>
- *  
+ *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
@@ -73,7 +73,8 @@
 #include "StopWatch.h"
 #include "sleef.c"
 #include "opthelper.h"
-namespace rtengine {
+namespace rtengine
+{
 
 /******************************************************************************
  * RT code
@@ -84,66 +85,68 @@ extern MyMutex *fftwMutex;
 
 using namespace std;
 
-namespace {
+namespace
+{
 
-class Array2Df: public array2D<float> {
+class Array2Df: public array2D<float>
+{
     typedef array2D<float> Super;
 public:
     Array2Df(): Super() {}
-    Array2Df(int w, int h): Super(w, h) {}
+    Array2Df (int w, int h): Super (w, h) {}
 
-    float &operator()(int w, int h)
+    float &operator() (int w, int h)
     {
         return (*this)[h][w];
     }
 
-    const float &operator()(int w, int h) const
+    const float &operator() (int w, int h) const
     {
         return (*this)[h][w];
     }
 
-    float &operator()(int i)
+    float &operator() (int i)
     {
-        return static_cast<float *>(*this)[i];
+        return static_cast<float *> (*this)[i];
     }
 
-    const float &operator()(int i) const
+    const float &operator() (int i) const
     {
-        return const_cast<Array2Df &>(*this).operator()(i);
+        return const_cast<Array2Df &> (*this).operator() (i);
     }
 
     int getRows() const
     {
-        return const_cast<Array2Df &>(*this).height();
+        return const_cast<Array2Df &> (*this).height();
     }
 
     int getCols() const
     {
-        return const_cast<Array2Df &>(*this).width();
+        return const_cast<Array2Df &> (*this).width();
     }
 
     float *data()
     {
-        return static_cast<float *>(*this);
+        return static_cast<float *> (*this);
     }
 
     const float *data() const
     {
-        return const_cast<Array2Df &>(*this).data();
+        return const_cast<Array2Df &> (*this).data();
     }
 };
 
 // upper bound on image dimension used in tmo_fattal02 -- see the comment there
 const int RT_dimension_cap = 1920;
 
-void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread);
+void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread);
 
 
 /******************************************************************************
  * Luminance HDR code (modifications are marked with an RT comment)
  ******************************************************************************/
 
-void downSample(const Array2Df& A, Array2Df& B)
+void downSample (const Array2Df& A, Array2Df& B)
 {
     const int width = B.getCols();
     const int height = B.getRows();
@@ -153,163 +156,164 @@ void downSample(const Array2Df& A, Array2Df& B)
     // speed improvements. The main issue is the pde solver and in case of the
     // fft solver uses optimised threaded fftw routines.
     //#pragma omp parallel for
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=0 ; x<width ; x++ )
-        {
-            float p = A(2*x,2*y);
-            p += A(2*x+1,2*y);
-            p += A(2*x,2*y+1);
-            p += A(2*x+1,2*y+1);
-            B(x,y) = p * 0.25f; // p / 4.0f;
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 0 ; x < width ; x++ ) {
+            float p = A (2 * x, 2 * y);
+            p += A (2 * x + 1, 2 * y);
+            p += A (2 * x, 2 * y + 1);
+            p += A (2 * x + 1, 2 * y + 1);
+            B (x, y) = p * 0.25f; // p / 4.0f;
         }
     }
 }
 
-void gaussianBlur(const Array2Df& I, Array2Df& L)
+void gaussianBlur (const Array2Df& I, Array2Df& L)
 {
     const int width = I.getCols();
     const int height = I.getRows();
 
     if (width < 3 || height < 3) {
         if (&I != &L) {
-            for (int i = 0, n = width*height; i < n; ++i) {
-                L(i) = I(i);
+            for (int i = 0, n = width * height; i < n; ++i) {
+                L (i) = I (i);
             }
         }
+
         return;
     }
 
-    Array2Df T(width,height);
+    Array2Df T (width, height);
 
     //--- X blur
     #pragma omp parallel for shared(I, T)
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=1 ; x<width-1 ; x++ )
-        {
-            float t = 2.f * I(x,y);
-            t += I(x-1,y);
-            t += I(x+1,y);
-            T(x,y) = t * 0.25f; // t / 4.f;
+
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 1 ; x < width - 1 ; x++ ) {
+            float t = 2.f * I (x, y);
+            t += I (x - 1, y);
+            t += I (x + 1, y);
+            T (x, y) = t * 0.25f; // t / 4.f;
         }
-        T(0,y) = ( 3.f * I(0,y)+ I(1,y) ) * 0.25f; // / 4.f;
-        T(width-1,y) = ( 3.f * I(width-1,y) + I(width-2,y) ) * 0.25f; // / 4.f;
+
+        T (0, y) = ( 3.f * I (0, y) + I (1, y) ) * 0.25f; // / 4.f;
+        T (width - 1, y) = ( 3.f * I (width - 1, y) + I (width - 2, y) ) * 0.25f; // / 4.f;
     }
 
     //--- Y blur
     #pragma omp parallel for
-    for ( int x=0 ; x<width-7 ; x+=8 )
-    {
-        for ( int y=1 ; y<height-1 ; y++ )
-        {
-            for(int xx = 0; xx < 8; ++xx) {
-                float t = 2.f * T(x+xx,y);
-                t += T(x+xx,y-1);
-                t += T(x+xx,y+1);
-                L(x+xx,y) = t * 0.25f; // t/4.0f;
+
+    for ( int x = 0 ; x < width - 7 ; x += 8 ) {
+        for ( int y = 1 ; y < height - 1 ; y++ ) {
+            for (int xx = 0; xx < 8; ++xx) {
+                float t = 2.f * T (x + xx, y);
+                t += T (x + xx, y - 1);
+                t += T (x + xx, y + 1);
+                L (x + xx, y) = t * 0.25f; // t/4.0f;
             }
         }
-        for(int xx = 0; xx < 8; ++xx) {
-            L(x+xx,0) = ( 3.f * T(x+xx,0) + T(x+xx,1) ) * 0.25f; // / 4.0f;
-            L(x+xx,height-1) = ( 3.f * T(x+xx,height-1) + T(x+xx,height-2) ) * 0.25f; // / 4.0f;
+
+        for (int xx = 0; xx < 8; ++xx) {
+            L (x + xx, 0) = ( 3.f * T (x + xx, 0) + T (x + xx, 1) ) * 0.25f; // / 4.0f;
+            L (x + xx, height - 1) = ( 3.f * T (x + xx, height - 1) + T (x + xx, height - 2) ) * 0.25f; // / 4.0f;
         }
     }
-    for ( int x = width - (width % 8) ; x<width ; x++ )
-    {
-        for ( int y=1 ; y<height-1 ; y++ )
-        {
-            float t = 2.f * T(x,y);
-            t += T(x,y-1);
-            t += T(x,y+1);
-            L(x,y) = t * 0.25f; // t/4.0f;
+
+    for ( int x = width - (width % 8) ; x < width ; x++ ) {
+        for ( int y = 1 ; y < height - 1 ; y++ ) {
+            float t = 2.f * T (x, y);
+            t += T (x, y - 1);
+            t += T (x, y + 1);
+            L (x, y) = t * 0.25f; // t/4.0f;
         }
-        L(x,0) = ( 3.f * T(x,0) + T(x,1) ) * 0.25f; // / 4.0f;
-        L(x,height-1) = ( 3.f * T(x,height-1) + T(x,height-2) ) * 0.25f; // / 4.0f;
+
+        L (x, 0) = ( 3.f * T (x, 0) + T (x, 1) ) * 0.25f; // / 4.0f;
+        L (x, height - 1) = ( 3.f * T (x, height - 1) + T (x, height - 2) ) * 0.25f; // / 4.0f;
     }
 }
 
-void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
+void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)
 {
-  int width = H->getCols();
-  int height = H->getRows();
-  const int size = width*height;
+    int width = H->getCols();
+    int height = H->getRows();
+    const int size = width * height;
+
+    pyramids[0] = new Array2Df (width, height);
 
-  pyramids[0] = new Array2Df(width,height);
 //#pragma omp parallel for shared(pyramids, H)
-  for( int i=0 ; i<size ; i++ )
-    (*pyramids[0])(i) = (*H)(i);
-
-  Array2Df* L = new Array2Df(width,height);
-  gaussianBlur( *pyramids[0], *L );
-
-  for ( int k=1 ; k<nlevels ; k++ )
-  {
-      if (width > 2 && height > 2) {
-          width /= 2;
-          height /= 2;
-          pyramids[k] = new Array2Df(width,height);
-          downSample(*L, *pyramids[k]);
-      } else {
-          // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
-          // there), so it might happen that we have to add some padding to
-          // the gaussian pyramids
-          pyramids[k] = new Array2Df(width,height);
-          for (int j = 0, n = width*height; j < n; ++j) {
-              (*pyramids[k])(j) = (*L)(j);
-          }
-      }
-
-    if(k < nlevels -1) {
-        delete L;
-        L = new Array2Df(width,height);
-        gaussianBlur( *pyramids[k], *L );
+    for ( int i = 0 ; i < size ; i++ ) {
+        (*pyramids[0]) (i) = (*H) (i);
     }
-  }
 
-  delete L;
+    Array2Df* L = new Array2Df (width, height);
+    gaussianBlur ( *pyramids[0], *L );
+
+    for ( int k = 1 ; k < nlevels ; k++ ) {
+        if (width > 2 && height > 2) {
+            width /= 2;
+            height /= 2;
+            pyramids[k] = new Array2Df (width, height);
+            downSample (*L, *pyramids[k]);
+        } else {
+            // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
+            // there), so it might happen that we have to add some padding to
+            // the gaussian pyramids
+            pyramids[k] = new Array2Df (width, height);
+
+            for (int j = 0, n = width * height; j < n; ++j) {
+                (*pyramids[k]) (j) = (*L) (j);
+            }
+        }
+
+        if (k < nlevels - 1) {
+            delete L;
+            L = new Array2Df (width, height);
+            gaussianBlur ( *pyramids[k], *L );
+        }
+    }
+
+    delete L;
 }
 
 //--------------------------------------------------------------------
 
-float calculateGradients(Array2Df* H, Array2Df* G, int k)
+float calculateGradients (Array2Df* H, Array2Df* G, int k)
 {
-  const int width = H->getCols();
-  const int height = H->getRows();
-  const float divider = pow( 2.0f, k+1 );
-  float avgGrad = 0.0f;
+    const int width = H->getCols();
+    const int height = H->getRows();
+    const float divider = pow ( 2.0f, k + 1 );
+    float avgGrad = 0.0f;
 
-#pragma omp parallel for reduction(+:avgGrad)
-  for( int y=0 ; y<height ; y++ )
-  {
-    int n = (y == 0 ? 0 : y-1);
-    int s = (y+1 == height ? y : y+1);
-    for( int x=0 ; x<width ; x++ )
-    {
-      float gx, gy;
-      int w, e;
-      w = (x == 0 ? 0 : x-1);
-      e = (x+1 == width ? x : x+1);
+    #pragma omp parallel for reduction(+:avgGrad)
 
-      gx = ((*H)(w,y)-(*H)(e,y));
+    for ( int y = 0 ; y < height ; y++ ) {
+        int n = (y == 0 ? 0 : y - 1);
+        int s = (y + 1 == height ? y : y + 1);
 
-      gy = ((*H)(x,s)-(*H)(x,n));
-      // note this implicitely assumes that H(-1)=H(0)
-      // for the fft-pde slover this would need adjustment as H(-1)=H(1)
-      // is assumed, which means gx=0.0, gy=0.0 at the boundaries
-      // however, the impact is not visible so we ignore this here
+        for ( int x = 0 ; x < width ; x++ ) {
+            float gx, gy;
+            int w, e;
+            w = (x == 0 ? 0 : x - 1);
+            e = (x + 1 == width ? x : x + 1);
 
-      (*G)(x,y) = sqrt(gx*gx+gy*gy) / divider;
-      avgGrad += (*G)(x,y);
+            gx = ((*H) (w, y) - (*H) (e, y));
+
+            gy = ((*H) (x, s) - (*H) (x, n));
+            // note this implicitely assumes that H(-1)=H(0)
+            // for the fft-pde slover this would need adjustment as H(-1)=H(1)
+            // is assumed, which means gx=0.0, gy=0.0 at the boundaries
+            // however, the impact is not visible so we ignore this here
+
+            (*G) (x, y) = sqrt (gx * gx + gy * gy) / divider;
+            avgGrad += (*G) (x, y);
+        }
     }
-  }
 
-  return avgGrad / (width*height);
+    return avgGrad / (width * height);
 }
 
 //--------------------------------------------------------------------
 
-void upSample(const Array2Df& A, Array2Df& B)
+void upSample (const Array2Df& A, Array2Df& B)
 {
     const int width = B.getCols();
     const int height = B.getRows();
@@ -317,18 +321,17 @@ void upSample(const Array2Df& A, Array2Df& B)
     const int aheight = A.getRows();
 
     //#pragma omp parallel for shared(A, B)
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=0 ; x<width ; x++ )
-        {
-            int ax = static_cast<int>(x * 0.5f); //x / 2.f;
-            int ay = static_cast<int>(y * 0.5f); //y / 2.f;
-            ax = (ax<awidth) ? ax : awidth-1;
-            ay = (ay<aheight) ? ay : aheight-1;
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 0 ; x < width ; x++ ) {
+            int ax = static_cast<int> (x * 0.5f); //x / 2.f;
+            int ay = static_cast<int> (y * 0.5f); //y / 2.f;
+            ax = (ax < awidth) ? ax : awidth - 1;
+            ay = (ay < aheight) ? ay : aheight - 1;
 
-            B(x,y) = A(ax,ay);
+            B (x, y) = A (ax, ay);
         }
     }
+
 //--- this code below produces 'use of uninitialized value error'
 //   int width = A->getCols();
 //   int height = A->getRows();
@@ -345,84 +348,78 @@ void upSample(const Array2Df& A, Array2Df& B)
 }
 
 
-void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
-                       float avgGrad[], int nlevels, int detail_level,
-                       float alfa, float beta, float noise)
+void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
+                        float avgGrad[], int nlevels, int detail_level,
+                        float alfa, float beta, float noise)
 {
     const bool newfattal = true;
-    int width = gradients[nlevels-1]->getCols();
-    int height = gradients[nlevels-1]->getRows();
+    int width = gradients[nlevels - 1]->getCols();
+    int height = gradients[nlevels - 1]->getRows();
     Array2Df** fi = new Array2Df*[nlevels];
 
-    fi[nlevels-1] = new Array2Df(width,height);
-    if (newfattal)
-    {
+    fi[nlevels - 1] = new Array2Df (width, height);
+
+    if (newfattal) {
         //#pragma omp parallel for shared(fi)
-        for ( int k = 0 ; k < width*height ; k++ )
-        {
-            (*fi[nlevels-1])(k) = 1.0f;
+        for ( int k = 0 ; k < width * height ; k++ ) {
+            (*fi[nlevels - 1]) (k) = 1.0f;
         }
     }
 
-    for ( int k = nlevels-1; k >= 0 ; k-- )
-    {
+    for ( int k = nlevels - 1; k >= 0 ; k-- ) {
         width = gradients[k]->getCols();
         height = gradients[k]->getRows();
 
         // only apply gradients to levels>=detail_level but at least to the coarsest
         if ( k >= detail_level
-             ||k==nlevels-1
-             || newfattal == false)
-        {
+                || k == nlevels - 1
+                || newfattal == false) {
             //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
             #pragma omp parallel for shared(fi,avgGrad)
-            for ( int y = 0; y < height; y++ )
-            {
-                for ( int x = 0; x < width; x++ )
-                {
-                    float grad = ((*gradients[k])(x,y) < 1e-4f) ? 1e-4 : (*gradients[k])(x,y);
+            for ( int y = 0; y < height; y++ ) {
+                for ( int x = 0; x < width; x++ ) {
+                    float grad = ((*gradients[k]) (x, y) < 1e-4f) ? 1e-4 : (*gradients[k]) (x, y);
                     float a = alfa * avgGrad[k];
 
-                    float value = pow((grad+noise)/a, beta - 1.0f);
+                    float value = pow ((grad + noise) / a, beta - 1.0f);
 
-                    if (newfattal)
-                        (*fi[k])(x,y) *= value;
-                    else
-                        (*fi[k])(x,y) = value;
+                    if (newfattal) {
+                        (*fi[k]) (x, y) *= value;
+                    } else {
+                        (*fi[k]) (x, y) = value;
+                    }
                 }
             }
         }
 
         // create next level
-        if ( k>1 )
-        {
-            width = gradients[k-1]->getCols();
-            height = gradients[k-1]->getRows();
-            fi[k-1] = new Array2Df(width,height);
+        if ( k > 1 ) {
+            width = gradients[k - 1]->getCols();
+            height = gradients[k - 1]->getRows();
+            fi[k - 1] = new Array2Df (width, height);
+        } else {
+            fi[0] = FI;    // highest level -> result
         }
-        else
-            fi[0] = FI;                         // highest level -> result
 
-        if ( k>0  && newfattal )
-        {
-            upSample(*fi[k], *fi[k-1]);           // upsample to next level
-            gaussianBlur(*fi[k-1], *fi[k-1]);
+        if ( k > 0  && newfattal ) {
+            upSample (*fi[k], *fi[k - 1]);        // upsample to next level
+            gaussianBlur (*fi[k - 1], *fi[k - 1]);
         }
     }
 
-    for ( int k=1 ; k<nlevels ; k++ )
-    {
+    for ( int k = 1 ; k < nlevels ; k++ ) {
         delete fi[k];
     }
+
     delete[] fi;
 }
 
 inline
-void findMaxMinPercentile(const Array2Df& I,
-                                 float minPrct, float& minLum,
-                                 float maxPrct, float& maxLum)
+void findMaxMinPercentile (const Array2Df& I,
+                           float minPrct, float& minLum,
+                           float maxPrct, float& maxLum)
 {
-    assert(minPrct <= maxPrct);
+    assert (minPrct <= maxPrct);
 
     const int size = I.getRows() * I.getCols();
     const float* data = I.data();
@@ -432,73 +429,76 @@ void findMaxMinPercentile(const Array2Df& I,
     // memory usage of this method is 65536 * sizeof(float) * (t + 1) byte, where t is the number of threads
 
     // We need one global histogram
-    LUTu histo(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    LUTu histo (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
     histo.clear();
 #ifdef _OPENMP
-#pragma omp parallel
+    #pragma omp parallel
 #endif
-{
-    // We need one histogram per thread
-    LUTu histothr(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
-    histothr.clear();
+    {
+        // We need one histogram per thread
+        LUTu histothr (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+        histothr.clear();
 
 #ifdef _OPENMP
-#pragma omp for nowait
+        #pragma omp for nowait
 #endif
-    for(int i = 0; i< size; ++i) {
-        // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index
-        histothr[(unsigned int)(65535.f * data[i])]++;
+
+        for (int i = 0; i < size; ++i) {
+            // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index
+            histothr[ (unsigned int) (65535.f * data[i])]++;
+        }
+
+#ifdef _OPENMP
+        #pragma omp critical
+#endif
+        // add per thread histogram to global histogram
+        histo += histothr;
     }
 
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-    // add per thread histogram to global histogram
-    histo += histothr;
-}
-
     int k = 0;
     int count = 0;
 
     // find (minPrct*size) smallest value
-    while(count < minPrct*size) {
+    while (count < minPrct * size) {
         count += histo[k++];
     }
-    if(k > 0) { // interpolate
+
+    if (k > 0) { // interpolate
         int count_ = count - histo[k - 1];
         float c0 = count - minPrct * size;
         float c1 = minPrct * size - count_;
         minLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
     } else {
-        minLum = k /65535.f;
+        minLum = k / 65535.f;
     }
 
     // find (maxPrct*size) smallest value
-    while(count < maxPrct*size) {
+    while (count < maxPrct * size) {
         count += histo[k++];
     }
-    if(k > 0) { // interpolate
+
+    if (k > 0) { // interpolate
         int count_ = count - histo[k - 1];
         float c0 = count - maxPrct * size;
         float c1 = maxPrct * size - count_;
         maxLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
     } else {
-        maxLum = k /65535.f;
+        maxLum = k / 65535.f;
     }
 
 }
 
-void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
+void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
 
-void tmo_fattal02(size_t width,
-                  size_t height,
-                  const Array2Df& Y,
-                  Array2Df& L,
-                  float alfa,
-                  float beta,
-                  float noise,
-                  int detail_level,
-                  bool multithread)
+void tmo_fattal02 (size_t width,
+                   size_t height,
+                   const Array2Df& Y,
+                   Array2Df& L,
+                   float alfa,
+                   float beta,
+                   float noise,
+                   int detail_level,
+                   bool multithread)
 {
 // #ifdef TIMER_PROFILING
 //     msec_timer stop_watch;
@@ -507,12 +507,18 @@ void tmo_fattal02(size_t width,
     static const float black_point = 0.1f;
     static const float white_point = 0.5f;
     static const float gamma = 1.0f; // 0.8f;
-    // static const int   detail_level = 3;
-    if ( detail_level < 0 ) detail_level = 0;
-    if ( detail_level > 3 ) detail_level = 3;
 
-  // ph.setValue(2);
-  // if (ph.canceled()) return;
+    // static const int   detail_level = 3;
+    if ( detail_level < 0 ) {
+        detail_level = 0;
+    }
+
+    if ( detail_level > 3 ) {
+        detail_level = 3;
+    }
+
+    // ph.setValue(2);
+    // if (ph.canceled()) return;
 
     /* RT -- we use a hardcoded value for nlevels, to limit the
      * dependency of the result on the image size. When using an auto computed
@@ -520,247 +526,274 @@ void tmo_fattal02(size_t width,
      * image sizes, making it essentially impossible to preview the tool
      * inside RT. With a hardcoded value, the results for the preview are much
      * closer to those for the final image */
-  // int MSIZE = 32;         // minimum size of gaussian pyramid
-  // // I believe a smaller value than 32 results in slightly better overall
-  // // quality but I'm only applying this if the newly implemented fft solver
-  // // is used in order not to change behaviour of the old version
-  // // TODO: best let the user decide this value
-  // // if (fftsolver)
-  // {
-  //    MSIZE = 8;
-  // }
-        
+    // int MSIZE = 32;         // minimum size of gaussian pyramid
+    // // I believe a smaller value than 32 results in slightly better overall
+    // // quality but I'm only applying this if the newly implemented fft solver
+    // // is used in order not to change behaviour of the old version
+    // // TODO: best let the user decide this value
+    // // if (fftsolver)
+    // {
+    //    MSIZE = 8;
+    // }
 
-  int size = width*height;
 
-  // find max value, normalize to range 0..100 and take logarithm
-  float minLum = Y(0,0);
-  float maxLum = Y(0,0);
+    int size = width * height;
 
-  #pragma omp parallel for reduction(max:maxLum)
-  for ( int i=0 ; i<size ; i++ )
-  {
-      maxLum = std::max(maxLum, Y(i));
-  }
+    // find max value, normalize to range 0..100 and take logarithm
+    float minLum = Y (0, 0);
+    float maxLum = Y (0, 0);
 
-  Array2Df* H = new Array2Df(width, height);
-  float temp = 100.f / maxLum;
-  float eps = 1e-4f;
-  #pragma omp parallel
-  {
-#ifdef __SSE2__
-  vfloat epsv = F2V(eps);
-  vfloat tempv = F2V(temp);
-#endif
-  #pragma omp for schedule(dynamic,16)
-  for (size_t i=0 ; i<height ; ++i) {
-      size_t j = 0;
-#ifdef __SSE2__
-      for(; j < width - 3; j+=4) {
-          STVFU((*H)[i][j], xlogf(tempv * LVFU(Y[i][j]) + epsv));
-      }
-#endif
-      for(; j < width; ++j) {
-          (*H)[i][j] = xlogf(temp * Y[i][j] + eps);
-      }
-  }
-  }
+    #pragma omp parallel for reduction(max:maxLum)
 
-  /** RT - this is also here to reduce the dependency of the results on the
-   * input image size, with the primary aim of having a preview in RT that is
-   * reasonably close to the actual output image. Intuitively, what we do is
-   * to put a cap on the dimension of the image processed, so that it is close
-   * in size to the typical preview that you will see on a normal consumer
-   * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
-   * However, we can't simply downscale the input Y array and then upscale it
-   * on output, because that would cause a big loss of sharpness (confirmed by
-   * testing).
-   * So, we use a different method: we downscale the H array, so that we
-   * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
-   * we upscale the FI matrix later on, before it gets combined with the
-   * original input luminance array H. This seems to preserve the input
-   * sharpness and at the same time significantly reduce the dependency of the
-   * result on the input size. Clearly this is a hack, and keep in mind that I
-   * do not really know how Fattal works (it comes from LuminanceHDR almost
-   * verbatim), so this should probably be revised/reviewed by someone who
-   * knows better... also, we use a quite naive bilinear interpolation
-   * algorithm (see rescale_bilinear below), which could definitely be
-   * improved */
-  int fullwidth = width;
-  int fullheight = height;
-  int dim = std::max(width, height);
-  Array2Df *fullH = nullptr;
-  if (dim > RT_dimension_cap) {
-      float s = float(RT_dimension_cap) / float(dim);
-      Array2Df *HH = new Array2Df(width * s, height * s);
-      rescale_bilinear(*H, *HH, multithread);
-      fullH = H;
-      H = HH;
-      width = H->getCols();
-      height = H->getRows();
-  }
-  /** RT */
-
-  // create gaussian pyramids
-  // int mins = (width<height) ? width : height;    // smaller dimension
-  // int nlevels = 0;
-  // while ( mins >= MSIZE )
-  // {
-  //   nlevels++;
-  //   mins /= 2;
-  // }
-  // // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
-  // // The following lines solves a bug with images particularly small
-  // if (nlevels == 0) nlevels = 1;
-  const int nlevels = 7; // RT -- see above
-
-  Array2Df** pyramids = new Array2Df*[nlevels];
-  createGaussianPyramids(H, pyramids, nlevels);
-  // ph.setValue(8);
-
-  // calculate gradients and its average values on pyramid levels
-  Array2Df** gradients = new Array2Df*[nlevels];
-  float* avgGrad = new float[nlevels];
-  for ( int k=0 ; k<nlevels ; k++ )
-  {
-    gradients[k] = new Array2Df(pyramids[k]->getCols(), pyramids[k]->getRows());
-    avgGrad[k] = calculateGradients(pyramids[k],gradients[k], k);
-    delete pyramids[k];
-  }
-  delete[] pyramids;
-  // ph.setValue(12);
-
-  // calculate fi matrix
-  Array2Df* FI = new Array2Df(width, height);
-  calculateFiMatrix(FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
-//  dumpPFS( "FI.pfs", FI, "Y" );
-  for ( int i=0 ; i<nlevels ; i++ )
-  {
-    delete gradients[i];
-  }
-  delete[] gradients;
-  delete[] avgGrad;
-  // ph.setValue(16);
-  // if (ph.canceled()){
-  //   delete FI;
-  //   delete H;
-  //   return;
-  // }
-
-  /** - RT - bring back the FI image to the input size if it was downscaled */
-  if (fullH) {
-      Array2Df *FI2 = new Array2Df(fullwidth, fullheight);
-      rescale_bilinear(*FI, *FI2, multithread);
-      delete FI;
-      FI = FI2;
-      width = fullwidth;
-      height = fullheight;
-      delete H;
-      H = fullH;
-  }
-  /** RT */
-
-  // attenuate gradients
-  Array2Df* Gx = new Array2Df(width, height);
-  Array2Df* Gy = new Array2Df(width, height);
-
-  // the fft solver solves the Poisson pde but with slightly different
-  // boundary conditions, so we need to adjust the assembly of the right hand
-  // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
-  // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
-  // if (fftsolver)
-#pragma omp parallel for
-    for ( size_t y=0 ; y<height ; y++ ) {
-      // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
-      unsigned int yp1 = (y+1 >= height ? height-2 : y+1);
-      for ( size_t x=0 ; x<width ; x++ )
-      {
-        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
-        unsigned int xp1 = (x+1 >= width ?  width-2  : x+1);
-        // forward differences in H, so need to use between-points approx of FI
-        (*Gx)(x,y) = ((*H)(xp1,y)-(*H)(x,y)) * 0.5 * ((*FI)(xp1,y)+(*FI)(x,y));
-        (*Gy)(x,y) = ((*H)(x,yp1)-(*H)(x,y)) * 0.5 * ((*FI)(x,yp1)+(*FI)(x,y));
-      }
+    for ( int i = 0 ; i < size ; i++ ) {
+        maxLum = std::max (maxLum, Y (i));
     }
-  delete H;
 
-  // calculate divergence
-#pragma omp parallel for
-  for ( size_t y = 0; y < height; ++y )
-  {
-      for ( size_t x = 0; x < width; ++x )
-      {
-          (*FI)(x,y) = (*Gx)(x,y) + (*Gy)(x,y);
-          if ( x > 0 ) (*FI)(x,y) -= (*Gx)(x-1,y);
-          if ( y > 0 ) (*FI)(x,y) -= (*Gy)(x,y-1);
+    Array2Df* H = new Array2Df (width, height);
+    float temp = 100.f / maxLum;
+    float eps = 1e-4f;
+    #pragma omp parallel
+    {
+#ifdef __SSE2__
+        vfloat epsv = F2V (eps);
+        vfloat tempv = F2V (temp);
+#endif
+        #pragma omp for schedule(dynamic,16)
 
-          // if (fftsolver)
-          {
-              if (x==0) (*FI)(x,y) += (*Gx)(x,y);
-              if (y==0) (*FI)(x,y) += (*Gy)(x,y);
-          }
+        for (size_t i = 0 ; i < height ; ++i) {
+            size_t j = 0;
+#ifdef __SSE2__
 
-      }
-  }
-  //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
-  delete Gy;
+            for (; j < width - 3; j += 4) {
+                STVFU ((*H)[i][j], xlogf (tempv * LVFU (Y[i][j]) + epsv));
+            }
 
-  // solve pde and exponentiate (ie recover compressed image)
-  {
-  // if (fftsolver)
-  {
-      MyMutex::MyLock lock(*fftwMutex);
-      solve_pde_fft(FI, &L, Gx, multithread);//, ph);
-  }
-  delete Gx;
-  delete FI;
-  // else
-  // {
-  //     solve_pde_multigrid(&DivG, &U, ph);
-  // }
+#endif
+
+            for (; j < width; ++j) {
+                (*H)[i][j] = xlogf (temp * Y[i][j] + eps);
+            }
+        }
+    }
+
+    /** RT - this is also here to reduce the dependency of the results on the
+     * input image size, with the primary aim of having a preview in RT that is
+     * reasonably close to the actual output image. Intuitively, what we do is
+     * to put a cap on the dimension of the image processed, so that it is close
+     * in size to the typical preview that you will see on a normal consumer
+     * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
+     * However, we can't simply downscale the input Y array and then upscale it
+     * on output, because that would cause a big loss of sharpness (confirmed by
+     * testing).
+     * So, we use a different method: we downscale the H array, so that we
+     * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
+     * we upscale the FI matrix later on, before it gets combined with the
+     * original input luminance array H. This seems to preserve the input
+     * sharpness and at the same time significantly reduce the dependency of the
+     * result on the input size. Clearly this is a hack, and keep in mind that I
+     * do not really know how Fattal works (it comes from LuminanceHDR almost
+     * verbatim), so this should probably be revised/reviewed by someone who
+     * knows better... also, we use a quite naive bilinear interpolation
+     * algorithm (see rescale_bilinear below), which could definitely be
+     * improved */
+    int fullwidth = width;
+    int fullheight = height;
+    int dim = std::max (width, height);
+    Array2Df *fullH = nullptr;
+
+    if (dim > RT_dimension_cap) {
+        float s = float (RT_dimension_cap) / float (dim);
+        Array2Df *HH = new Array2Df (width * s, height * s);
+        rescale_bilinear (*H, *HH, multithread);
+        fullH = H;
+        H = HH;
+        width = H->getCols();
+        height = H->getRows();
+    }
+
+    /** RT */
+
+    // create gaussian pyramids
+    // int mins = (width<height) ? width : height;    // smaller dimension
+    // int nlevels = 0;
+    // while ( mins >= MSIZE )
+    // {
+    //   nlevels++;
+    //   mins /= 2;
+    // }
+    // // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
+    // // The following lines solves a bug with images particularly small
+    // if (nlevels == 0) nlevels = 1;
+    const int nlevels = 7; // RT -- see above
+
+    Array2Df** pyramids = new Array2Df*[nlevels];
+    createGaussianPyramids (H, pyramids, nlevels);
+    // ph.setValue(8);
+
+    // calculate gradients and its average values on pyramid levels
+    Array2Df** gradients = new Array2Df*[nlevels];
+    float* avgGrad = new float[nlevels];
+
+    for ( int k = 0 ; k < nlevels ; k++ ) {
+        gradients[k] = new Array2Df (pyramids[k]->getCols(), pyramids[k]->getRows());
+        avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k);
+        delete pyramids[k];
+    }
+
+    delete[] pyramids;
+    // ph.setValue(12);
+
+    // calculate fi matrix
+    Array2Df* FI = new Array2Df (width, height);
+    calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
+
+//  dumpPFS( "FI.pfs", FI, "Y" );
+    for ( int i = 0 ; i < nlevels ; i++ ) {
+        delete gradients[i];
+    }
+
+    delete[] gradients;
+    delete[] avgGrad;
+    // ph.setValue(16);
+    // if (ph.canceled()){
+    //   delete FI;
+    //   delete H;
+    //   return;
+    // }
+
+    /** - RT - bring back the FI image to the input size if it was downscaled */
+    if (fullH) {
+        Array2Df *FI2 = new Array2Df (fullwidth, fullheight);
+        rescale_bilinear (*FI, *FI2, multithread);
+        delete FI;
+        FI = FI2;
+        width = fullwidth;
+        height = fullheight;
+        delete H;
+        H = fullH;
+    }
+
+    /** RT */
+
+    // attenuate gradients
+    Array2Df* Gx = new Array2Df (width, height);
+    Array2Df* Gy = new Array2Df (width, height);
+
+    // the fft solver solves the Poisson pde but with slightly different
+    // boundary conditions, so we need to adjust the assembly of the right hand
+    // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
+    // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
+    // if (fftsolver)
+    #pragma omp parallel for
+
+    for ( size_t y = 0 ; y < height ; y++ ) {
+        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+        unsigned int yp1 = (y + 1 >= height ? height - 2 : y + 1);
+
+        for ( size_t x = 0 ; x < width ; x++ ) {
+            // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+            unsigned int xp1 = (x + 1 >= width ?  width - 2  : x + 1);
+            // forward differences in H, so need to use between-points approx of FI
+            (*Gx) (x, y) = ((*H) (xp1, y) - (*H) (x, y)) * 0.5 * ((*FI) (xp1, y) + (*FI) (x, y));
+            (*Gy) (x, y) = ((*H) (x, yp1) - (*H) (x, y)) * 0.5 * ((*FI) (x, yp1) + (*FI) (x, y));
+        }
+    }
+
+    delete H;
+
+    // calculate divergence
+    #pragma omp parallel for
+
+    for ( size_t y = 0; y < height; ++y ) {
+        for ( size_t x = 0; x < width; ++x ) {
+            (*FI) (x, y) = (*Gx) (x, y) + (*Gy) (x, y);
+
+            if ( x > 0 ) {
+                (*FI) (x, y) -= (*Gx) (x - 1, y);
+            }
+
+            if ( y > 0 ) {
+                (*FI) (x, y) -= (*Gy) (x, y - 1);
+            }
+
+            // if (fftsolver)
+            {
+                if (x == 0) {
+                    (*FI) (x, y) += (*Gx) (x, y);
+                }
+
+                if (y == 0) {
+                    (*FI) (x, y) += (*Gy) (x, y);
+                }
+            }
+
+        }
+    }
+
+    //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
+    delete Gy;
+
+    // solve pde and exponentiate (ie recover compressed image)
+    {
+        // if (fftsolver)
+        {
+            MyMutex::MyLock lock (*fftwMutex);
+            solve_pde_fft (FI, &L, Gx, multithread); //, ph);
+        }
+        delete Gx;
+        delete FI;
+        // else
+        // {
+        //     solve_pde_multigrid(&DivG, &U, ph);
+        // }
 // #ifndef NDEBUG
 //   printf("\npde residual error: %f\n", residual_pde(&U, &DivG));
 // #endif
-  // ph.setValue(90);
-  // if ( ph.canceled() )
-  // {
-  //     return;
-  // }
-  #pragma omp parallel
-  {
+        // ph.setValue(90);
+        // if ( ph.canceled() )
+        // {
+        //     return;
+        // }
+        #pragma omp parallel
+        {
 #ifdef __SSE2__
-  vfloat gammav = F2V(gamma);
+            vfloat gammav = F2V (gamma);
 #endif
-  #pragma omp for schedule(dynamic,16)
-  for (size_t i=0 ; i<height ; i++) {
-      size_t j = 0;
+            #pragma omp for schedule(dynamic,16)
+
+            for (size_t i = 0 ; i < height ; i++) {
+                size_t j = 0;
 #ifdef __SSE2__
-      for(; j < width - 3; j+=4) {
-          STVFU(L[i][j], xexpf(gammav * LVFU(L[i][j])));
-      }
+
+                for (; j < width - 3; j += 4) {
+                    STVFU (L[i][j], xexpf (gammav * LVFU (L[i][j])));
+                }
+
 #endif
-      for(; j < width; j++) {
-          L[i][j] = xexpf( gamma * L[i][j]);
-      }
-  }
-  }
-  }
-  // ph.setValue(95);
 
-  // remove percentile of min and max values and renormalize
-  float cut_min = 0.01f * black_point;
-  float cut_max = 1.0f - 0.01f * white_point;
-  assert(cut_min>=0.0f && (cut_max<=1.0f) && (cut_min<cut_max));
-  findMaxMinPercentile(L, cut_min, minLum, cut_max, maxLum);
-  float dividor = (maxLum - minLum);
-
-#pragma omp parallel for
-  for (size_t i = 0; i < height; ++i) {
-    for (size_t j = 0; j < width; ++j) {
-        L[i][j] = std::max((L[i][j] - minLum) / dividor, 0.f);
-        // note, we intentionally do not cut off values > 1.0
+                for (; j < width; j++) {
+                    L[i][j] = xexpf ( gamma * L[i][j]);
+                }
+            }
+        }
+    }
+    // ph.setValue(95);
+
+    // remove percentile of min and max values and renormalize
+    float cut_min = 0.01f * black_point;
+    float cut_max = 1.0f - 0.01f * white_point;
+    assert (cut_min >= 0.0f && (cut_max <= 1.0f) && (cut_min < cut_max));
+    findMaxMinPercentile (L, cut_min, minLum, cut_max, maxLum);
+    float dividor = (maxLum - minLum);
+
+    #pragma omp parallel for
+
+    for (size_t i = 0; i < height; ++i) {
+        for (size_t j = 0; j < width; ++j) {
+            L[i][j] = std::max ((L[i][j] - minLum) / dividor, 0.f);
+            // note, we intentionally do not cut off values > 1.0
+        }
     }
-  }
 }
 
 
@@ -836,91 +869,94 @@ void tmo_fattal02(size_t width,
 
 // returns T = EVy A EVx^tr
 // note, modifies input data
-void transform_ev2normal(Array2Df *A, Array2Df *T)
+void transform_ev2normal (Array2Df *A, Array2Df *T)
 {
-  int width = A->getCols();
-  int height = A->getRows();
-  assert((int)T->getCols()==width && (int)T->getRows()==height);
+    int width = A->getCols();
+    int height = A->getRows();
+    assert ((int)T->getCols() == width && (int)T->getRows() == height);
 
-  // the discrete cosine transform is not exactly the transform needed
-  // need to scale input values to get the right transformation
-  #pragma omp parallel for
-  for(int y=1 ; y<height-1 ; y++ )
-    for(int x=1 ; x<width-1 ; x++ )
-      (*A)(x,y)*=0.25f;
+    // the discrete cosine transform is not exactly the transform needed
+    // need to scale input values to get the right transformation
+    #pragma omp parallel for
 
-  for(int x=1 ; x<width-1 ; x++ )
-  {
-    (*A)(x,0)*=0.5f;
-    (*A)(x,height-1)*=0.5f;
-  }
-  for(int y=1 ; y<height-1 ; y++ )
-  {
-    (*A)(0,y)*=0.5;
-    (*A)(width-1,y)*=0.5f;
-  }
+    for (int y = 1 ; y < height - 1 ; y++ )
+        for (int x = 1 ; x < width - 1 ; x++ ) {
+            (*A) (x, y) *= 0.25f;
+        }
 
-  // note, fftw provides its own memory allocation routines which
-  // ensure that memory is properly 16/32 byte aligned so it can
-  // use SSE/AVX operations (2/4 double ops in parallel), if our
-  // data is not properly aligned fftw won't use SSE/AVX
-  // (I believe new() aligns memory to 16 byte so avoid overhead here)
-  //
-  // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
-  // fftwf_free(in);
+    for (int x = 1 ; x < width - 1 ; x++ ) {
+        (*A) (x, 0) *= 0.5f;
+        (*A) (x, height - 1) *= 0.5f;
+    }
 
-  // executes 2d discrete cosine transform
-  fftwf_plan p;
-  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
-                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
-  fftwf_execute(p);
-  fftwf_destroy_plan(p);
+    for (int y = 1 ; y < height - 1 ; y++ ) {
+        (*A) (0, y) *= 0.5;
+        (*A) (width - 1, y) *= 0.5f;
+    }
+
+    // note, fftw provides its own memory allocation routines which
+    // ensure that memory is properly 16/32 byte aligned so it can
+    // use SSE/AVX operations (2/4 double ops in parallel), if our
+    // data is not properly aligned fftw won't use SSE/AVX
+    // (I believe new() aligns memory to 16 byte so avoid overhead here)
+    //
+    // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
+    // fftwf_free(in);
+
+    // executes 2d discrete cosine transform
+    fftwf_plan p;
+    p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
+                           FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+    fftwf_execute (p);
+    fftwf_destroy_plan (p);
 }
 
 
 // returns T = EVy^-1 * A * (EVx^-1)^tr
-void transform_normal2ev(Array2Df *A, Array2Df *T)
+void transform_normal2ev (Array2Df *A, Array2Df *T)
 {
-  int width = A->getCols();
-  int height = A->getRows();
-  assert((int)T->getCols()==width && (int)T->getRows()==height);
+    int width = A->getCols();
+    int height = A->getRows();
+    assert ((int)T->getCols() == width && (int)T->getRows() == height);
 
-  // executes 2d discrete cosine transform
-  fftwf_plan p;
-  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
-                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
-  fftwf_execute(p);
-  fftwf_destroy_plan(p);
+    // executes 2d discrete cosine transform
+    fftwf_plan p;
+    p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
+                           FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+    fftwf_execute (p);
+    fftwf_destroy_plan (p);
 
-  // need to scale the output matrix to get the right transform
-  float factor = (1.0f/((height-1)*(width-1)));
-#pragma omp parallel for
-  for(int y=0 ; y<height ; y++ )
-    for(int x=0 ; x<width ; x++ )
-      (*T)(x,y)*= factor;
-  for(int x=0 ; x<width ; x++ )
-  {
-    (*T)(x,0)*=0.5f;
-    (*T)(x,height-1)*=0.5f;
-  }
-  for(int y=0 ; y<height ; y++ )
-  {
-    (*T)(0,y)*=0.5f;
-    (*T)(width-1,y)*=0.5f;
-  }
+    // need to scale the output matrix to get the right transform
+    float factor = (1.0f / ((height - 1) * (width - 1)));
+    #pragma omp parallel for
+
+    for (int y = 0 ; y < height ; y++ )
+        for (int x = 0 ; x < width ; x++ ) {
+            (*T) (x, y) *= factor;
+        }
+
+    for (int x = 0 ; x < width ; x++ ) {
+        (*T) (x, 0) *= 0.5f;
+        (*T) (x, height - 1) *= 0.5f;
+    }
+
+    for (int y = 0 ; y < height ; y++ ) {
+        (*T) (0, y) *= 0.5f;
+        (*T) (width - 1, y) *= 0.5f;
+    }
 }
 
 // returns the eigenvalues of the 1d laplace operator
-std::vector<double> get_lambda(int n)
+std::vector<double> get_lambda (int n)
 {
-  assert(n>1);
-  std::vector<double> v(n);
-  for (int i=0; i<n; i++)
-  {
-    v[i]=-4.0*SQR(sin((double)i/(2*(n-1))*RT_PI));
-  }
+    assert (n > 1);
+    std::vector<double> v (n);
 
-  return v;
+    for (int i = 0; i < n; i++) {
+        v[i] = -4.0 * SQR (sin ((double)i / (2 * (n - 1)) * RT_PI));
+    }
+
+    return v;
 }
 
 // // makes boundary conditions compatible so that a solution exists
@@ -967,99 +1003,105 @@ std::vector<double> get_lambda(int n)
 // not modified and the equation might not have a solution but an
 // approximate solution with a minimum error is then calculated
 // double precision version
-void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
+void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
-   // ph.setValue(20);
-  //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
-  int width = F->getCols();
-  int height = F->getRows();
-  assert((int)U->getCols()==width && (int)U->getRows()==height);
-  assert(buf->getCols()==width && buf->getRows()==height);
+    // ph.setValue(20);
+    //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
+    int width = F->getCols();
+    int height = F->getRows();
+    assert ((int)U->getCols() == width && (int)U->getRows() == height);
+    assert (buf->getCols() == width && buf->getRows() == height);
 
-  // activate parallel execution of fft routines
+    // activate parallel execution of fft routines
 #ifdef RT_FFTW3F_OMP
-  if (multithread) {
-      fftwf_init_threads();
-      fftwf_plan_with_nthreads( omp_get_max_threads() );
-  }
+
+    if (multithread) {
+        fftwf_init_threads();
+        fftwf_plan_with_nthreads ( omp_get_max_threads() );
+    }
+
 // #else
 //   fftwf_plan_with_nthreads( 2 );
 #endif
 
-  // in general there might not be a solution to the Poisson pde
-  // with Neumann boundary conditions unless the boundary satisfies
-  // an integral condition, this function modifies the boundary so that
-  // the condition is exactly satisfied
-  // if(adjust_bound)
-  // {
-  //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
-  //   make_compatible_boundary(F);
-  // }
+    // in general there might not be a solution to the Poisson pde
+    // with Neumann boundary conditions unless the boundary satisfies
+    // an integral condition, this function modifies the boundary so that
+    // the condition is exactly satisfied
+    // if(adjust_bound)
+    // {
+    //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
+    //   make_compatible_boundary(F);
+    // }
 
-  // transforms F into eigenvector space: Ftr =
-  //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
-  Array2Df* F_tr = buf; //new Array2Df(width,height);
-  transform_normal2ev(F, F_tr);
-  // TODO: F no longer needed so could release memory, but as it is an
-  // input parameter we won't do that
-  // ph.setValue(50);
-  // if (ph.canceled())
-  // {
-  //   delete F_tr;
-  //   return;
-  // }
+    // transforms F into eigenvector space: Ftr =
+    //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
+    Array2Df* F_tr = buf; //new Array2Df(width,height);
+    transform_normal2ev (F, F_tr);
+    // TODO: F no longer needed so could release memory, but as it is an
+    // input parameter we won't do that
+    // ph.setValue(50);
+    // if (ph.canceled())
+    // {
+    //   delete F_tr;
+    //   return;
+    // }
 
-  //DEBUG_STR << "solve_pde_fft: F_tr(0,0) = " << (*F_tr)(0,0);
-  //DEBUG_STR << " (must be 0 for solution to exist)" << std::endl;
+    //DEBUG_STR << "solve_pde_fft: F_tr(0,0) = " << (*F_tr)(0,0);
+    //DEBUG_STR << " (must be 0 for solution to exist)" << std::endl;
 
-  // in the eigenvector space the solution is very simple
-  //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
+    // in the eigenvector space the solution is very simple
+    //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
 //  Array2Df* U_tr = new Array2Df(width,height);
-  std::vector<double> l1=get_lambda(height);
-  std::vector<double> l2=get_lambda(width);
+    std::vector<double> l1 = get_lambda (height);
+    std::vector<double> l2 = get_lambda (width);
 
-#pragma omp parallel for
-  for(int y=0 ; y<height ; y++ )
-  {
-    for(int x=0 ; x<width ; x++ )
-    {
-        (*F_tr)(x,y)=(*F_tr)(x,y)/(l1[y]+l2[x]);
+    #pragma omp parallel for
+
+    for (int y = 0 ; y < height ; y++ ) {
+        for (int x = 0 ; x < width ; x++ ) {
+            (*F_tr) (x, y) = (*F_tr) (x, y) / (l1[y] + l2[x]);
+        }
     }
-  }
-  (*F_tr)(0,0)=0.f; // any value ok, only adds a const to the solution
 
-  // transforms U_tr back to the normal space
-  //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
-  transform_ev2normal(F_tr, U);
+    (*F_tr) (0, 0) = 0.f; // any value ok, only adds a const to the solution
+
+    // transforms U_tr back to the normal space
+    //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
+    transform_ev2normal (F_tr, U);
 //  delete F_tr;    // no longer needed so release memory
 
-  // the solution U as calculated will satisfy something like int U = 0
-  // since for any constant c, U-c is also a solution and we are mainly
-  // working in the logspace of (0,1) data we prefer to have
-  // a solution which has no positive values: U_new(x,y)=U(x,y)-max
-  // (not really needed but good for numerics as we later take exp(U))
-  //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
-  float max=0.f;
-  #pragma omp parallel for reduction(max:max)
-  for(int i=0; i<width*height; i++) {
-    max = std::max(max, (*U)(i));
-  }
+    // the solution U as calculated will satisfy something like int U = 0
+    // since for any constant c, U-c is also a solution and we are mainly
+    // working in the logspace of (0,1) data we prefer to have
+    // a solution which has no positive values: U_new(x,y)=U(x,y)-max
+    // (not really needed but good for numerics as we later take exp(U))
+    //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
+    float max = 0.f;
+    #pragma omp parallel for reduction(max:max)
 
-  #pragma omp parallel for
-  for(int i=0; i<width*height; i++) {
-    (*U)(i)-=max;
-  }
+    for (int i = 0; i < width * height; i++) {
+        max = std::max (max, (*U) (i));
+    }
 
-  // fft parallel threads cleanup, better handled outside this function?
+    #pragma omp parallel for
+
+    for (int i = 0; i < width * height; i++) {
+        (*U) (i) -= max;
+    }
+
+    // fft parallel threads cleanup, better handled outside this function?
 #ifdef RT_FFTW3F_OMP
-  if (multithread) {
-      fftwf_cleanup_threads();
-  }
+
+    if (multithread) {
+        fftwf_cleanup_threads();
+    }
+
 #endif
 
-  // ph.setValue(90);
-  //DEBUG_STR << "solve_pde_fft: done" << std::endl;
+    // ph.setValue(90);
+    //DEBUG_STR << "solve_pde_fft: done" << std::endl;
 }
 
 
@@ -1092,21 +1134,21 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*,
  * RT code from here on
  *****************************************************************************/
 
-inline float get_bilinear_value(const Array2Df &src, float x, float y)
+inline float get_bilinear_value (const Array2Df &src, float x, float y)
 {
     // Get integer and fractional parts of numbers
     int xi = x;
     int yi = y;
     float xf = x - xi;
     float yf = y - yi;
-    int xi1 = std::min(xi+1, src.getCols()-1);
-    int yi1 = std::min(yi+1, src.getRows()-1);
- 
-    float bl = src(xi, yi);
-    float br = src(xi1, yi);
-    float tl = src(xi, yi1);
-    float tr = src(xi1, yi1);
- 
+    int xi1 = std::min (xi + 1, src.getCols() - 1);
+    int yi1 = std::min (yi + 1, src.getRows() - 1);
+
+    float bl = src (xi, yi);
+    float br = src (xi1, yi);
+    float tl = src (xi, yi1);
+    float tr = src (xi1, yi1);
+
     // interpolate
     float b = xf * br + (1.f - xf) * bl;
     float t = xf * tr + (1.f - xf) * tl;
@@ -1115,23 +1157,25 @@ inline float get_bilinear_value(const Array2Df &src, float x, float y)
 }
 
 
-void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
+void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread)
 {
-    float col_scale = float(src.getCols())/float(dst.getCols());
-    float row_scale = float(src.getRows())/float(dst.getRows());
+    float col_scale = float (src.getCols()) / float (dst.getCols());
+    float row_scale = float (src.getRows()) / float (dst.getRows());
 
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
+
     for (int y = 0; y < dst.getRows(); ++y) {
         float ymrs = y * row_scale;
+
         for (int x = 0; x < dst.getCols(); ++x) {
-            dst(x, y) = get_bilinear_value(src, x * col_scale, ymrs);
+            dst (x, y) = get_bilinear_value (src, x * col_scale, ymrs);
         }
     }
 }
 
-void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
+void rescale_nearest (const Array2Df &src, Array2Df &dst, bool multithread)
 {
     const int width = src.getCols();
     const int height = src.getRows();
@@ -1141,26 +1185,28 @@ void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
+
     for (int y = 0; y < nh; ++y) {
         int sy = y * height / nh;
+
         for (int x = 0; x < nw; ++x) {
             int sx = x * width / nw;
-            dst(x, y) = src(sx, sy);
+            dst (x, y) = src (sx, sy);
         }
     }
 }
 
 
-inline float luminance(float r, float g, float b, TMatrix ws)
+inline float luminance (float r, float g, float b, TMatrix ws)
 {
     return r * ws[1][0] + g * ws[1][1] + b * ws[1][2];
 }
 
 
-inline int round_up_pow2(int dim)
+inline int round_up_pow2 (int dim)
 {
     // from https://graphics.stanford.edu/~seander/bithacks.html
-    assert(dim > 0);
+    assert (dim > 0);
     unsigned int v = dim;
     v--;
     v |= v >> 1;
@@ -1172,7 +1218,7 @@ inline int round_up_pow2(int dim)
     return v;
 }
 
-inline int find_fast_dim(int dim)
+inline int find_fast_dim (int dim)
 {
     // as per the FFTW docs:
     //
@@ -1183,39 +1229,42 @@ inline int find_fast_dim(int dim)
     // the above form. This is not exhaustive, but should be ok for pictures
     // up to 100MPix at least
 
-    int d1 = round_up_pow2(dim);
+    int d1 = round_up_pow2 (dim);
     std::vector<int> d = {
-        d1/128 * 65,
-        d1/64 * 33,
-        d1/512 * 273,
-        d1/16 * 9,
-        d1/8 * 5,
-        d1/16 * 11,
-        d1/128 * 91,
-        d1/4 * 3,
-        d1/64 * 49,
-        d1/16 * 13,
-        d1/8 * 7,
+        d1 / 128 * 65,
+        d1 / 64 * 33,
+        d1 / 512 * 273,
+        d1 / 16 * 9,
+        d1 / 8 * 5,
+        d1 / 16 * 11,
+        d1 / 128 * 91,
+        d1 / 4 * 3,
+        d1 / 64 * 49,
+        d1 / 16 * 13,
+        d1 / 8 * 7,
         d1
     };
+
     for (size_t i = 0; i < d.size(); ++i) {
         if (d[i] >= dim) {
             return d[i];
         }
     }
-    assert(false);
+
+    assert (false);
     return dim;
 }
 
 } // namespace
 
 
-void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
+void ImProcFunctions::ToneMapFattal02 (Imagefloat *rgb)
 {
     BENCHFUN
     const int detail_level = 3;
 
     float alpha = 1.f;
+
     if (params->fattal.threshold < 0) {
         alpha += (params->fattal.threshold * 0.9f) / 100.f;
     } else if (params->fattal.threshold > 0) {
@@ -1223,7 +1272,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     }
 
     float beta = 1.f - (params->fattal.amount * 0.3f) / 100.f;
-    
+
     // sanity check
     if (alpha <= 0 || beta <= 0) {
         return;
@@ -1231,35 +1280,38 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 
     int w = rgb->getWidth();
     int h = rgb->getHeight();
-    
-    Array2Df Yr(w, h);
+
+    Array2Df Yr (w, h);
 
     const float epsilon = 1e-4f;
     const float luminance_noise_floor = 65.535f;
     const float min_luminance = 1.f;
-    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.working);
+    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix (params->icm.working);
 
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
+
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            Yr(x, y) = std::max(luminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x), ws), min_luminance); // clip really black pixels
+            Yr (x, y) = std::max (luminance (rgb->r (y, x), rgb->g (y, x), rgb->b (y, x), ws), min_luminance); // clip really black pixels
         }
     }
+
     // median filter on the deep shadows, to avoid boosting noise
     // because w2 >= w and h2 >= h, we can use the L buffer as temporary buffer for Median_Denoise()
-    int w2 = find_fast_dim(w) + 1;
-    int h2 = find_fast_dim(h) + 1;
-    Array2Df L(w2, h2);
+    int w2 = find_fast_dim (w) + 1;
+    int h2 = find_fast_dim (h) + 1;
+    Array2Df L (w2, h2);
     {
 #ifdef _OPENMP
         int num_threads = multiThread ? omp_get_max_threads() : 1;
 #else
         int num_threads = 1;
 #endif
-        float r = float(std::max(w, h)) / float(RT_dimension_cap);
+        float r = float (std::max (w, h)) / float (RT_dimension_cap);
         Median med;
+
         if (r >= 3) {
             med = Median::TYPE_7X7;
         } else if (r >= 2) {
@@ -1269,7 +1321,8 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         } else {
             med = Median::TYPE_3X3_STRONG;
         }
-        Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
+
+        Median_Denoise (Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
     }
 
     float noise = alpha * 0.01f;
@@ -1279,27 +1332,29 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    rescale_nearest(Yr, L, multiThread);
-    tmo_fattal02(w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
+    rescale_nearest (Yr, L, multiThread);
+    tmo_fattal02 (w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
 
 //    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
 #ifdef _OPENMP
     #pragma omp parallel for if(multiThread)
 #endif
+
     for (int y = 0; y < h; y++) {
         int yy = y * h2 / h;
+
         for (int x = 0; x < w; x++) {
             int xx = x * w2 / w;
-            float Y = Yr(x, y);
-            float l = std::max(L(xx, yy), epsilon) * (65535.f / Y);
-            rgb->r(y, x) = std::max(rgb->r(y, x), 0.f) * l;
-            rgb->g(y, x) = std::max(rgb->g(y, x), 0.f) * l;
-            rgb->b(y, x) = std::max(rgb->b(y, x), 0.f) * l;
-            
-            assert(std::isfinite(rgb->r(y, x)));
-            assert(std::isfinite(rgb->g(y, x)));
-            assert(std::isfinite(rgb->b(y, x)));
+            float Y = Yr (x, y);
+            float l = std::max (L (xx, yy), epsilon) * (65535.f / Y);
+            rgb->r (y, x) = std::max (rgb->r (y, x), 0.f) * l;
+            rgb->g (y, x) = std::max (rgb->g (y, x), 0.f) * l;
+            rgb->b (y, x) = std::max (rgb->b (y, x), 0.f) * l;
+
+            assert (std::isfinite (rgb->r (y, x)));
+            assert (std::isfinite (rgb->g (y, x)));
+            assert (std::isfinite (rgb->b (y, x)));
         }
     }
 }