From 77b4ad497bc8568cc38beba81a6e4a7729f77d50 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Thu, 2 Nov 2017 22:34:49 +0100
Subject: [PATCH 01/39] Integrated "Fattal02" tone-mapping operator from
 Luminance HDR

---
 CMakeLists.txt           |  11 +
 rtdata/languages/default |   6 +
 rtengine/CMakeLists.txt  |   1 +
 rtengine/improcfun.cc    |  13 +
 rtengine/improcfun.h     |   2 +
 rtengine/procevents.h    |   4 +
 rtengine/procparams.cc   |  45 ++
 rtengine/procparams.h    |  22 +
 rtengine/refreshmap.cc   |   5 +-
 rtengine/tmo_fattal02.cc | 964 +++++++++++++++++++++++++++++++++++++++
 rtgui/CMakeLists.txt     |   1 +
 rtgui/fattaltonemap.cc   | 120 +++++
 rtgui/fattaltonemap.h    |  44 ++
 rtgui/paramsedited.cc    |  17 +
 rtgui/paramsedited.h     |   9 +
 rtgui/toolpanelcoord.cc  |   2 +
 rtgui/toolpanelcoord.h   |   2 +
 17 files changed, 1267 insertions(+), 1 deletion(-)
 create mode 100644 rtengine/tmo_fattal02.cc
 create mode 100644 rtgui/fattaltonemap.cc
 create mode 100644 rtgui/fattaltonemap.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 490cfa7ca..d99a4feaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -307,6 +307,7 @@ if(WITH_SYSTEM_KLT)
     find_package(KLT REQUIRED)
 endif()
 
+
 # Check for libcanberra-gtk3 (sound events on Linux):
 if(UNIX AND(NOT APPLE))
     pkg_check_modules(CANBERRA-GTK REQUIRED libcanberra-gtk3)
@@ -341,6 +342,16 @@ if(OPTION_OMP)
     endif()
 endif()
 
+# check for libfftw3f_omp
+if(OPENMP_FOUND)
+    find_library(fftw3f_omp fftw3f_omp PATHS ${FFTW3F_LIBRARY_DIRS})
+    if(fftw3f_omp)
+        add_definitions(-DRT_FFTW3F_OMP)
+        set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARIES} ${fftw3f_omp})
+    endif()
+endif()
+
+
 # Find out whether we are building out of source:
 get_filename_component(ABS_SOURCE_DIR "${PROJECT_SOURCE_DIR}" ABSOLUTE)
 get_filename_component(ABS_BINARY_DIR "${CMAKE_BINARY_DIR}" ABSOLUTE)
diff --git a/rtdata/languages/default b/rtdata/languages/default
index f610a03bc..81a9cf231 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -719,6 +719,9 @@ HISTORY_MSG_484;CAM02 - Auto Yb scene
 HISTORY_MSG_485;Lens Correction
 HISTORY_MSG_486;Lens Correction - Camera
 HISTORY_MSG_487;Lens Correction - Lens
+HISTORY_MSG_488;HDR Tone Mapping
+HISTORY_MSG_489;HDR TM - Alpha
+HISTORY_MSG_490;HDR TM - Beta
 HISTORY_NEWSNAPSHOT;Add
 HISTORY_NEWSNAPSHOT_TOOLTIP;Shortcut: <b>Alt-s</b>
 HISTORY_SNAPSHOT;Snapshot
@@ -1925,6 +1928,9 @@ TP_SHARPENMICRO_AMOUNT;Quantity
 TP_SHARPENMICRO_LABEL;Microcontrast
 TP_SHARPENMICRO_MATRIX;3×3 matrix instead of 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformity
+TP_TM_FATTAL_LABEL;HDR Tone Mapping (Fattal02)
+TP_TM_FATTAL_ALPHA;Alpha
+TP_TM_FATTAL_BETA;Beta
 TP_VIBRANCE_AVOIDCOLORSHIFT;Avoid color shift
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;HH
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Skin-tones
diff --git a/rtengine/CMakeLists.txt b/rtengine/CMakeLists.txt
index 424b3352e..3f32872c3 100644
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@@ -112,6 +112,7 @@ set(RTENGINESOURCEFILES
     stdimagesource.cc
     utils.cc
     rtlensfun.cc
+    tmo_fattal02.cc
     )
 
 if(LENSFUN_HAS_LOAD_DIRECTORY)
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 516e0ee9d..21bc9d9c5 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3119,6 +3119,19 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
         }
     }
 
+    std::unique_ptr<Imagefloat> fattal;
+    if (params->fattal.enabled) {
+        fattal.reset(working->copy());
+        int detail_level = 3;
+        if (scale < 8) {
+            detail_level = 3;
+        } else {
+            detail_level = 0;
+        }
+        ToneMapFattal02(fattal.get(), detail_level);
+        working = fattal.get();
+    }
+
     int h_th = 0, s_th = 0;
 
     if (shmap) {
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 6c5ded3b6..53de02d3d 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -347,6 +347,8 @@ public:
     void Badpixelscam (CieImage * src, CieImage * dst, double radius, int thresh, int mode,  float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom, int hotbad);
     void BadpixelsLab (LabImage * src, LabImage * dst, double radius, int thresh, int mode, float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom);
 
+    void ToneMapFattal02(Imagefloat *rgb, int detail_level);
+    
     Image8*     lab2rgb   (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm);
     Image16*    lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm, bool bw, GammaValues *ga = nullptr);
     // CieImage *ciec;
diff --git a/rtengine/procevents.h b/rtengine/procevents.h
index 5f30374ee..2bd4107ed 100644
--- a/rtengine/procevents.h
+++ b/rtengine/procevents.h
@@ -515,6 +515,10 @@ enum ProcEvent {
     EvLensCorrMode = 484,
     EvLensCorrLensfunCamera = 485,
     EvLensCorrLensfunLens = 486,
+    // Fattal tone mapping
+    EvTMFattalEnabled = 487,
+    EvTMFattalAlpha = 488,
+    EvTMFattalBeta = 489,
 
     NUMOFEVENTS
 
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 7a3fec1ea..9bbed9d4e 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -1241,6 +1241,8 @@ void ProcParams::setDefaults ()
     epd.scale = 1.0;
     epd.reweightingIterates = 0;
 
+    fattal.setDefaults();
+
     sh.enabled = false;
     sh.hq = false;
     sh.highlights = 0;
@@ -2442,6 +2444,19 @@ int ProcParams::save (const Glib::ustring &fname, const Glib::ustring &fname2, b
             keyFile.set_integer ("EPD", "ReweightingIterates", epd.reweightingIterates);
         }
 
+// save fattal
+        if (!pedited || pedited->fattal.enabled) {
+            keyFile.set_boolean ("FattalToneMapping", "Enabled", fattal.enabled);
+        }
+
+        if (!pedited || pedited->fattal.alpha) {
+            keyFile.set_double ("FattalToneMapping", "Alpha", fattal.alpha);
+        }
+
+        if (!pedited || pedited->fattal.beta) {
+            keyFile.set_double ("FattalToneMapping", "Beta", fattal.beta);
+        }
+        
         /*
         // save lumaDenoise
         if (!pedited || pedited->lumaDenoise.enabled) keyFile.set_boolean ("Luminance Denoising", "Enabled", lumaDenoise.enabled);
@@ -5588,6 +5603,33 @@ int ProcParams::load (const Glib::ustring &fname, ParamsEdited* pedited)
             }
         }
 
+//Load FattalToneMapping
+        if (keyFile.has_group ("FattalToneMapping")) {
+            if (keyFile.has_key ("FattalToneMapping", "Enabled")) {
+                fattal.enabled = keyFile.get_boolean ("FattalToneMapping", "Enabled");
+
+                if (pedited) {
+                    pedited->fattal.enabled = true;
+                }
+            }
+
+            if (keyFile.has_key ("FattalToneMapping", "Alpha")) {
+                fattal.alpha = keyFile.get_double ("FattalToneMapping", "Alpha");
+
+                if (pedited) {
+                    pedited->fattal.alpha = true;
+                }
+            }
+
+            if (keyFile.has_key ("FattalToneMapping", "Beta")) {
+                fattal.beta = keyFile.get_double ("FattalToneMapping", "Beta");
+
+                if (pedited) {
+                    pedited->fattal.beta = true;
+                }
+            }
+        }        
+
         // load lumaDenoise
         /*if (keyFile.has_group ("Luminance Denoising")) {
             if (keyFile.has_key ("Luminance Denoising", "Enabled"))        { lumaDenoise.enabled       = keyFile.get_boolean ("Luminance Denoising", "Enabled"); if (pedited) pedited->lumaDenoise.enabled = true; }
@@ -8446,6 +8488,9 @@ bool ProcParams::operator== (const ProcParams& other)
         && epd.edgeStopping == other.epd.edgeStopping
         && epd.scale == other.epd.scale
         && epd.reweightingIterates == other.epd.reweightingIterates
+        && fattal.enabled == other.fattal.enabled
+        && fattal.alpha == other.fattal.alpha
+        && fattal.beta == other.fattal.beta
         && defringe.enabled == other.defringe.enabled
         && defringe.radius == other.defringe.radius
         && defringe.threshold == other.defringe.threshold
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index aaf62c53f..f2d1c7b00 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -738,6 +738,27 @@ public:
     int    reweightingIterates;
 };
 
+
+// Fattal02 Tone-Mapping parameters
+class FattalToneMappingParams {
+public:
+    bool enabled;
+    double alpha;
+    double beta;
+    
+    FattalToneMappingParams()
+    {
+        setDefaults();
+    }
+    
+    void setDefaults()
+    {
+        enabled = false;
+        alpha = 1.0;
+        beta = 1.0;
+    }
+};
+
 /**
   * Parameters of the shadow/highlight enhancement
   */
@@ -1384,6 +1405,7 @@ public:
     ImpulseDenoiseParams    impulseDenoise;  ///< Impulse denoising parameters
     DirPyrDenoiseParams     dirpyrDenoise;   ///< Directional Pyramid denoising parameters
     EPDParams               epd;             ///< Edge Preserving Decomposition parameters
+    FattalToneMappingParams          fattal;          ///< Fattal02 tone mapping
     SHParams                sh;              ///< Shadow/highlight enhancement parameters
     CropParams              crop;            ///< Crop parameters
     CoarseTransformParams   coarse;          ///< Coarse transformation (90, 180, 270 deg rotation, h/v flipping) parameters
diff --git a/rtengine/refreshmap.cc b/rtengine/refreshmap.cc
index 74eda6110..95e129ad6 100644
--- a/rtengine/refreshmap.cc
+++ b/rtengine/refreshmap.cc
@@ -513,7 +513,10 @@ int refreshmap[rtengine::NUMOFEVENTS] = {
     LUMINANCECURVE,   // EvCATAutoyb
     DARKFRAME,        // EvLensCorrMode
     DARKFRAME,        // EvLensCorrLensfunCamera
-    DARKFRAME         // EvLensCorrLensfunLens
+    DARKFRAME,        // EvLensCorrLensfunLens
+    RGBCURVE,         // EvTMFattalEnabled
+    RGBCURVE,         // EvTMFattalAlpha
+    RGBCURVE          // EvTMFattalBeta
 
 };
 
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
new file mode 100644
index 000000000..d50488554
--- /dev/null
+++ b/rtengine/tmo_fattal02.cc
@@ -0,0 +1,964 @@
+/* -*- C++ -*-
+ *
+ *  This file is part of RawTherapee.
+ *
+ *  Ported from LuminanceHDR by Alberto Griggio <alberto.griggio@gmail.com>
+ *  
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file tmo_fattal02.cpp
+ * @brief TMO: Gradient Domain High Dynamic Range Compression
+ *
+ * Implementation of Gradient Domain High Dynamic Range Compression
+ * by Raanan Fattal, Dani Lischinski, Michael Werman.
+ *
+ * @author Grzegorz Krawczyk, <krawczyk@mpi-sb.mpg.de>
+ *
+ *
+ * This file is a part of LuminanceHDR package, based on pfstmo.
+ * ----------------------------------------------------------------------
+ * Copyright (C) 2003,2004 Grzegorz Krawczyk
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * ----------------------------------------------------------------------
+ *
+ * $Id: tmo_fattal02.cpp,v 1.3 2008/11/04 23:43:08 rafm Exp $
+ */
+
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <cstdio>
+#include <iostream>
+#include <iterator>
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+#include <math.h>
+#include <assert.h>
+#include <fftw3.h>
+
+#include "array2D.h"
+#include "improcfun.h"
+#include "settings.h"
+
+namespace rtengine {
+
+extern const Settings *settings;
+
+using namespace std;
+
+namespace {
+
+class Array2Df: public array2D<float> {
+    typedef array2D<float> Super;
+public:
+    Array2Df(): Super() {}
+    Array2Df(int w, int h): Super(w, h) {}
+
+    float &operator()(int w, int h)
+    {
+        return (*this)[h][w];
+    }
+
+    const float &operator()(int w, int h) const
+    {
+        return (*this)[h][w];
+    }
+
+    float &operator()(int i)
+    {
+        return static_cast<float *>(*this)[i];
+    }
+
+    const float &operator()(int i) const
+    {
+        return const_cast<Array2Df &>(*this).operator()(i);
+    }
+
+    int getRows() const
+    {
+        return const_cast<Array2Df &>(*this).height();
+    }
+
+    int getCols() const
+    {
+        return const_cast<Array2Df &>(*this).width();
+    }
+
+    float *data()
+    {
+        return static_cast<float *>(*this);
+    }
+
+    const float *data() const
+    {
+        return const_cast<Array2Df &>(*this).data();
+    }
+};
+
+
+void downSample(const Array2Df& A, Array2Df& B)
+{
+    const int width = B.getCols();
+    const int height = B.getRows();
+
+    // Note: all omp directives have been commented out. They are correct,
+    // but the loops they cover are too small to give a noticeable speed-up.
+    // The dominant cost is the pde solver, and the fft solver already uses
+    // optimised threaded fftw routines.
+    //#pragma omp parallel for
+    for ( int y=0 ; y<height ; y++ )
+    {
+        for ( int x=0 ; x<width ; x++ )
+        {
+            float p = A(2*x,2*y);
+            p += A(2*x+1,2*y);
+            p += A(2*x,2*y+1);
+            p += A(2*x+1,2*y+1);
+            B(x,y) = p * 0.25f; // p / 4.0f;
+        }
+    }
+}
+
+void gaussianBlur(const Array2Df& I, Array2Df& L)
+{
+    const int width = I.getCols();
+    const int height = I.getRows();
+
+    Array2Df T(width,height);
+
+    //--- X blur
+    //#pragma omp parallel for shared(I, T)
+    for ( int y=0 ; y<height ; y++ )
+    {
+        for ( int x=1 ; x<width-1 ; x++ )
+        {
+            float t = 2.f * I(x,y);
+            t += I(x-1,y);
+            t += I(x+1,y);
+            T(x,y) = t * 0.25f; // t / 4.f;
+        }
+        T(0,y) = ( 3.f * I(0,y)+ I(1,y) ) * 0.25f; // / 4.f;
+        T(width-1,y) = ( 3.f * I(width-1,y) + I(width-2,y) ) * 0.25f; // / 4.f;
+    }
+
+    //--- Y blur
+    //#pragma omp parallel for shared(T, L)
+    for ( int x=0 ; x<width ; x++ )
+    {
+        for ( int y=1 ; y<height-1 ; y++ )
+        {
+            float t = 2.f * T(x,y);
+            t += T(x,y-1);
+            t += T(x,y+1);
+            L(x,y) = t * 0.25f; // t/4.0f;
+        }
+        L(x,0) = ( 3.f * T(x,0) + T(x,1) ) * 0.25f; // / 4.0f;
+        L(x,height-1) = ( 3.f * T(x,height-1) + T(x,height-2) ) * 0.25f; // / 4.0f;
+    }
+}
+
+void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
+{
+  int width = H->getCols();
+  int height = H->getRows();
+  const int size = width*height;
+
+  pyramids[0] = new Array2Df(width,height);
+//#pragma omp parallel for shared(pyramids, H)
+  for( int i=0 ; i<size ; i++ )
+    (*pyramids[0])(i) = (*H)(i);
+
+  Array2Df* L = new Array2Df(width,height);
+  gaussianBlur( *pyramids[0], *L );
+
+  for ( int k=1 ; k<nlevels ; k++ )
+  {
+    width /= 2;
+    height /= 2;
+    pyramids[k] = new Array2Df(width,height);
+    downSample(*L, *pyramids[k]);
+
+    delete L;
+    L = new Array2Df(width,height);
+    gaussianBlur( *pyramids[k], *L );
+  }
+
+  delete L;
+}
+
+//--------------------------------------------------------------------
+
+float calculateGradients(Array2Df* H, Array2Df* G, int k)
+{
+  const int width = H->getCols();
+  const int height = H->getRows();
+  const float divider = pow( 2.0f, k+1 );
+  float avgGrad = 0.0f;
+
+//#pragma omp parallel for shared(G,H) reduction(+:avgGrad)
+  for( int y=0 ; y<height ; y++ )
+  {
+    for( int x=0 ; x<width ; x++ )
+    {
+      float gx, gy;
+      int w, n, e, s;
+      w = (x == 0 ? 0 : x-1);
+      n = (y == 0 ? 0 : y-1);
+      s = (y+1 == height ? y : y+1);
+      e = (x+1 == width ? x : x+1);
+
+      gx = ((*H)(w,y)-(*H)(e,y)) / divider;
+
+      gy = ((*H)(x,s)-(*H)(x,n)) / divider;
+      // note: this implicitly assumes that H(-1)=H(0)
+      // for the fft-pde solver this would need adjustment as H(-1)=H(1)
+      // is assumed, which means gx=0.0, gy=0.0 at the boundaries
+      // however, the impact is not visible so we ignore this here
+
+      (*G)(x,y) = sqrt(gx*gx+gy*gy);
+      avgGrad += (*G)(x,y);
+    }
+  }
+
+  return avgGrad / (width*height);
+}
+
+//--------------------------------------------------------------------
+
+void upSample(const Array2Df& A, Array2Df& B)
+{
+    const int width = B.getCols();
+    const int height = B.getRows();
+    const int awidth = A.getCols();
+    const int aheight = A.getRows();
+
+    //#pragma omp parallel for shared(A, B)
+    for ( int y=0 ; y<height ; y++ )
+    {
+        for ( int x=0 ; x<width ; x++ )
+        {
+            int ax = static_cast<int>(x * 0.5f); //x / 2.f;
+            int ay = static_cast<int>(y * 0.5f); //y / 2.f;
+            ax = (ax<awidth) ? ax : awidth-1;
+            ay = (ay<aheight) ? ay : aheight-1;
+
+            B(x,y) = A(ax,ay);
+        }
+    }
+//--- this code below produces 'use of uninitialized value error'
+//   int width = A->getCols();
+//   int height = A->getRows();
+//   int x,y;
+
+//   for( y=0 ; y<height ; y++ )
+//     for( x=0 ; x<width ; x++ )
+//     {
+//       (*B)(2*x,2*y) = (*A)(x,y);
+//       (*B)(2*x+1,2*y) = (*A)(x,y);
+//       (*B)(2*x,2*y+1) = (*A)(x,y);
+//       (*B)(2*x+1,2*y+1) = (*A)(x,y);
+//     }
+}
+
+
+void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
+                       float avgGrad[], int nlevels, int detail_level,
+                       float alfa, float beta, float noise)
+{
+    const bool newfattal = true;
+    int width = gradients[nlevels-1]->getCols();
+    int height = gradients[nlevels-1]->getRows();
+    Array2Df** fi = new Array2Df*[nlevels];
+    // with a single level, work directly in FI: a temporary would be leaked and FI never written
+    fi[nlevels-1] = (nlevels > 1) ? new Array2Df(width,height) : FI;
+    if (newfattal)
+    {
+        //#pragma omp parallel for shared(fi)
+        for ( int k = 0 ; k < width*height ; k++ )
+        {
+            (*fi[nlevels-1])(k) = 1.0f;
+        }
+    }
+    // walk from the coarsest level down to level 0, accumulating the attenuation in fi[k]
+    for ( int k = nlevels-1; k >= 0 ; k-- )
+    {
+        width = gradients[k]->getCols();
+        height = gradients[k]->getRows();
+
+        // only apply gradients to levels>=detail_level but at least to the coarsest
+        if ( k >= detail_level
+             ||k==nlevels-1
+             || newfattal == false)
+        {
+            //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
+            //#pragma omp parallel for shared(fi,avgGrad)
+            for ( int y = 0; y < height; y++ )
+            {
+                for ( int x = 0; x < width; x++ )
+                {
+                    float grad = ((*gradients[k])(x,y) < 1e-4f) ? 1e-4f : (*gradients[k])(x,y);
+                    float a = alfa * avgGrad[k];
+
+                    float value = powf((grad+noise)/a, beta - 1.0f);
+
+                    if (newfattal)
+                        (*fi[k])(x,y) *= value;
+                    else
+                        (*fi[k])(x,y) = value;
+                }
+            }
+        }
+
+
+        // create next level
+        if ( k>1 )
+        {
+            width = gradients[k-1]->getCols();
+            height = gradients[k-1]->getRows();
+            fi[k-1] = new Array2Df(width,height);
+        }
+        else
+            fi[0] = FI;                         // highest level -> result
+
+        if ( k>0  && newfattal )
+        {
+            upSample(*fi[k], *fi[k-1]);           // upsample to next level
+            gaussianBlur(*fi[k-1], *fi[k-1]);
+        }
+    }
+
+    for ( int k=1 ; k<nlevels ; k++ )
+    {
+        delete fi[k];
+    }
+    delete[] fi;
+}
+
+inline
+void findMaxMinPercentile(const Array2Df& I,
+                                 float minPrct, float& minLum,
+                                 float maxPrct, float& maxLum)
+{
+    const int size = I.getRows() * I.getCols();
+    const float* data = I.data();
+    // Writes to minLum/maxLum the values found at fractions minPrct/maxPrct
+    // of the ascending-sorted samples of I. Indices are clamped to the last
+    // sample so a fraction of exactly 1.0 does not make at() throw.
+    std::vector<float> vI(data, data + size);
+    std::sort(vI.begin(), vI.end());
+    minLum = vI.at( std::min(int(minPrct*vI.size()), size - 1) );
+    maxLum = vI.at( std::min(int(maxPrct*vI.size()), size - 1) );
+}
+
+void solve_pde_fft(Array2Df *F, Array2Df *U);
+
+void tmo_fattal02(size_t width,
+                  size_t height,
+                  const Array2Df& Y,
+                  Array2Df& L,
+                  float alfa,
+                  float beta,
+                  float noise,
+                  int detail_level)
+{
+// #ifdef TIMER_PROFILING
+//     msec_timer stop_watch;
+//     stop_watch.start();
+// #endif
+    static const float black_point = 0.1f;
+    static const float white_point = 0.5f;
+    static const float gamma = 1.0f; // 0.8f;
+    // static const int   detail_level = 3;
+    if ( detail_level < 0 ) detail_level = 0;
+    if ( detail_level > 3 ) detail_level = 3;
+
+  // ph.setValue(2);
+  // if (ph.canceled()) return;
+
+  int MSIZE = 32;         // minimum size of gaussian pyramid
+  // I believe a smaller value than 32 results in slightly better overall
+  // quality but I'm only applying this if the newly implemented fft solver
+  // is used in order not to change behaviour of the old version
+  // TODO: best let the user decide this value
+  // if (fftsolver)
+  {
+     MSIZE = 8;
+  }
+
+  int size = width*height;
+  // unsigned int x,y;
+  // int i, k;
+
+  // find max & min values, normalize to range 0..100 and take logarithm
+  float minLum = Y(0,0);
+  float maxLum = Y(0,0);
+  for ( int i=0 ; i<size ; i++ )
+  {
+      minLum = ( Y(i) < minLum ) ? Y(i) : minLum;
+      maxLum = ( Y(i) > maxLum ) ? Y(i) : maxLum;
+  }
+  Array2Df* H = new Array2Df(width, height);
+  //#pragma omp parallel for private(i) shared(H, Y, maxLum)
+  for ( int i=0 ; i<size ; i++ )
+  {
+      (*H)(i) = logf( 100.0f* Y(i)/maxLum + 1e-4 );
+  }
+  // ph.setValue(4);
+
+  // create gaussian pyramids
+  int mins = (width<height) ? width : height;    // smaller dimension
+  int nlevels = 0;
+  while ( mins >= MSIZE )
+  {
+    nlevels++;
+    mins /= 2;
+  }
+  // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
+  // The following line fixes a bug with particularly small images
+  if (nlevels == 0) nlevels = 1;
+
+  Array2Df** pyramids = new Array2Df*[nlevels];
+  createGaussianPyramids(H, pyramids, nlevels);
+  // ph.setValue(8);
+
+  // calculate gradients and its average values on pyramid levels
+  Array2Df** gradients = new Array2Df*[nlevels];
+  float* avgGrad = new float[nlevels];
+  for ( int k=0 ; k<nlevels ; k++ )
+  {
+    gradients[k] = new Array2Df(pyramids[k]->getCols(), pyramids[k]->getRows());
+    avgGrad[k] = calculateGradients(pyramids[k],gradients[k], k);
+  }
+  // ph.setValue(12);
+
+  // calculate fi matrix
+  Array2Df* FI = new Array2Df(width, height);
+  calculateFiMatrix(FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
+//  dumpPFS( "FI.pfs", FI, "Y" );
+  for ( int i=0 ; i<nlevels ; i++ )
+  {
+    delete pyramids[i];
+    delete gradients[i];
+  }
+  delete[] pyramids;
+  delete[] gradients;
+  delete[] avgGrad;
+  // ph.setValue(16);
+  // if (ph.canceled()){
+  //   delete FI;
+  //   delete H;
+  //   return;
+  // }
+
+
+  // attenuate gradients
+  Array2Df* Gx = new Array2Df(width, height);
+  Array2Df* Gy = new Array2Df(width, height);
+
+  // the fft solver solves the Poisson pde but with slightly different
+  // boundary conditions, so we need to adjust the assembly of the right hand
+  // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
+  // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
+  // if (fftsolver)
+    for ( size_t y=0 ; y<height ; y++ )
+      for ( size_t x=0 ; x<width ; x++ )
+      {
+        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+        unsigned int yp1 = (y+1 >= height ? height-2 : y+1);
+        unsigned int xp1 = (x+1 >= width ?  width-2  : x+1);
+        // forward differences in H, so need to use between-points approx of FI
+        (*Gx)(x,y) = ((*H)(xp1,y)-(*H)(x,y)) * 0.5*((*FI)(xp1,y)+(*FI)(x,y));
+        (*Gy)(x,y) = ((*H)(x,yp1)-(*H)(x,y)) * 0.5*((*FI)(x,yp1)+(*FI)(x,y));
+      }
+  // else
+  //   for ( size_t y=0 ; y<height ; y++ )
+  //     for ( size_t x=0 ; x<width ; x++ )
+  //     {
+  //       int s, e;
+  //       s = (y+1 == height ? y : y+1);
+  //       e = (x+1 == width ? x : x+1);
+
+  //       (*Gx)(x,y) = ((*H)(e,y)-(*H)(x,y)) * (*FI)(x,y);
+  //       (*Gy)(x,y) = ((*H)(x,s)-(*H)(x,y)) * (*FI)(x,y);
+  //     }
+  delete H;
+  delete FI;
+  // ph.setValue(18);
+
+
+//   dumpPFS( "Gx.pfs", Gx, "Y" );
+//   dumpPFS( "Gy.pfs", Gy, "Y" );
+
+  // calculate divergence
+  Array2Df DivG(width, height);
+  for ( size_t y = 0; y < height; ++y )
+  {
+      for ( size_t x = 0; x < width; ++x )
+      {
+          DivG(x,y) = (*Gx)(x,y) + (*Gy)(x,y);
+          if ( x > 0 ) DivG(x,y) -= (*Gx)(x-1,y);
+          if ( y > 0 ) DivG(x,y) -= (*Gy)(x,y-1);
+
+          // if (fftsolver)
+          {
+              if (x==0) DivG(x,y) += (*Gx)(x,y);
+              if (y==0) DivG(x,y) += (*Gy)(x,y);
+          }
+
+      }
+  }
+  delete Gx;
+  delete Gy;
+  // ph.setValue(20);
+  // if (ph.canceled())
+  // {
+  //     return;
+  // }
+
+//  dumpPFS( "DivG.pfs", DivG, "Y" );
+
+  // solve pde and exponentiate (ie recover compressed image)
+  {
+  Array2Df U(width, height);
+  // if (fftsolver)
+  {
+      solve_pde_fft(&DivG, &U);//, ph);
+  }
+  // else
+  // {
+  //     solve_pde_multigrid(&DivG, &U, ph);
+  // }
+// #ifndef NDEBUG
+//   printf("\npde residual error: %f\n", residual_pde(&U, &DivG));
+// #endif
+  // ph.setValue(90);
+  // if ( ph.canceled() )
+  // {
+  //     return;
+  // }
+
+  for ( size_t idx = 0 ; idx < height*width; ++idx )
+  {
+      L(idx) = expf( gamma * U(idx) );
+  }
+  }
+  // ph.setValue(95);
+
+  // remove percentile of min and max values and renormalize
+  float cut_min = 0.01f * black_point;
+  float cut_max = 1.0f - 0.01f * white_point;
+  assert(cut_min>=0.0f && (cut_max<=1.0f) && (cut_min<cut_max));
+  findMaxMinPercentile(L, cut_min, minLum, cut_max, maxLum);
+  for ( size_t idx = 0; idx < height*width; ++idx )
+  {
+      L(idx) = (L(idx) - minLum) / (maxLum - minLum);
+      if ( L(idx) <= 0.0f )
+      {
+          L(idx) = 0.0;
+      }
+      // note, we intentionally do not cut off values > 1.0
+  }
+// #ifdef TIMER_PROFILING
+//     stop_watch.stop_and_update();
+//     cout << endl;
+//     cout << "tmo_fattal02 = " << stop_watch.get_time() << " msec" << endl;
+// #endif
+
+  // ph.setValue(96);
+}
+
+
+/**
+ *
+ * @file pde_fft.cpp
+ * @brief Direct Poisson solver using the discrete cosine transform
+ *
+ * @author Tino Kluge (tino.kluge@hrz.tu-chemnitz.de)
+ *
+ */
+
+//////////////////////////////////////////////////////////////////////
+// Direct Poisson solver using the discrete cosine transform
+//////////////////////////////////////////////////////////////////////
+// by Tino Kluge (tino.kluge@hrz.tu-chemnitz.de)
+//
+// let U and F be matrices of order (n1,n2), ie n1=height, n2=width
+// and L_x of order (n2,n2) and L_y of order (n1,n1) and both
+// representing the 1d Laplace operator with Neumann boundary conditions,
+// ie L_x and L_y are tridiagonal matrices of the form
+//
+//  ( -2  2          )
+//  (  1 -2  1       )
+//  (     .  .  .    )
+//  (        1 -2  1 )
+//  (           2 -2 )
+//
+// then this solver computes U given F based on the equation
+//
+//  -------------------------
+//  L_y U + (L_x U^tr)^tr = F
+//  -------------------------
+//
+// Note, if the first and last row of L_x and L_y contained one's instead of
+// two's then this equation would be exactly the 2d Poisson equation with
+// Neumann boundary conditions. As a simple rule:
+// - Neumann: assume U(-1)=U(0) --> U(i-1) - 2 U(i) + U(i+1) becomes
+//        i=0: U(0) - 2 U(0) + U(1) = -U(0) + U(1)
+// - our system: assume U(-1)=U(1) --> this becomes
+//        i=0: U(1) - 2 U(0) + U(1) = -2 U(0) + 2 U(1)
+//
+// The multi grid solver solve_pde_multigrid() solves the 2d Poisson pde
+// with the right Neumann boundary conditions, U(-1)=U(0), see function
+// atimes(). This means the assembly of the right hand side F is different
+// for both solvers.
+
+// #include <iostream>
+
+// #include <boost/math/constants/constants.hpp>
+
+// #include <stdio.h>
+// #include <stdlib.h>
+// #include "arch/math.h"
+// #include <cassert>
+// #ifdef _OPENMP
+// #include <omp.h>
+// #endif
+// #include <vector>
+// #include <fftw3.h>
+
+// #include "Libpfs/progress.h"
+// #include "Libpfs/array2d.h"
+// #include "pde.h"
+
+// using namespace std;
+
+
+// #ifndef SQR
+// #define SQR(x) (x)*(x)
+// #endif
+
+
+// returns T = EVy A EVx^tr
+// note, modifies input data
+void transform_ev2normal(Array2Df *A, Array2Df *T)
+{
+  int width = A->getCols();
+  int height = A->getRows();
+  assert((int)T->getCols()==width && (int)T->getRows()==height);
+
+  // the discrete cosine transform is not exactly the transform needed
+  // need to scale input values to get the right transformation
+  for(int y=1 ; y<height-1 ; y++ )
+    for(int x=1 ; x<width-1 ; x++ )
+      (*A)(x,y)*=0.25f;
+
+  for(int x=1 ; x<width-1 ; x++ )
+  {
+    (*A)(x,0)*=0.5f;
+    (*A)(x,height-1)*=0.5f;
+  }
+  for(int y=1 ; y<height-1 ; y++ )
+  {
+    (*A)(0,y)*=0.5;
+    (*A)(width-1,y)*=0.5f;
+  }
+
+  // note, fftw provides its own memory allocation routines which
+  // ensure that memory is properly 16/32 byte aligned so it can
+  // use SSE/AVX operations (2/4 double ops in parallel), if our
+  // data is not properly aligned fftw won't use SSE/AVX
+  // (I believe new() aligns memory to 16 byte so avoid overhead here)
+  //
+  // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
+  // fftwf_free(in);
+
+  // executes 2d discrete cosine transform
+  fftwf_plan p;
+  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
+                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+  fftwf_execute(p);
+  fftwf_destroy_plan(p);
+}
+
+
+// returns T = EVy^-1 * A * (EVx^-1)^tr
+void transform_normal2ev(Array2Df *A, Array2Df *T)
+{
+  int width = A->getCols();
+  int height = A->getRows();
+  assert((int)T->getCols()==width && (int)T->getRows()==height);
+
+  // executes 2d discrete cosine transform
+  fftwf_plan p;
+  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
+                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+  fftwf_execute(p);
+  fftwf_destroy_plan(p);
+
+  // need to scale the output matrix to get the right transform
+  for(int y=0 ; y<height ; y++ )
+    for(int x=0 ; x<width ; x++ )
+      (*T)(x,y)*=(1.0f/((height-1)*(width-1)));
+
+  for(int x=0 ; x<width ; x++ )
+  {
+    (*T)(x,0)*=0.5f;
+    (*T)(x,height-1)*=0.5f;
+  }
+  for(int y=0 ; y<height ; y++ )
+  {
+    (*T)(0,y)*=0.5f;
+    (*T)(width-1,y)*=0.5f;
+  }
+}
+
+// returns the eigenvalues of the 1d laplace operator
+std::vector<double> get_lambda(int n)
+{
+  assert(n>1);
+  std::vector<double> v(n);
+  for (int i=0; i<n; i++)
+  {
+    v[i]=-4.0*SQR(sin((double)i/(2*(n-1))*RT_PI));
+  }
+
+  return v;
+}
+
+// // makes boundary conditions compatible so that a solution exists
+// void make_compatible_boundary(Array2Df *F)
+// {
+//   int width = F->getCols();
+//   int height = F->getRows();
+
+//   double sum=0.0;
+//   for(int y=1 ; y<height-1 ; y++ )
+//     for(int x=1 ; x<width-1 ; x++ )
+//       sum+=(*F)(x,y);
+
+//   for(int x=1 ; x<width-1 ; x++ )
+//     sum+=0.5*((*F)(x,0)+(*F)(x,height-1));
+
+//   for(int y=1 ; y<height-1 ; y++ )
+//     sum+=0.5*((*F)(0,y)+(*F)(width-1,y));
+
+//   sum+=0.25*((*F)(0,0)+(*F)(0,height-1)+(*F)(width-1,0)+(*F)(width-1,height-1));
+
+//   //DEBUG_STR << "compatible_boundary: int F = " << sum ;
+//   //DEBUG_STR << " (should be 0 to be solvable)" << std::endl;
+
+//   double add=-sum/(height+width-3);
+//   //DEBUG_STR << "compatible_boundary: adjusting boundary by " << add << std::endl;
+//   for(int x=0 ; x<width ; x++ )
+//   {
+//     (*F)(x,0)+=add;
+//     (*F)(x,height-1)+=add;
+//   }
+//   for(int y=1 ; y<height-1 ; y++ )
+//   {
+//     (*F)(0,y)+=add;
+//     (*F)(width-1,y)+=add;
+//   }
+// }
+
+
+
+// solves Laplace U = F with Neumann boundary conditions
+// if adjust_bound is true then boundary values in F are modified so that
+// the equation has a solution, if adjust_bound is set to false then F is
+// not modified and the equation might not have a solution but an
+// approximate solution with a minimum error is then calculated
+// double precision version
+void solve_pde_fft(Array2Df *F, Array2Df *U)/*, pfs::Progress &ph,
+                                              bool adjust_bound)*/
+{
+   // ph.setValue(20);
+  //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
+  int width = F->getCols();
+  int height = F->getRows();
+  assert((int)U->getCols()==width && (int)U->getRows()==height);
+
+  // activate parallel execution of fft routines
+#ifdef RT_FFTW3F_OMP
+  fftwf_init_threads();
+  fftwf_plan_with_nthreads( omp_get_max_threads() );
+// #else
+//   fftwf_plan_with_nthreads( 2 );
+#endif
+
+  // in general there might not be a solution to the Poisson pde
+  // with Neumann boundary conditions unless the boundary satisfies
+  // an integral condition, this function modifies the boundary so that
+  // the condition is exactly satisfied
+  // if(adjust_bound)
+  // {
+  //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
+  //   make_compatible_boundary(F);
+  // }
+
+  // transforms F into eigenvector space: Ftr =
+  //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
+  Array2Df* F_tr = new Array2Df(width,height);
+  transform_normal2ev(F, F_tr);
+  // TODO: F no longer needed so could release memory, but as it is an
+  // input parameter we won't do that
+  // ph.setValue(50);
+  // if (ph.canceled())
+  // {
+  //   delete F_tr;
+  //   return;
+  // }
+
+  //DEBUG_STR << "solve_pde_fft: F_tr(0,0) = " << (*F_tr)(0,0);
+  //DEBUG_STR << " (must be 0 for solution to exist)" << std::endl;
+
+  // in the eigenvector space the solution is very simple
+  //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
+  Array2Df* U_tr = new Array2Df(width,height);
+  std::vector<double> l1=get_lambda(height);
+  std::vector<double> l2=get_lambda(width);
+  for(int y=0 ; y<height ; y++ )
+  {
+    for(int x=0 ; x<width ; x++ )
+    {
+      if(x==0 && y==0)
+        (*U_tr)(x,y)=0.0; // any value ok, only adds a const to the solution
+      else
+        (*U_tr)(x,y)=(*F_tr)(x,y)/(l1[y]+l2[x]);
+    }
+  }
+  delete F_tr;    // no longer needed so release memory
+  // ph.setValue(55);
+
+
+  // transforms U_tr back to the normal space
+  //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
+  transform_ev2normal(U_tr, U);
+  delete U_tr;    // no longer needed so release memory
+  // ph.setValue(85);
+
+  // the solution U as calculated will satisfy something like int U = 0
+  // since for any constant c, U-c is also a solution and we are mainly
+  // working in the logspace of (0,1) data we prefer to have
+  // a solution which has no positive values: U_new(x,y)=U(x,y)-max
+  // (not really needed but good for numerics as we later take exp(U))
+  //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
+  double max=0.0;
+  for(int i=0; i<width*height; i++)
+    if(max<(*U)(i))
+      max=(*U)(i);
+
+  for(int i=0; i<width*height; i++)
+    (*U)(i)-=max;
+
+
+  // fft parallel threads cleanup, better handled outside this function?
+#ifdef RT_FFTW3F_OMP
+  fftwf_cleanup_threads();
+#endif
+
+  // ph.setValue(90);
+  //DEBUG_STR << "solve_pde_fft: done" << std::endl;
+}
+
+
+// ---------------------------------------------------------------------
+// the functions below are only for test purposes to check the accuracy
+// of the pde solvers
+
+
+// // returns the norm of (Laplace U - F) of all interior points
+// // useful to compare solvers
+// float residual_pde(Array2Df* U, Array2Df* F)
+// {
+//   int width = U->getCols();
+//   int height = U->getRows();
+//   assert((int)F->getCols()==width && (int)F->getRows()==height);
+
+//   double res=0.0;
+//   for(int y=1;y<height-1;y++)
+//     for(int x=1;x<width-1;x++)
+//     {
+//       double laplace=-4.0*(*U)(x,y)+(*U)(x-1,y)+(*U)(x+1,y)
+//                      +(*U)(x,y-1)+(*U)(x,y+1);
+//       res += SQR( laplace-(*F)(x,y) );
+//     }
+//   return static_cast<float>( sqrt(res) );
+// }
+
+
+} // namespace
+
+
+void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb, int detail_level)
+{
+    int w = rgb->getWidth();
+    int h = rgb->getHeight();
+    
+    Array2Df Yr(w, h);
+    Array2Df L(w, h);
+
+    rgb->normalizeFloatTo1();
+
+    #pragma omp parallel for if (multiThread)
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            Yr(x, y) = Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x));
+        }
+    }
+
+    float alpha = params->fattal.alpha;
+    float beta = params->fattal.beta;
+    float noise = alpha * 0.01f;
+
+    if (settings->verbose) {
+        std::cout << "ToneMapFattal02: alpha = " << alpha << ", beta = " << beta
+                  << ", detail_level = " << detail_level << std::endl;
+    }
+
+    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level);
+
+    const float epsilon = 1e-4f;
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            float Y = std::max(Yr(x, y), epsilon);
+            float l = std::max(L(x, y), epsilon);
+            rgb->r(y, x) = std::max(rgb->r(y, x)/Y, 0.f) * l;
+            rgb->g(y, x) = std::max(rgb->g(y, x)/Y, 0.f) * l;
+            rgb->b(y, x) = std::max(rgb->b(y, x)/Y, 0.f) * l;
+        }
+    }
+
+    rgb->normalizeFloatTo65535();
+}
+
+} // namespace rtengine
diff --git a/rtgui/CMakeLists.txt b/rtgui/CMakeLists.txt
index e8bbf18b1..36c7a4034 100644
--- a/rtgui/CMakeLists.txt
+++ b/rtgui/CMakeLists.txt
@@ -147,6 +147,7 @@ set(NONCLISOURCEFILES
     xtransprocess.cc
     xtransrawexposure.cc
     zoompanel.cc
+    fattaltonemap.cc
     )
 
 include_directories(BEFORE "${CMAKE_CURRENT_BINARY_DIR}")
diff --git a/rtgui/fattaltonemap.cc b/rtgui/fattaltonemap.cc
new file mode 100644
index 000000000..f9ef660f4
--- /dev/null
+++ b/rtgui/fattaltonemap.cc
@@ -0,0 +1,120 @@
+/** -*- C++ -*-
+ *  
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2017 Alberto Griggio <alberto.griggio@gmail.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include "fattaltonemap.h"
+#include <iomanip>
+#include <cmath>
+
+using namespace rtengine;
+using namespace rtengine::procparams;
+
+FattalToneMapping::FattalToneMapping(): FoldableToolPanel(this, "fattal", M("TP_TM_FATTAL_LABEL"), true, true)
+{
+
+//    setEnabledTooltipMarkup(M("TP_EPD_TOOLTIP"));
+    
+    alpha = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_ALPHA"), 0.0, 2.0, 0.01, 1.0));
+    beta = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_BETA"), 0.0, 2.0, 0.01, 1.0));
+
+    alpha->setAdjusterListener(this);
+    beta->setAdjusterListener(this);
+
+    alpha->show();
+    beta->show();
+
+    pack_start(*alpha);
+    pack_start(*beta);
+}
+
+void FattalToneMapping::read(const ProcParams *pp, const ParamsEdited *pedited)
+{
+    disableListener();
+
+    if(pedited) {
+        alpha->setEditedState(pedited->fattal.alpha ? Edited : UnEdited);
+        beta->setEditedState(pedited->fattal.beta ? Edited : UnEdited);
+        set_inconsistent(multiImage && !pedited->fattal.enabled);
+    }
+
+    setEnabled(pp->fattal.enabled);
+    alpha->setValue(pp->fattal.alpha);
+    beta->setValue(pp->fattal.beta);
+
+    enableListener();
+}
+
+void FattalToneMapping::write(ProcParams *pp, ParamsEdited *pedited)
+{
+    pp->fattal.alpha = alpha->getValue();
+    pp->fattal.beta = beta->getValue();
+    pp->fattal.enabled = getEnabled();
+
+    if(pedited) {
+        pedited->fattal.alpha = alpha->getEditedState();
+        pedited->fattal.beta = beta->getEditedState();
+        pedited->fattal.enabled = !get_inconsistent();
+    }
+}
+
+void FattalToneMapping::setDefaults(const ProcParams *defParams, const ParamsEdited *pedited)
+{
+    alpha->setDefault(defParams->fattal.alpha);
+    beta->setDefault(defParams->fattal.beta);
+
+    if(pedited) {
+        alpha->setDefaultEditedState(pedited->fattal.alpha ? Edited : UnEdited);
+        beta->setDefaultEditedState(pedited->fattal.beta ? Edited : UnEdited);
+    } else {
+        alpha->setDefaultEditedState(Irrelevant);
+        beta->setDefaultEditedState(Irrelevant);
+    }
+}
+
+void FattalToneMapping::adjusterChanged(Adjuster* a, double newval)
+{
+    if(listener && getEnabled()) {
+        if(a == alpha) {
+            listener->panelChanged(EvTMFattalAlpha, Glib::ustring::format(std::setw(2), std::fixed, std::setprecision(2), a->getValue()));
+        } else if(a == beta) {
+            listener->panelChanged(EvTMFattalBeta, Glib::ustring::format(std::setw(2), std::fixed, std::setprecision(2), a->getValue()));
+        }
+    }
+}
+
+void FattalToneMapping::enabledChanged ()
+{
+    if (listener) {
+        if (get_inconsistent()) {
+            listener->panelChanged (EvTMFattalEnabled, M("GENERAL_UNCHANGED"));
+        } else if (getEnabled()) {
+            listener->panelChanged (EvTMFattalEnabled, M("GENERAL_ENABLED"));
+        } else {
+            listener->panelChanged (EvTMFattalEnabled, M("GENERAL_DISABLED"));
+        }
+    }
+}
+
+void FattalToneMapping::setBatchMode(bool batchMode)
+{
+    ToolPanel::setBatchMode(batchMode);
+
+    alpha->showEditedCB();
+    beta->showEditedCB();
+}
+
diff --git a/rtgui/fattaltonemap.h b/rtgui/fattaltonemap.h
new file mode 100644
index 000000000..8ee93aa3c
--- /dev/null
+++ b/rtgui/fattaltonemap.h
@@ -0,0 +1,44 @@
+/** -*- C++ -*-
+ *  
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2017 Alberto Griggio <alberto.griggio@gmail.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <gtkmm.h>
+#include "adjuster.h"
+#include "toolpanel.h"
+
+class FattalToneMapping: public ToolParamBlock, public AdjusterListener, public FoldableToolPanel
+{
+protected:
+    Adjuster *alpha;
+    Adjuster *beta;
+
+public:
+
+    FattalToneMapping();
+
+    void read           (const rtengine::procparams::ProcParams* pp, const ParamsEdited* pedited = nullptr);
+    void write          (rtengine::procparams::ProcParams* pp, ParamsEdited* pedited = nullptr);
+    void setDefaults    (const rtengine::procparams::ProcParams* defParams, const ParamsEdited* pedited = nullptr);
+    void setBatchMode   (bool batchMode);
+
+    void adjusterChanged (Adjuster* a, double newval);
+    void enabledChanged  ();
+};
+
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 0827a0d7f..17e53f2dd 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -264,6 +264,9 @@ void ParamsEdited::set (bool v)
     epd.edgeStopping        = v;
     epd.scale               = v;
     epd.reweightingIterates = v;
+    fattal.enabled   = v;
+    fattal.alpha     = v;
+    fattal.beta      = v;
     sh.enabled       = v;
     sh.hq            = v;
     sh.highlights    = v;
@@ -804,6 +807,10 @@ void ParamsEdited::initFrom (const std::vector<rtengine::procparams::ProcParams>
         epd.scale = epd.scale && p.epd.scale == other.epd.scale;
         epd.reweightingIterates = epd.reweightingIterates && p.epd.reweightingIterates == other.epd.reweightingIterates;
 
+        fattal.enabled = fattal.enabled && p.fattal.enabled == other.fattal.enabled;
+        fattal.alpha = fattal.alpha && p.fattal.alpha == other.fattal.alpha;
+        fattal.beta = fattal.beta && p.fattal.beta == other.fattal.beta;
+        
         sh.enabled = sh.enabled && p.sh.enabled == other.sh.enabled;
         sh.hq = sh.hq && p.sh.hq == other.sh.hq;
         sh.highlights = sh.highlights && p.sh.highlights == other.sh.highlights;
@@ -1972,6 +1979,16 @@ void ParamsEdited::combine (rtengine::procparams::ProcParams& toEdit, const rten
         toEdit.epd.reweightingIterates    = mods.epd.reweightingIterates;
     }
 
+    if (fattal.enabled) {
+        toEdit.fattal.enabled = mods.fattal.enabled;
+    }
+    if (fattal.alpha) {
+        toEdit.fattal.alpha = mods.fattal.alpha;
+    }
+    if (fattal.beta) {
+        toEdit.fattal.beta = mods.fattal.beta;
+    }    
+
     if (sh.enabled) {
         toEdit.sh.enabled         = mods.sh.enabled;
     }
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index 46a68d3f5..552aa8515 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -365,6 +365,14 @@ public:
 };
 
 
+class FattalToneMappingParamsEdited {
+public:
+    bool enabled;
+    bool alpha;
+    bool beta;
+};
+
+
 class SHParamsEdited
 {
 
@@ -800,6 +808,7 @@ public:
     DefringeParamsEdited          defringe;
     DirPyrDenoiseParamsEdited     dirpyrDenoise;
     EPDParamsEdited               epd;
+    FattalToneMappingParamsEdited fattal;
     ImpulseDenoiseParamsEdited    impulseDenoise;
     SHParamsEdited                sh;
     CropParamsEdited              crop;
diff --git a/rtgui/toolpanelcoord.cc b/rtgui/toolpanelcoord.cc
index ed84ae8b6..b88827482 100644
--- a/rtgui/toolpanelcoord.cc
+++ b/rtgui/toolpanelcoord.cc
@@ -90,6 +90,7 @@ ToolPanelCoordinator::ToolPanelCoordinator (bool batch) : ipc (nullptr), hasChan
     rawexposure         = Gtk::manage (new RAWExposure ());
     bayerrawexposure    = Gtk::manage (new BayerRAWExposure ());
     xtransrawexposure   = Gtk::manage (new XTransRAWExposure ());
+    fattal              = Gtk::manage(new FattalToneMapping());
 
     // So Demosaic, Line noise filter, Green Equilibration, Ca-Correction (garder le nom de section identique!) and Black-Level will be moved in a "Bayer sensor" tool,
     // and a separate Demosaic and Black Level tool will be created in an "X-Trans sensor" tool
@@ -114,6 +115,7 @@ ToolPanelCoordinator::ToolPanelCoordinator (bool batch) : ipc (nullptr), hasChan
     addPanel (colorPanel, rgbcurves);
     addPanel (colorPanel, colortoning);
     addPanel (exposurePanel, epd);
+    addPanel (exposurePanel, fattal);
     addPanel (exposurePanel, retinex);
     addPanel (exposurePanel, pcvignette);
     addPanel (exposurePanel, gradient);
diff --git a/rtgui/toolpanelcoord.h b/rtgui/toolpanelcoord.h
index 155679687..3da061b99 100644
--- a/rtgui/toolpanelcoord.h
+++ b/rtgui/toolpanelcoord.h
@@ -78,6 +78,7 @@
 #include "colortoning.h"
 #include "filmsimulation.h"
 #include "prsharpening.h"
+#include "fattaltonemap.h"
 #include "guiutils.h"
 
 class ImageEditorCoordinator;
@@ -145,6 +146,7 @@ protected:
     RAWExposure* rawexposure;
     BayerRAWExposure* bayerrawexposure;
     XTransRAWExposure* xtransrawexposure;
+    FattalToneMapping *fattal;
 
     std::vector<PParamsChangeListener*> paramcListeners;
 

From d418b9d01dd75667e944f2f4d4557b32bba0b408 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 3 Nov 2017 14:58:27 +0100
Subject: [PATCH 02/39] various fixes to Fattal, and moved it later in the
 pipeline

---
 rtengine/dcrop.cc             |  18 +++++-
 rtengine/improccoordinator.cc |   4 ++
 rtengine/improcfun.cc         |  13 ----
 rtengine/improcfun.h          |   2 +-
 rtengine/rtthumbnail.cc       |   4 ++
 rtengine/simpleprocess.cc     |   4 ++
 rtengine/tmo_fattal02.cc      | 115 +++++++++++++++++++++++-----------
 7 files changed, 108 insertions(+), 52 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 0738e48b0..818d4d8bb 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -808,6 +808,10 @@ void Crop::update (int todo)
         parent->ipf.chromiLuminanceCurve (this, 1, labnCrop, labnCrop, parent->chroma_acurve, parent->chroma_bcurve, parent->satcurve, parent->lhskcurve,  parent->clcurve, parent->lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, dummy, dummy);
         parent->ipf.vibrance (labnCrop);
 
+        if (params.fattal.enabled) {
+            parent->ipf.ToneMapFattal02(labnCrop, 3);
+        }
+        
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) ||  (!params.colorappearance.enabled)) {
             parent->ipf.EPDToneMap (labnCrop, 5, skip);
         }
@@ -1077,6 +1081,11 @@ void Crop::freeAll ()
 namespace
 {
 
+bool check_need_full_image(const ProcParams &params)
+{
+    return params.fattal.enabled; // agriggio - maybe we can do this for wavelets too?
+}
+
 bool check_need_larger_crop_for_lcp_distortion (int fw, int fh, int x, int y, int w, int h, const ProcParams &params)
 {
     if (x == 0 && y == 0 && w == fw && h == fh) {
@@ -1139,6 +1148,14 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
     ory = by1;
     orw = bw;
     orh = bh;
+
+    if (check_need_full_image(parent->params)) {
+        orx = bx1 = 0;
+        ory = by1 = 0;
+        orw = bw = parent->fullw;
+        orh = bh = parent->fullh;
+    }
+    
     ProcParams& params = parent->params;
 
     parent->ipf.transCoord (parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
@@ -1178,7 +1195,6 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
         orh = min (y2 - y1, parent->fh - ory);
     }
 
-
     PreviewProps cp (orx, ory, orw, orh, skip);
     int orW, orH;
     parent->imgsrc->getSize (cp, orW, orH);
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index 9d265f90b..e3ff874b6 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -636,6 +636,10 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
         ipf.chromiLuminanceCurve (nullptr, pW, nprevl, nprevl, chroma_acurve, chroma_bcurve, satcurve, lhskcurve, clcurve, lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, histCCurve, histLCurve);
         ipf.vibrance (nprevl);
 
+        if (params.fattal.enabled) {
+            ipf.ToneMapFattal02(nprevl, 3);
+        }
+        
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) ||  (!params.colorappearance.enabled)) {
             ipf.EPDToneMap (nprevl, 5, scale);
         }
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 21bc9d9c5..516e0ee9d 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3119,19 +3119,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
         }
     }
 
-    std::unique_ptr<Imagefloat> fattal;
-    if (params->fattal.enabled) {
-        fattal.reset(working->copy());
-        int detail_level = 3;
-        if (scale < 8) {
-            detail_level = 3;
-        } else {
-            detail_level = 0;
-        }
-        ToneMapFattal02(fattal.get(), detail_level);
-        working = fattal.get();
-    }
-
     int h_th = 0, s_th = 0;
 
     if (shmap) {
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 53de02d3d..c1e4a1e35 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -347,7 +347,7 @@ public:
     void Badpixelscam (CieImage * src, CieImage * dst, double radius, int thresh, int mode,  float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom, int hotbad);
     void BadpixelsLab (LabImage * src, LabImage * dst, double radius, int thresh, int mode, float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom);
 
-    void ToneMapFattal02(Imagefloat *rgb, int detail_level);
+    void ToneMapFattal02(LabImage *lab, int detail_level);
     
     Image8*     lab2rgb   (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm);
     Image16*    lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm, bool bw, GammaValues *ga = nullptr);
diff --git a/rtengine/rtthumbnail.cc b/rtengine/rtthumbnail.cc
index 9bdee796b..6cc50a6a8 100644
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@@ -1275,6 +1275,10 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
 
     ipf.vibrance (labView);
 
+    if (params.fattal.enabled) {
+        ipf.ToneMapFattal02(labView, 0);
+    }
+
     if ((params.colorappearance.enabled && !params.colorappearance.tonecie) || !params.colorappearance.enabled) {
         ipf.EPDToneMap (labView, 5, 6);
     }
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index 93d2a3149..de57386dd 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -1013,6 +1013,10 @@ private:
 
         ipf.chromiLuminanceCurve (nullptr, 1, labView, labView, curve1, curve2, satcurve, lhskcurve, clcurve, lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, dummy, dummy);
 
+        if (params.fattal.enabled) {
+            ipf.ToneMapFattal02(labView, 3);
+        }
+
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) || (!params.colorappearance.enabled)) {
             ipf.EPDToneMap (labView, 5, 1);
         }
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index d50488554..7d5a866b1 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -68,6 +68,8 @@
 #include "array2D.h"
 #include "improcfun.h"
 #include "settings.h"
+#include "iccstore.h"
+
 
 namespace rtengine {
 
@@ -155,6 +157,15 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
 
     Array2Df T(width,height);
 
+    if (width < 3 || height < 3) {
+        if (&I != &L) {
+            for (int i = 0, n = width*height; i < n; ++i) {
+                L(i) = I(i);
+            }
+        }
+        return;
+    }
+
     //--- X blur
     //#pragma omp parallel for shared(I, T)
     for ( int y=0 ; y<height ; y++ )
@@ -202,10 +213,20 @@ void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
 
   for ( int k=1 ; k<nlevels ; k++ )
   {
-    width /= 2;
-    height /= 2;
-    pyramids[k] = new Array2Df(width,height);
-    downSample(*L, *pyramids[k]);
+      if (width > 2 && height > 2) {
+          width /= 2;
+          height /= 2;
+          pyramids[k] = new Array2Df(width,height);
+          downSample(*L, *pyramids[k]);
+      } else {
+          // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
+          // there), so it might happen that we have to add some padding to
+          // the gaussian pyramids
+          pyramids[k] = new Array2Df(width,height);
+          for (int j = 0, n = width*height; j < n; ++j) {
+              (*pyramids[k])(j) = (*L)(j);
+          }
+      }
 
     delete L;
     L = new Array2Df(width,height);
@@ -379,7 +400,7 @@ void findMaxMinPercentile(const Array2Df& I,
     maxLum = vI.at( int(maxPrct*vI.size()) );
 }
 
-void solve_pde_fft(Array2Df *F, Array2Df *U);
+void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread);
 
 void tmo_fattal02(size_t width,
                   size_t height,
@@ -388,7 +409,8 @@ void tmo_fattal02(size_t width,
                   float alfa,
                   float beta,
                   float noise,
-                  int detail_level)
+                  int detail_level,
+                  bool multithread)
 {
 // #ifdef TIMER_PROFILING
 //     msec_timer stop_watch;
@@ -404,15 +426,21 @@ void tmo_fattal02(size_t width,
   // ph.setValue(2);
   // if (ph.canceled()) return;
 
-  int MSIZE = 32;         // minimum size of gaussian pyramid
-  // I believe a smaller value than 32 results in slightly better overall
-  // quality but I'm only applying this if the newly implemented fft solver
-  // is used in order not to change behaviour of the old version
-  // TODO: best let the user decide this value
-  // if (fftsolver)
-  {
-     MSIZE = 8;
-  }
+    /* RT -- we use a hardcoded value of 8 for nlevels, to limit the
+     * dependency of the result on the image size. When using an auto computed
+     * nlevels value, you would get vastly different results with different
+     * image sizes, making it essentially impossible to preview the tool
+     * inside RT. With a hardcoded value, the results for the preview are much
+     * closer to those for the final image */
+  // int MSIZE = 32;         // minimum size of gaussian pyramid
+  // // I believe a smaller value than 32 results in slightly better overall
+  // // quality but I'm only applying this if the newly implemented fft solver
+  // // is used in order not to change behaviour of the old version
+  // // TODO: best let the user decide this value
+  // // if (fftsolver)
+  // {
+  //    MSIZE = 8;
+  // }
 
   int size = width*height;
   // unsigned int x,y;
@@ -435,16 +463,17 @@ void tmo_fattal02(size_t width,
   // ph.setValue(4);
 
   // create gaussian pyramids
-  int mins = (width<height) ? width : height;    // smaller dimension
-  int nlevels = 0;
-  while ( mins >= MSIZE )
-  {
-    nlevels++;
-    mins /= 2;
-  }
-  // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
-  // The following lines solves a bug with images particularly small
-  if (nlevels == 0) nlevels = 1;
+  // int mins = (width<height) ? width : height;    // smaller dimension
+  // int nlevels = 0;
+  // while ( mins >= MSIZE )
+  // {
+  //   nlevels++;
+  //   mins /= 2;
+  // }
+  // // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
+  // // The following lines solves a bug with images particularly small
+  // if (nlevels == 0) nlevels = 1;
+  const int nlevels = 7; // RT -- see above
 
   Array2Df** pyramids = new Array2Df*[nlevels];
   createGaussianPyramids(H, pyramids, nlevels);
@@ -551,7 +580,7 @@ void tmo_fattal02(size_t width,
   Array2Df U(width, height);
   // if (fftsolver)
   {
-      solve_pde_fft(&DivG, &U);//, ph);
+      solve_pde_fft(&DivG, &U, multithread);//, ph);
   }
   // else
   // {
@@ -798,7 +827,7 @@ std::vector<double> get_lambda(int n)
 // not modified and the equation might not have a solution but an
 // approximate solution with a minimum error is then calculated
 // double precision version
-void solve_pde_fft(Array2Df *F, Array2Df *U)/*, pfs::Progress &ph,
+void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
    // ph.setValue(20);
@@ -809,8 +838,10 @@ void solve_pde_fft(Array2Df *F, Array2Df *U)/*, pfs::Progress &ph,
 
   // activate parallel execution of fft routines
 #ifdef RT_FFTW3F_OMP
-  fftwf_init_threads();
-  fftwf_plan_with_nthreads( omp_get_max_threads() );
+  if (multithread) {
+      fftwf_init_threads();
+      fftwf_plan_with_nthreads( omp_get_max_threads() );
+  }
 // #else
 //   fftwf_plan_with_nthreads( 2 );
 #endif
@@ -883,7 +914,9 @@ void solve_pde_fft(Array2Df *F, Array2Df *U)/*, pfs::Progress &ph,
 
   // fft parallel threads cleanup, better handled outside this function?
 #ifdef RT_FFTW3F_OMP
-  fftwf_cleanup_threads();
+  if (multithread) {
+      fftwf_cleanup_threads();
+  }
 #endif
 
   // ph.setValue(90);
@@ -916,10 +949,7 @@ void solve_pde_fft(Array2Df *F, Array2Df *U)/*, pfs::Progress &ph,
 // }
 
 
-} // namespace
-
-
-void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb, int detail_level)
+void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level, bool multiThread)
 {
     int w = rgb->getWidth();
     int h = rgb->getHeight();
@@ -936,8 +966,8 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb, int detail_level)
         }
     }
 
-    float alpha = params->fattal.alpha;
-    float beta = params->fattal.beta;
+    // float alpha = params->fattal.alpha;
+    // float beta = params->fattal.beta;
     float noise = alpha * 0.01f;
 
     if (settings->verbose) {
@@ -945,7 +975,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb, int detail_level)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level);
+    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
     const float epsilon = 1e-4f;
     for (int y = 0; y < h; y++) {
@@ -961,4 +991,15 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb, int detail_level)
     rgb->normalizeFloatTo65535();
 }
 
+} // namespace
+
+
+void ImProcFunctions::ToneMapFattal02(LabImage *lab, int detail_level)
+{
+    Imagefloat tmp(lab->W, lab->H);
+    lab2rgb(*lab, tmp, params->icm.working);
+    tmo_fattal02_RT(&tmp, params->fattal.alpha, params->fattal.beta, detail_level, multiThread);
+    rgb2lab(tmp, *lab, params->icm.working);
+}
+
 } // namespace rtengine

From c9615d440d9afdff0f99511a7a4ad3111a61980a Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 3 Nov 2017 22:10:10 +0100
Subject: [PATCH 03/39] trying out some hacks to make Fattal results less
 dependent on the size of the input image

---
 rtengine/tmo_fattal02.cc | 92 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 7d5a866b1..9d015dfdb 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -401,6 +401,7 @@ void findMaxMinPercentile(const Array2Df& I,
 }
 
 void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread);
+void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread); // RT
 
 void tmo_fattal02(size_t width,
                   size_t height,
@@ -426,7 +427,7 @@ void tmo_fattal02(size_t width,
   // ph.setValue(2);
   // if (ph.canceled()) return;
 
-    /* RT -- we use a hardcoded value of 8 for nlevels, to limit the
+    /* RT -- we use a hardcoded value for nlevels, to limit the
      * dependency of the result on the image size. When using an auto computed
      * nlevels value, you would get vastly different results with different
      * image sizes, making it essentially impossible to preview the tool
@@ -441,6 +442,7 @@ void tmo_fattal02(size_t width,
   // {
   //    MSIZE = 8;
   // }
+        
 
   int size = width*height;
   // unsigned int x,y;
@@ -462,6 +464,41 @@ void tmo_fattal02(size_t width,
   }
   // ph.setValue(4);
 
+  /** RT - this is also here to reduce the dependency of the results on the
+   * input image size, with the primary aim of having a preview in RT that is
+   * reasonably close to the actual output image. Intuitively, what we do is
+   * to put a cap on the dimension of the image processed, so that it is close
+   * in size to the typical preview that you will see on a normal consumer
+   * monitor. (That's where the 1920 comes from here.) However, we can't
+   * simply downscale the input Y array and then upscale it on output, because
+   * that would cause a big loss of sharpness (confirmed by testing).
+   * So, we use a different method: we downscale the H array, so that we
+   * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
+   * we upscale the FI matrix later on, before it gets combined with the
+   * original input luminance array H. This seems to preserve the input
+   * sharpness and at the same time significantly reduce the dependency of the
+   * result on the input size. Clearly this is a hack, and keep in mind that I
+   * do not really know how Fattal works (it comes from LuminanceHDR almost
+   * verbatim), so this should probably be revised/reviewed by someone who
+   * knows better... also, we use a quite naive bilinear interpolation
+   * algorithm (see rescale_bilinear below), which could definitely be
+   * improved */
+  const int RT_dimension_cap = 1920;
+  int fullwidth = width;
+  int fullheight = height;
+  int dim = std::min(width, height);
+  Array2Df *fullH = nullptr;
+  if (dim > RT_dimension_cap) {
+      float s = float(RT_dimension_cap) / float(dim);
+      Array2Df *HH = new Array2Df(width * s, height * s);
+      rescale_bilinear(*H, *HH, multithread);
+      fullH = H;
+      H = HH;
+      width = H->getCols();
+      height = H->getRows();
+  }
+  /** RT */
+
   // create gaussian pyramids
   // int mins = (width<height) ? width : height;    // smaller dimension
   // int nlevels = 0;
@@ -508,6 +545,18 @@ void tmo_fattal02(size_t width,
   //   return;
   // }
 
+  /** - RT - bring back the FI image to the input size if it was downscaled */
+  if (fullH) {
+      Array2Df *FI2 = new Array2Df(fullwidth, fullheight);
+      rescale_bilinear(*FI, *FI2, multithread);
+      delete FI;
+      FI = FI2;
+      width = fullwidth;
+      height = fullheight;
+      delete H;
+      H = fullH;
+  }
+  /** RT */
 
   // attenuate gradients
   Array2Df* Gx = new Array2Df(width, height);
@@ -949,6 +998,47 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
 // }
 
 
+/*****************************************************************************
+ * RT code from here on
+ *****************************************************************************/
+
+inline float get_bilinear_value(const Array2Df &src, float x, float y)
+{
+    // Get integer and fractional parts of numbers
+    int xi = x;
+    int yi = y;
+    float xf = x - xi;
+    float yf = y - yi;
+    int xi1 = std::min(xi+1, src.getCols()-1);
+    int yi1 = std::min(yi+1, src.getRows()-1);
+ 
+    float bl = src(xi, yi);
+    float br = src(xi1, yi);
+    float tl = src(xi, yi1);
+    float tr = src(xi1, yi1);
+ 
+    // interpolate
+    float b = xf * br + (1.f - xf) * bl;
+    float t = xf * tr + (1.f - xf) * tl;
+    float pxf = yf * t + (1.f - yf) * b;
+    return pxf;
+}
+
+
+void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
+{
+    float col_scale = float(src.getCols())/float(dst.getCols());
+    float row_scale = float(src.getRows())/float(dst.getRows());
+
+    #pragma omp parallel for if (multithread)
+    for (int x = 0; x < dst.getCols(); ++x) {
+        for (int y = 0; y < dst.getRows(); ++y) {
+            dst(x, y) = get_bilinear_value(src, x * col_scale, y * row_scale);
+        }
+    }
+}
+
+
 void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level, bool multiThread)
 {
     int w = rgb->getWidth();

From f6faccc759c2076591a5533903164fb7ce39ba90 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sat, 4 Nov 2017 18:16:51 +0100
Subject: [PATCH 04/39] moved Fattal earlier in the pipeline (before rgbProc,
 right after distortion/perspective/ca)

---
 rtengine/dcrop.cc             | 11 +++++++----
 rtengine/improccoordinator.cc | 13 +++++++++----
 rtengine/improcfun.h          |  2 +-
 rtengine/rtthumbnail.cc       |  8 ++++----
 rtengine/simpleprocess.cc     |  8 ++++----
 rtengine/tmo_fattal02.cc      | 24 ++++++++++++------------
 6 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 818d4d8bb..8bcd1e6a3 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -715,6 +715,13 @@ void Crop::update (int todo)
         transCrop = nullptr;
     }
 
+    std::unique_ptr<Imagefloat> fattalCrop;
+    if ((todo & M_RGBCURVE) && params.fattal.enabled) {
+        fattalCrop.reset(baseCrop->copy());
+        parent->ipf.ToneMapFattal02(fattalCrop.get());
+        baseCrop = fattalCrop.get();
+    }
+    
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
 
         const int W = baseCrop->getWidth();
@@ -808,10 +815,6 @@ void Crop::update (int todo)
         parent->ipf.chromiLuminanceCurve (this, 1, labnCrop, labnCrop, parent->chroma_acurve, parent->chroma_bcurve, parent->satcurve, parent->lhskcurve,  parent->clcurve, parent->lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, dummy, dummy);
         parent->ipf.vibrance (labnCrop);
 
-        if (params.fattal.enabled) {
-            parent->ipf.ToneMapFattal02(labnCrop, 3);
-        }
-        
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) ||  (!params.colorappearance.enabled)) {
             parent->ipf.EPDToneMap (labnCrop, 5, skip);
         }
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index e3ff874b6..dffd9572e 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -407,6 +407,15 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
         }
     }
 
+    if ((todo & M_RGBCURVE) && params.fattal.enabled) {
+        Imagefloat *fattalprev = oprevi->copy();
+        ipf.ToneMapFattal02(fattalprev);
+        if (oprevi != orig_prev) {
+            delete oprevi;
+        }
+        oprevi = fattalprev;
+    } 
+
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
         const int W = oprevi->getWidth();
         const int H = oprevi->getHeight();
@@ -636,10 +645,6 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
         ipf.chromiLuminanceCurve (nullptr, pW, nprevl, nprevl, chroma_acurve, chroma_bcurve, satcurve, lhskcurve, clcurve, lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, histCCurve, histLCurve);
         ipf.vibrance (nprevl);
 
-        if (params.fattal.enabled) {
-            ipf.ToneMapFattal02(nprevl, 3);
-        }
-        
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) ||  (!params.colorappearance.enabled)) {
             ipf.EPDToneMap (nprevl, 5, scale);
         }
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index c1e4a1e35..46ad670fa 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -347,7 +347,7 @@ public:
     void Badpixelscam (CieImage * src, CieImage * dst, double radius, int thresh, int mode,  float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom, int hotbad);
     void BadpixelsLab (LabImage * src, LabImage * dst, double radius, int thresh, int mode, float b_l, float t_l, float t_r, float b_r, float skinprot, float chrom);
 
-    void ToneMapFattal02(LabImage *lab, int detail_level);
+    void ToneMapFattal02(Imagefloat *rgb);
     
     Image8*     lab2rgb   (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm);
     Image16*    lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, const procparams::ColorManagementParams &icm, bool bw, GammaValues *ga = nullptr);
diff --git a/rtengine/rtthumbnail.cc b/rtengine/rtthumbnail.cc
index 6cc50a6a8..efe5d7868 100644
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@@ -1102,6 +1102,10 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
         baseImg = trImg;
     }
 
+    if (params.fattal.enabled) {
+        ipf.ToneMapFattal02(baseImg);
+    }
+
     // update blurmap
     SHMap* shmap = nullptr;
 
@@ -1275,10 +1279,6 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
 
     ipf.vibrance (labView);
 
-    if (params.fattal.enabled) {
-        ipf.ToneMapFattal02(labView, 0);
-    }
-
     if ((params.colorappearance.enabled && !params.colorappearance.tonecie) || !params.colorappearance.enabled) {
         ipf.EPDToneMap (labView, 5, 6);
     }
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index de57386dd..c8d45acf8 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -833,6 +833,10 @@ private:
         //ImProcFunctions ipf (&params, true);
         ImProcFunctions &ipf = * (ipf_p.get());
 
+        if (params.fattal.enabled) {
+            ipf.ToneMapFattal02(baseImg);
+        }
+        
         if (params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
             const int W = baseImg->getWidth();
             const int H = baseImg->getHeight();
@@ -1013,10 +1017,6 @@ private:
 
         ipf.chromiLuminanceCurve (nullptr, 1, labView, labView, curve1, curve2, satcurve, lhskcurve, clcurve, lumacurve, utili, autili, butili, ccutili, cclutili, clcutili, dummy, dummy);
 
-        if (params.fattal.enabled) {
-            ipf.ToneMapFattal02(labView, 3);
-        }
-
         if ((params.colorappearance.enabled && !params.colorappearance.tonecie) || (!params.colorappearance.enabled)) {
             ipf.EPDToneMap (labView, 5, 1);
         }
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 9d015dfdb..768a296b2 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1030,7 +1030,9 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
     float col_scale = float(src.getCols())/float(dst.getCols());
     float row_scale = float(src.getRows())/float(dst.getRows());
 
+#ifdef _OPENMP
     #pragma omp parallel for if (multithread)
+#endif
     for (int x = 0; x < dst.getCols(); ++x) {
         for (int y = 0; y < dst.getRows(); ++y) {
             dst(x, y) = get_bilinear_value(src, x * col_scale, y * row_scale);
@@ -1047,17 +1049,17 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
     Array2Df Yr(w, h);
     Array2Df L(w, h);
 
-    rgb->normalizeFloatTo1();
-
+    const float epsilon = 1e-4f;
+    
+#ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
+#endif
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            Yr(x, y) = Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x));
+            Yr(x, y) = std::max(Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x)), epsilon); // clip really black pixels, otherwise it doesn't work at all (not sure why...)
         }
     }
 
-    // float alpha = params->fattal.alpha;
-    // float beta = params->fattal.beta;
     float noise = alpha * 0.01f;
 
     if (settings->verbose) {
@@ -1067,10 +1069,9 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
 
     tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
-    const float epsilon = 1e-4f;
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            float Y = std::max(Yr(x, y), epsilon);
+            float Y = Yr(x, y);
             float l = std::max(L(x, y), epsilon);
             rgb->r(y, x) = std::max(rgb->r(y, x)/Y, 0.f) * l;
             rgb->g(y, x) = std::max(rgb->g(y, x)/Y, 0.f) * l;
@@ -1084,12 +1085,11 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
 } // namespace
 
 
-void ImProcFunctions::ToneMapFattal02(LabImage *lab, int detail_level)
+void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 {
-    Imagefloat tmp(lab->W, lab->H);
-    lab2rgb(*lab, tmp, params->icm.working);
-    tmo_fattal02_RT(&tmp, params->fattal.alpha, params->fattal.beta, detail_level, multiThread);
-    rgb2lab(tmp, *lab, params->icm.working);
+    const int detail_level = 3;
+    tmo_fattal02_RT(rgb, params->fattal.alpha, params->fattal.beta, detail_level, multiThread);
 }
 
+
 } // namespace rtengine

From 9db14cc52101e899dff1c76115b9fa5314e3efa6 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sat, 4 Nov 2017 21:10:26 +0100
Subject: [PATCH 05/39] Fattal: added sanity check for the values of alpha and
 beta

---
 rtengine/tmo_fattal02.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 768a296b2..d0c3ea640 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1043,6 +1043,11 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
 
 void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level, bool multiThread)
 {
+    // sanity check
+    if (alpha <= 0 || beta <= 0) {
+        return;
+    }
+    
     int w = rgb->getWidth();
     int h = rgb->getHeight();
     

From b64707bea63e89ccb9a7abf13c480eeefcfaba79 Mon Sep 17 00:00:00 2001
From: Hombre <natureh.510@gmail.com>
Date: Sat, 4 Nov 2017 21:27:11 +0100
Subject: [PATCH 06/39] Adding ADD/SET mechanism to Fattal HDR tool +
 PartialPaste dialog gui

+ the range of Beta is now limited to 0.7 - 1.3.

The ADD/SET mechanism was also missing for the EPD tool, so it is
included in this commit as well.

see #4168
---
 rtdata/languages/Francais    |  4 ++++
 rtdata/languages/default     |  5 +++--
 rtgui/addsetids.h            |  7 +++++++
 rtgui/batchtoolpanelcoord.cc |  4 ++++
 rtgui/epd.cc                 |  8 ++++++++
 rtgui/epd.h                  |  1 +
 rtgui/fattaltonemap.cc       |  8 +++++++-
 rtgui/fattaltonemap.h        |  2 ++
 rtgui/partialpastedlg.cc     |  8 ++++++++
 rtgui/partialpastedlg.h      |  3 ++-
 rtgui/preferences.cc         | 13 +++++++++++++
 rtgui/toolpanelcoord.cc      |  2 +-
 12 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/rtdata/languages/Francais b/rtdata/languages/Francais
index c39d9f5e5..b5a1434bc 100644
--- a/rtdata/languages/Francais
+++ b/rtdata/languages/Francais
@@ -904,6 +904,7 @@ PARTIALPASTE_SHADOWSHIGHLIGHTS;Ombres/Hautes lumières
 PARTIALPASTE_SHARPENEDGE;Bords
 PARTIALPASTE_SHARPENING;Netteté
 PARTIALPASTE_SHARPENMICRO;Microcontraste
+PARTIALPASTE_TM_FATTAL;Compression tonale HDR (Fattal02)
 PARTIALPASTE_VIBRANCE;Vibrance
 PARTIALPASTE_VIGNETTING;Correction du vignettage
 PARTIALPASTE_WAVELETGROUP;Niveaux d'ondelette
@@ -1892,6 +1893,9 @@ TP_SHARPENMICRO_AMOUNT;Quantité
 TP_SHARPENMICRO_LABEL;Microcontraste
 TP_SHARPENMICRO_MATRIX;Matrice 3×3 au lieu de 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformité
+TP_TM_FATTAL_LABEL;Compression Tonale HDR (Fattal02)
+TP_TM_FATTAL_ALPHA;Seuil (Alpha)
+TP_TM_FATTAL_BETA;Quantité (Beta)
 TP_VIBRANCE_AVOIDCOLORSHIFT;Éviter les dérives de teinte
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;TT
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Tons chair
diff --git a/rtdata/languages/default b/rtdata/languages/default
index 81a9cf231..a9fe2414e 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -928,6 +928,7 @@ PARTIALPASTE_SHADOWSHIGHLIGHTS;Shadows/highlights
 PARTIALPASTE_SHARPENEDGE;Edges
 PARTIALPASTE_SHARPENING;Sharpening (USM/RL)
 PARTIALPASTE_SHARPENMICRO;Microcontrast
+PARTIALPASTE_TM_FATTAL;HDR Tone mapping (Fattal02)
 PARTIALPASTE_VIBRANCE;Vibrance
 PARTIALPASTE_VIGNETTING;Vignetting correction
 PARTIALPASTE_WAVELETGROUP;Wavelet Levels
@@ -1929,8 +1930,8 @@ TP_SHARPENMICRO_LABEL;Microcontrast
 TP_SHARPENMICRO_MATRIX;3×3 matrix instead of 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformity
 TP_TM_FATTAL_LABEL;HDR Tone Mapping (Fattal02)
-TP_TM_FATTAL_ALPHA;Alpha
-TP_TM_FATTAL_BETA;Beta
+TP_TM_FATTAL_ALPHA;Threshold (Alpha)
+TP_TM_FATTAL_BETA;Amount (Beta)
 TP_VIBRANCE_AVOIDCOLORSHIFT;Avoid color shift
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;HH
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Skin-tones
diff --git a/rtgui/addsetids.h b/rtgui/addsetids.h
index 2ee2e6053..07cf47d18 100644
--- a/rtgui/addsetids.h
+++ b/rtgui/addsetids.h
@@ -122,6 +122,13 @@ enum {
     ADDSET_SHARP_EDGETOL,
     ADDSET_SHARP_HALOCTRL,
     ADDSET_RESIZE_SCALE,
+    ADDSET_EPD_STRENGTH,
+    ADDSET_EPD_GAMMA,
+    ADDSET_EPD_EDGESTOPPING,
+    ADDSET_EPD_SCALE,
+    ADDSET_EPD_REWEIGHTINGITERATES,
+    ADDSET_FATTAL_ALPHA,
+    ADDSET_FATTAL_BETA,
 
     ADDSET_PARAM_NUM // THIS IS USED AS A DELIMITER!!
 };
diff --git a/rtgui/batchtoolpanelcoord.cc b/rtgui/batchtoolpanelcoord.cc
index e7672e4c7..a854db612 100644
--- a/rtgui/batchtoolpanelcoord.cc
+++ b/rtgui/batchtoolpanelcoord.cc
@@ -151,6 +151,8 @@ void BatchToolPanelCoordinator::initSession ()
             sharpenEdge->setAdjusterBehavior (false, false);
             sharpenMicro->setAdjusterBehavior (false, false);
             icm->setAdjusterBehavior (false, false);
+            epd->setAdjusterBehavior (false, false, false, false, false);
+            fattal->setAdjusterBehavior (false, false);
 
             chmixer->setAdjusterBehavior (false);
             blackwhite->setAdjusterBehavior (false, false);
@@ -189,6 +191,8 @@ void BatchToolPanelCoordinator::initSession ()
             cacorrection->setAdjusterBehavior (options.baBehav[ADDSET_CA]);
             sharpening->setAdjusterBehavior (options.baBehav[ADDSET_SHARP_RADIUS], options.baBehav[ADDSET_SHARP_AMOUNT], options.baBehav[ADDSET_SHARP_DAMPING], options.baBehav[ADDSET_SHARP_ITER], options.baBehav[ADDSET_SHARP_EDGETOL], options.baBehav[ADDSET_SHARP_HALOCTRL]);
             prsharpening->setAdjusterBehavior (options.baBehav[ADDSET_SHARP_RADIUS], options.baBehav[ADDSET_SHARP_AMOUNT], options.baBehav[ADDSET_SHARP_DAMPING], options.baBehav[ADDSET_SHARP_ITER], options.baBehav[ADDSET_SHARP_EDGETOL], options.baBehav[ADDSET_SHARP_HALOCTRL]);
+            epd->setAdjusterBehavior (options.baBehav[ADDSET_EPD_STRENGTH], options.baBehav[ADDSET_EPD_GAMMA], options.baBehav[ADDSET_EPD_EDGESTOPPING], options.baBehav[ADDSET_EPD_SCALE], options.baBehav[ADDSET_EPD_REWEIGHTINGITERATES]);
+            fattal->setAdjusterBehavior (options.baBehav[ADDSET_FATTAL_ALPHA], options.baBehav[ADDSET_FATTAL_BETA]);
 
             sharpenEdge->setAdjusterBehavior (options.baBehav[ADDSET_SHARPENEDGE_AMOUNT], options.baBehav[ADDSET_SHARPENEDGE_PASS]);
             sharpenMicro->setAdjusterBehavior (options.baBehav[ADDSET_SHARPENMICRO_AMOUNT], options.baBehav[ADDSET_SHARPENMICRO_UNIFORMITY]);
diff --git a/rtgui/epd.cc b/rtgui/epd.cc
index d7848aee0..602585dfb 100644
--- a/rtgui/epd.cc
+++ b/rtgui/epd.cc
@@ -182,3 +182,11 @@ void EdgePreservingDecompositionUI::setBatchMode(bool batchMode)
     reweightingIterates->showEditedCB();
 }
 
+void EdgePreservingDecompositionUI::setAdjusterBehavior (bool stAdd, bool gAdd, bool esAdd, bool scAdd, bool rAdd)
+{
+    strength->setAddMode(stAdd);
+    gamma->setAddMode(gAdd);
+    edgeStopping->setAddMode(esAdd);
+    scale->setAddMode(scAdd);
+    reweightingIterates->setAddMode(rAdd);
+}
diff --git a/rtgui/epd.h b/rtgui/epd.h
index c9fc5d0af..2e18cc0c9 100644
--- a/rtgui/epd.h
+++ b/rtgui/epd.h
@@ -43,6 +43,7 @@ public:
 
     void adjusterChanged (Adjuster* a, double newval);
     void enabledChanged  ();
+    void setAdjusterBehavior (bool stAdd, bool gAdd, bool esAdd, bool scAdd, bool rAdd);
 };
 
 #endif
diff --git a/rtgui/fattaltonemap.cc b/rtgui/fattaltonemap.cc
index f9ef660f4..a1fd3d342 100644
--- a/rtgui/fattaltonemap.cc
+++ b/rtgui/fattaltonemap.cc
@@ -30,7 +30,7 @@ FattalToneMapping::FattalToneMapping(): FoldableToolPanel(this, "fattal", M("TP_
 //    setEnabledTooltipMarkup(M("TP_EPD_TOOLTIP"));
     
     alpha = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_ALPHA"), 0.0, 2.0, 0.01, 1.0));
-    beta = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_BETA"), 0.0, 2.0, 0.01, 1.0));
+    beta = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_BETA"), 0.7, 1.3, 0.01, 1.0));
 
     alpha->setAdjusterListener(this);
     beta->setAdjusterListener(this);
@@ -118,3 +118,9 @@ void FattalToneMapping::setBatchMode(bool batchMode)
     beta->showEditedCB();
 }
 
+void FattalToneMapping::setAdjusterBehavior (bool alphaAdd, bool betaAdd)
+{
+    alpha->setAddMode(alphaAdd);
+    beta->setAddMode(betaAdd);
+}
+
diff --git a/rtgui/fattaltonemap.h b/rtgui/fattaltonemap.h
index 8ee93aa3c..cb3abcfc3 100644
--- a/rtgui/fattaltonemap.h
+++ b/rtgui/fattaltonemap.h
@@ -40,5 +40,7 @@ public:
 
     void adjusterChanged (Adjuster* a, double newval);
     void enabledChanged  ();
+    void setAdjusterBehavior (bool alphaAdd, bool betaAdd);
+
 };
 
diff --git a/rtgui/partialpastedlg.cc b/rtgui/partialpastedlg.cc
index ce26078fc..7d91e0172 100644
--- a/rtgui/partialpastedlg.cc
+++ b/rtgui/partialpastedlg.cc
@@ -51,6 +51,7 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren
     exposure    = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_EXPOSURE")));
     sh          = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_SHADOWSHIGHLIGHTS")));
     epd         = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_EPD")));
+    fattal      = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_TM_FATTAL")));
     retinex     = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_RETINEX")));
     pcvignette  = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_PCVIGNETTE")));
     gradient    = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_GRADIENT")));
@@ -143,6 +144,7 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren
     vboxes[0]->pack_start (*exposure, Gtk::PACK_SHRINK, 2);
     vboxes[0]->pack_start (*sh, Gtk::PACK_SHRINK, 2);
     vboxes[0]->pack_start (*epd, Gtk::PACK_SHRINK, 2);
+    vboxes[0]->pack_start (*fattal, Gtk::PACK_SHRINK, 2);
     vboxes[0]->pack_start (*retinex, Gtk::PACK_SHRINK, 2);
     vboxes[0]->pack_start (*pcvignette, Gtk::PACK_SHRINK, 2);
     vboxes[0]->pack_start (*gradient, Gtk::PACK_SHRINK, 2);
@@ -298,6 +300,7 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren
     exposureConn    = exposure->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
     shConn          = sh->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
     epdConn         = epd->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
+    fattalConn      = fattal->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
     retinexConn     = retinex->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
     pcvignetteConn  = pcvignette->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
     gradientConn    = gradient->signal_toggled().connect (sigc::bind (sigc::mem_fun(*basic, &Gtk::CheckButton::set_inconsistent), true));
@@ -517,6 +520,7 @@ void PartialPasteDlg::basicToggled ()
     exposure->set_active (basic->get_active ());
     sh->set_active (basic->get_active ());
     epd->set_active (basic->get_active ());
+    fattal->set_active (basic->get_active ());
     pcvignette->set_active (basic->get_active ());
     gradient->set_active (basic->get_active ());
     retinex->set_active (basic->get_active ());
@@ -711,6 +715,10 @@ void PartialPasteDlg::applyPaste (rtengine::procparams::ProcParams* dstPP, Param
         filterPE.epd        = falsePE.epd;
     }
 
+    if (!fattal->get_active ()) {
+        filterPE.fattal     = falsePE.fattal;
+    }
+
     if (!retinex->get_active ()) {
         filterPE.retinex        = falsePE.retinex;
     }
diff --git a/rtgui/partialpastedlg.h b/rtgui/partialpastedlg.h
index 8fa6dbd23..baef6b9aa 100644
--- a/rtgui/partialpastedlg.h
+++ b/rtgui/partialpastedlg.h
@@ -46,6 +46,7 @@ public:
     Gtk::CheckButton* exposure;
     Gtk::CheckButton* sh;
     Gtk::CheckButton* epd;
+    Gtk::CheckButton* fattal;
     Gtk::CheckButton* retinex;
     Gtk::CheckButton* pcvignette;
     Gtk::CheckButton* gradient;
@@ -124,7 +125,7 @@ public:
     sigc::connection everythingConn, basicConn, detailConn, colorConn, lensConn, compositionConn, metaConn, rawConn, wavConn;
 
     sigc::connection wbConn, exposureConn, shConn, pcvignetteConn, gradientConn, labcurveConn, colorappearanceConn;
-    sigc::connection sharpenConn, gradsharpenConn, microcontrastConn, impdenConn, dirpyrdenConn, defringeConn, epdConn, dirpyreqConn, waveletConn, retinexConn;
+    sigc::connection sharpenConn, gradsharpenConn, microcontrastConn, impdenConn, dirpyrdenConn, defringeConn, epdConn, fattalConn, dirpyreqConn, waveletConn, retinexConn;
     sigc::connection vibranceConn, chmixerConn, hsveqConn, rgbcurvesConn, chmixerbwConn, colortoningConn, filmSimulationConn;
     sigc::connection distortionConn, cacorrConn, vignettingConn, lcpConn;
     sigc::connection coarserotConn, finerotConn, cropConn, resizeConn, prsharpeningConn, perspectiveConn, commonTransConn;
diff --git a/rtgui/preferences.cc b/rtgui/preferences.cc
index e03542b3e..a2a111707 100644
--- a/rtgui/preferences.cc
+++ b/rtgui/preferences.cc
@@ -187,6 +187,19 @@ Gtk::Widget* Preferences::getBatchProcPanel ()
     appendBehavList (mi, M ("TP_EXPOSURE_CONTRAST"), ADDSET_TC_CONTRAST, false);
     appendBehavList (mi, M ("TP_EXPOSURE_SATURATION"), ADDSET_TC_SATURATION, false);
 
+    mi = behModel->append ();
+    mi->set_value (behavColumns.label, M ("TP_EPD_LABEL"));
+    appendBehavList (mi, M ("TP_EPD_STRENGTH"), ADDSET_EPD_STRENGTH, false);
+    appendBehavList (mi, M ("TP_EPD_GAMMA"), ADDSET_EPD_GAMMA, false);
+    appendBehavList (mi, M ("TP_EPD_EDGESTOPPING"), ADDSET_EPD_EDGESTOPPING, false);
+    appendBehavList (mi, M ("TP_EPD_SCALE"), ADDSET_EPD_SCALE, false);
+    appendBehavList (mi, M ("TP_EPD_REWEIGHTINGITERATES"), ADDSET_EPD_REWEIGHTINGITERATES, false);
+
+    mi = behModel->append ();
+    mi->set_value (behavColumns.label, M ("TP_TM_FATTAL_LABEL"));
+    appendBehavList (mi, M ("TP_TM_FATTAL_ALPHA"), ADDSET_FATTAL_ALPHA, false);
+    appendBehavList (mi, M ("TP_TM_FATTAL_BETA"), ADDSET_FATTAL_BETA, false);
+
     mi = behModel->append ();
     mi->set_value (behavColumns.label, M ("TP_RETINEX_LABEL"));
     appendBehavList (mi, M ("TP_RETINEX_STRENGTH"), ADDSET_RETI_STR, false);
diff --git a/rtgui/toolpanelcoord.cc b/rtgui/toolpanelcoord.cc
index b88827482..9153c6fb4 100644
--- a/rtgui/toolpanelcoord.cc
+++ b/rtgui/toolpanelcoord.cc
@@ -90,7 +90,7 @@ ToolPanelCoordinator::ToolPanelCoordinator (bool batch) : ipc (nullptr), hasChan
     rawexposure         = Gtk::manage (new RAWExposure ());
     bayerrawexposure    = Gtk::manage (new BayerRAWExposure ());
     xtransrawexposure   = Gtk::manage (new XTransRAWExposure ());
-    fattal              = Gtk::manage(new FattalToneMapping());
+    fattal              = Gtk::manage (new FattalToneMapping ());
 
     // So Demosaic, Line noise filter, Green Equilibration, Ca-Correction (garder le nom de section identique!) and Black-Level will be moved in a "Bayer sensor" tool,
     // and a separate Demosaic and Black Level tool will be created in an "X-Trans sensor" tool

From 0a9f382967996fd074f12600218328e01959a56e Mon Sep 17 00:00:00 2001
From: Hombre <natureh.510@gmail.com>
Date: Sat, 4 Nov 2017 23:09:58 +0100
Subject: [PATCH 07/39] Refactored name for Fattal / Alpha & Beta + updated
 ranges (see #4168)

The ranges for Threshold and Amount are now integers from -100 to +100.
If the effective range ever needs to change, adjust it in
ImProcFunctions::ToneMapFattal02.
---
 rtdata/languages/Francais |  7 +++--
 rtdata/languages/default  |  8 ++---
 rtengine/procevents.h     |  4 +--
 rtengine/procparams.cc    | 24 +++++++--------
 rtengine/procparams.h     |  8 ++---
 rtengine/refreshmap.cc    |  4 +--
 rtengine/tmo_fattal02.cc  |  8 ++++-
 rtgui/fattaltonemap.cc    | 62 +++++++++++++++++++--------------------
 rtgui/fattaltonemap.h     |  4 +--
 rtgui/paramsedited.cc     | 16 +++++-----
 rtgui/paramsedited.h      |  4 +--
 11 files changed, 79 insertions(+), 70 deletions(-)

diff --git a/rtdata/languages/Francais b/rtdata/languages/Francais
index b5a1434bc..e90a61b41 100644
--- a/rtdata/languages/Francais
+++ b/rtdata/languages/Francais
@@ -703,6 +703,9 @@ HISTORY_MSG_472;PS - Adoucir les transitions
 HISTORY_MSG_473;PS - Utiliser LMMSE
 HISTORY_MSG_474;PS - Égaliser
 HISTORY_MSG_475;PS - Égaliser par canal
+HISTORY_MSG_488;Compression tonale HDR
+HISTORY_MSG_489;CT HDR - Seuil
+HISTORY_MSG_490;CT HDR - Quantité
 HISTORY_NEWSNAPSHOT;Ajouter
 HISTORY_NEWSNAPSHOT_TOOLTIP;Raccourci: <b>Alt-s</b>
 HISTORY_SNAPSHOT;Capture
@@ -1894,8 +1897,8 @@ TP_SHARPENMICRO_LABEL;Microcontraste
 TP_SHARPENMICRO_MATRIX;Matrice 3×3 au lieu de 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformité
 TP_TM_FATTAL_LABEL;Compression Tonale HDR (Fattal02)
-TP_TM_FATTAL_ALPHA;Seuil (Alpha)
-TP_TM_FATTAL_BETA;Quantité (Beta)
+TP_TM_FATTAL_THRESHOLD;Seuil
+TP_TM_FATTAL_AMOUNT;Quantité
 TP_VIBRANCE_AVOIDCOLORSHIFT;Éviter les dérives de teinte
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;TT
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Tons chair
diff --git a/rtdata/languages/default b/rtdata/languages/default
index a9fe2414e..f7c59e633 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -720,8 +720,8 @@ HISTORY_MSG_485;Lens Correction
 HISTORY_MSG_486;Lens Correction - Camera
 HISTORY_MSG_487;Lens Correction - Lens
 HISTORY_MSG_488;HDR Tone Mapping
-HISTORY_MSG_489;HDR TM - Alpha
-HISTORY_MSG_490;HDR TM - Beta
+HISTORY_MSG_489;HDR TM - Threshold
+HISTORY_MSG_490;HDR TM - Amount
 HISTORY_NEWSNAPSHOT;Add
 HISTORY_NEWSNAPSHOT_TOOLTIP;Shortcut: <b>Alt-s</b>
 HISTORY_SNAPSHOT;Snapshot
@@ -1930,8 +1930,8 @@ TP_SHARPENMICRO_LABEL;Microcontrast
 TP_SHARPENMICRO_MATRIX;3×3 matrix instead of 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformity
 TP_TM_FATTAL_LABEL;HDR Tone Mapping (Fattal02)
-TP_TM_FATTAL_ALPHA;Threshold (Alpha)
-TP_TM_FATTAL_BETA;Amount (Beta)
+TP_TM_FATTAL_THRESHOLD;Threshold (Alpha)
+TP_TM_FATTAL_AMOUNT;Amount (Beta)
 TP_VIBRANCE_AVOIDCOLORSHIFT;Avoid color shift
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;HH
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Skin-tones
diff --git a/rtengine/procevents.h b/rtengine/procevents.h
index 2bd4107ed..3aa5505b5 100644
--- a/rtengine/procevents.h
+++ b/rtengine/procevents.h
@@ -517,8 +517,8 @@ enum ProcEvent {
     EvLensCorrLensfunLens = 486,
     // Fattal tone mapping
     EvTMFattalEnabled = 487,
-    EvTMFattalAlpha = 488,
-    EvTMFattalBeta = 489,
+    EvTMFattalThreshold = 488,
+    EvTMFattalAmount = 489,
 
     NUMOFEVENTS
 
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 9bbed9d4e..7662a9920 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2449,12 +2449,12 @@ int ProcParams::save (const Glib::ustring &fname, const Glib::ustring &fname2, b
             keyFile.set_boolean ("FattalToneMapping", "Enabled", fattal.enabled);
         }
 
-        if (!pedited || pedited->fattal.alpha) {
-            keyFile.set_double ("FattalToneMapping", "Alpha", fattal.alpha);
+        if (!pedited || pedited->fattal.threshold) {
+            keyFile.set_integer ("FattalToneMapping", "Threshold", fattal.threshold);
         }
 
-        if (!pedited || pedited->fattal.beta) {
-            keyFile.set_double ("FattalToneMapping", "Beta", fattal.beta);
+        if (!pedited || pedited->fattal.amount) {
+            keyFile.set_integer ("FattalToneMapping", "Amount", fattal.amount);
         }
         
         /*
@@ -5613,19 +5613,19 @@ int ProcParams::load (const Glib::ustring &fname, ParamsEdited* pedited)
                 }
             }
 
-            if (keyFile.has_key ("FattalToneMapping", "Alpha")) {
-                fattal.alpha = keyFile.get_double ("FattalToneMapping", "Alpha");
+            if (keyFile.has_key ("FattalToneMapping", "Threshold")) {
+                fattal.threshold = keyFile.get_double ("FattalToneMapping", "Threshold");
 
                 if (pedited) {
-                    pedited->fattal.alpha = true;
+                    pedited->fattal.threshold = true;
                 }
             }
 
-            if (keyFile.has_key ("FattalToneMapping", "Beta")) {
-                fattal.beta = keyFile.get_double ("FattalToneMapping", "Beta");
+            if (keyFile.has_key ("FattalToneMapping", "Amount")) {
+                fattal.amount = keyFile.get_double ("FattalToneMapping", "Amount");
 
                 if (pedited) {
-                    pedited->fattal.beta = true;
+                    pedited->fattal.amount = true;
                 }
             }
         }        
@@ -8489,8 +8489,8 @@ bool ProcParams::operator== (const ProcParams& other)
         && epd.scale == other.epd.scale
         && epd.reweightingIterates == other.epd.reweightingIterates
         && fattal.enabled == other.fattal.enabled
-        && fattal.alpha == other.fattal.alpha
-        && fattal.beta == other.fattal.beta
+        && fattal.threshold == other.fattal.threshold
+        && fattal.amount == other.fattal.amount
         && defringe.enabled == other.defringe.enabled
         && defringe.radius == other.defringe.radius
         && defringe.threshold == other.defringe.threshold
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index f2d1c7b00..c69df915b 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -743,8 +743,8 @@ public:
 class FattalToneMappingParams {
 public:
     bool enabled;
-    double alpha;
-    double beta;
+    int threshold;
+    int amount;
     
     FattalToneMappingParams()
     {
@@ -754,8 +754,8 @@ public:
     void setDefaults()
     {
         enabled = false;
-        alpha = 1.0;
-        beta = 1.0;
+        threshold = 0;
+        amount = 0;
     }
 };
 
diff --git a/rtengine/refreshmap.cc b/rtengine/refreshmap.cc
index 95e129ad6..fa6b52c4e 100644
--- a/rtengine/refreshmap.cc
+++ b/rtengine/refreshmap.cc
@@ -515,8 +515,8 @@ int refreshmap[rtengine::NUMOFEVENTS] = {
     DARKFRAME,        // EvLensCorrLensfunCamera
     DARKFRAME,        // EvLensCorrLensfunLens
     RGBCURVE,         // EvTMFattalEnabled
-    RGBCURVE,         // EvTMFattalAlpha
-    RGBCURVE          // EvTMFattalBeta
+    RGBCURVE,         // EvTMFattalThreshold
+    RGBCURVE          // EvTMFattalAmount
 
 };
 
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index d0c3ea640..b44f1c1b6 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1093,7 +1093,13 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
 void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 {
     const int detail_level = 3;
-    tmo_fattal02_RT(rgb, params->fattal.alpha, params->fattal.beta, detail_level, multiThread);
+    double alpha = 1.;
+    if (params->fattal.threshold < 0) {
+        alpha += (params->fattal.threshold * 0.9) / 100.;
+    } else if (params->fattal.threshold > 0) {
+        alpha += params->fattal.threshold / 100.;
+    }
+    tmo_fattal02_RT(rgb, alpha, 1. - (params->fattal.amount * 0.3) / 100., detail_level, multiThread);
 }
 
 
diff --git a/rtgui/fattaltonemap.cc b/rtgui/fattaltonemap.cc
index a1fd3d342..79e545908 100644
--- a/rtgui/fattaltonemap.cc
+++ b/rtgui/fattaltonemap.cc
@@ -28,18 +28,18 @@ FattalToneMapping::FattalToneMapping(): FoldableToolPanel(this, "fattal", M("TP_
 {
 
 //    setEnabledTooltipMarkup(M("TP_EPD_TOOLTIP"));
-    
-    alpha = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_ALPHA"), 0.0, 2.0, 0.01, 1.0));
-    beta = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_BETA"), 0.7, 1.3, 0.01, 1.0));
 
-    alpha->setAdjusterListener(this);
-    beta->setAdjusterListener(this);
+    threshold = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_THRESHOLD"), -100., 100., 1., 0.0));
+    amount = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_AMOUNT"), -100., 100., 1., 0.0));
 
-    alpha->show();
-    beta->show();
+    threshold->setAdjusterListener(this);
+    amount->setAdjusterListener(this);
 
-    pack_start(*alpha);
-    pack_start(*beta);
+    threshold->show();
+    amount->show();
+
+    pack_start(*threshold);
+    pack_start(*amount);
 }
 
 void FattalToneMapping::read(const ProcParams *pp, const ParamsEdited *pedited)
@@ -47,52 +47,52 @@ void FattalToneMapping::read(const ProcParams *pp, const ParamsEdited *pedited)
     disableListener();
 
     if(pedited) {
-        alpha->setEditedState(pedited->fattal.alpha ? Edited : UnEdited);
-        beta->setEditedState(pedited->fattal.beta ? Edited : UnEdited);
+        threshold->setEditedState(pedited->fattal.threshold ? Edited : UnEdited);
+        amount->setEditedState(pedited->fattal.amount ? Edited : UnEdited);
         set_inconsistent(multiImage && !pedited->fattal.enabled);
     }
 
     setEnabled(pp->fattal.enabled);
-    alpha->setValue(pp->fattal.alpha);
-    beta->setValue(pp->fattal.beta);
+    threshold->setValue(pp->fattal.threshold);
+    amount->setValue(pp->fattal.amount);
 
     enableListener();
 }
 
 void FattalToneMapping::write(ProcParams *pp, ParamsEdited *pedited)
 {
-    pp->fattal.alpha = alpha->getValue();
-    pp->fattal.beta = beta->getValue();
+    pp->fattal.threshold = threshold->getValue();
+    pp->fattal.amount = amount->getValue();
     pp->fattal.enabled = getEnabled();
 
     if(pedited) {
-        pedited->fattal.alpha = alpha->getEditedState();
-        pedited->fattal.beta = beta->getEditedState();
+        pedited->fattal.threshold = threshold->getEditedState();
+        pedited->fattal.amount = amount->getEditedState();
         pedited->fattal.enabled = !get_inconsistent();
     }
 }
 
 void FattalToneMapping::setDefaults(const ProcParams *defParams, const ParamsEdited *pedited)
 {
-    alpha->setDefault(defParams->fattal.alpha);
-    beta->setDefault(defParams->fattal.beta);
+    threshold->setDefault(defParams->fattal.threshold);
+    amount->setDefault(defParams->fattal.amount);
 
     if(pedited) {
-        alpha->setDefaultEditedState(pedited->fattal.alpha ? Edited : UnEdited);
-        beta->setDefaultEditedState(pedited->fattal.beta ? Edited : UnEdited);
+        threshold->setDefaultEditedState(pedited->fattal.threshold ? Edited : UnEdited);
+        amount->setDefaultEditedState(pedited->fattal.amount ? Edited : UnEdited);
     } else {
-        alpha->setDefaultEditedState(Irrelevant);
-        beta->setDefaultEditedState(Irrelevant);
+        threshold->setDefaultEditedState(Irrelevant);
+        amount->setDefaultEditedState(Irrelevant);
     }
 }
 
 void FattalToneMapping::adjusterChanged(Adjuster* a, double newval)
 {
     if(listener && getEnabled()) {
-        if(a == alpha) {
-            listener->panelChanged(EvTMFattalAlpha, Glib::ustring::format(std::setw(2), std::fixed, std::setprecision(2), a->getValue()));
-        } else if(a == beta) {
-            listener->panelChanged(EvTMFattalBeta, Glib::ustring::format(std::setw(2), std::fixed, std::setprecision(2), a->getValue()));
+        if(a == threshold) {
+            listener->panelChanged(EvTMFattalThreshold, a->getTextValue());
+        } else if(a == amount) {
+            listener->panelChanged(EvTMFattalAmount, a->getTextValue());
         }
     }
 }
@@ -114,13 +114,13 @@ void FattalToneMapping::setBatchMode(bool batchMode)
 {
     ToolPanel::setBatchMode(batchMode);
 
-    alpha->showEditedCB();
-    beta->showEditedCB();
+    threshold->showEditedCB();
+    amount->showEditedCB();
 }
 
 void FattalToneMapping::setAdjusterBehavior (bool alphaAdd, bool betaAdd)
 {
-    alpha->setAddMode(alphaAdd);
-    beta->setAddMode(betaAdd);
+    threshold->setAddMode(alphaAdd);
+    amount->setAddMode(betaAdd);
 }
 
diff --git a/rtgui/fattaltonemap.h b/rtgui/fattaltonemap.h
index cb3abcfc3..2398970ce 100644
--- a/rtgui/fattaltonemap.h
+++ b/rtgui/fattaltonemap.h
@@ -26,8 +26,8 @@
 class FattalToneMapping: public ToolParamBlock, public AdjusterListener, public FoldableToolPanel
 {
 protected:
-    Adjuster *alpha;
-    Adjuster *beta;
+    Adjuster *threshold;
+    Adjuster *amount;
 
 public:
 
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 17e53f2dd..dc1a65b64 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -265,8 +265,8 @@ void ParamsEdited::set (bool v)
     epd.scale               = v;
     epd.reweightingIterates = v;
     fattal.enabled   = v;
-    fattal.alpha     = v;
-    fattal.beta      = v;
+    fattal.threshold = v;
+    fattal.amount    = v;
     sh.enabled       = v;
     sh.hq            = v;
     sh.highlights    = v;
@@ -808,8 +808,8 @@ void ParamsEdited::initFrom (const std::vector<rtengine::procparams::ProcParams>
         epd.reweightingIterates = epd.reweightingIterates && p.epd.reweightingIterates == other.epd.reweightingIterates;
 
         fattal.enabled = fattal.enabled && p.fattal.enabled == other.fattal.enabled;
-        fattal.alpha = fattal.alpha && p.fattal.alpha == other.fattal.alpha;
-        fattal.beta = fattal.beta && p.fattal.beta == other.fattal.beta;
+        fattal.threshold = fattal.threshold && p.fattal.threshold == other.fattal.threshold;
+        fattal.amount = fattal.amount && p.fattal.amount == other.fattal.amount;
         
         sh.enabled = sh.enabled && p.sh.enabled == other.sh.enabled;
         sh.hq = sh.hq && p.sh.hq == other.sh.hq;
@@ -1982,11 +1982,11 @@ void ParamsEdited::combine (rtengine::procparams::ProcParams& toEdit, const rten
     if (fattal.enabled) {
         toEdit.fattal.enabled = mods.fattal.enabled;
     }
-    if (fattal.alpha) {
-        toEdit.fattal.alpha = mods.fattal.alpha;
+    if (fattal.threshold) {
+        toEdit.fattal.threshold = mods.fattal.threshold;
     }
-    if (fattal.beta) {
-        toEdit.fattal.beta = mods.fattal.beta;
+    if (fattal.amount) {
+        toEdit.fattal.amount = mods.fattal.amount;
     }    
 
     if (sh.enabled) {
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index 552aa8515..4d205b1f5 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -368,8 +368,8 @@ public:
 class FattalToneMappingParamsEdited {
 public:
     bool enabled;
-    bool alpha;
-    bool beta;
+    bool threshold;
+    bool amount;
 };
 
 

From 34ed12c3186bb503247171af11871354e204de28 Mon Sep 17 00:00:00 2001
From: Hombre <natureh.510@gmail.com>
Date: Sun, 5 Nov 2017 00:00:27 +0100
Subject: [PATCH 08/39] HDR Tone compression : Threshold and Amount swapped in
 GUI (#4168)

---
 rtgui/fattaltonemap.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtgui/fattaltonemap.cc b/rtgui/fattaltonemap.cc
index 79e545908..aefe405d5 100644
--- a/rtgui/fattaltonemap.cc
+++ b/rtgui/fattaltonemap.cc
@@ -38,8 +38,8 @@ FattalToneMapping::FattalToneMapping(): FoldableToolPanel(this, "fattal", M("TP_
     threshold->show();
     amount->show();
 
-    pack_start(*threshold);
     pack_start(*amount);
+    pack_start(*threshold);
 }
 
 void FattalToneMapping::read(const ProcParams *pp, const ParamsEdited *pedited)

From ceb3ebf24b60a5a89816d9dd39747e2000654f37 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 13:34:33 +0100
Subject: [PATCH 09/39] Fattal: protect fftwf calls with a mutex (shared also
 with the denoise routine)

Initialization of the fftwMutex now happens in rtengine::init()
---
 rtengine/FTblockDN.cc    | 5 +++--
 rtengine/init.cc         | 2 ++
 rtengine/tmo_fattal02.cc | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 50b178388..fadb74bac 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -71,6 +71,8 @@ namespace rtengine
 
 
 extern const Settings* settings;
+extern MyMutex *fftwMutex;
+
 
 void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width, const int height, const Median medianType, const int iterations, const int numThreads, float **buffer)
 {
@@ -445,8 +447,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
         return;
     }
 
-    static MyMutex FftwMutex;
-    MyMutex::MyLock lock(FftwMutex);
+    MyMutex::MyLock lock(*fftwMutex);
 
     const nrquality nrQuality = (dnparams.smethod == "shal") ? QUALITY_STANDARD : QUALITY_HIGH;//shrink method
     const float qhighFactor = (nrQuality == QUALITY_HIGH) ? 1.f / static_cast<float>( settings->nrhigh) : 1.0f;
diff --git a/rtengine/init.cc b/rtengine/init.cc
index 6c1f4b98a..8d2cf9174 100644
--- a/rtengine/init.cc
+++ b/rtengine/init.cc
@@ -38,6 +38,7 @@ namespace rtengine
 const Settings* settings;
 
 MyMutex* lcmsMutex = nullptr;
+MyMutex *fftwMutex = nullptr;
 
 int init (const Settings* s, Glib::ustring baseDir, Glib::ustring userSettingsDir, bool loadAll)
 {
@@ -101,6 +102,7 @@ int init (const Settings* s, Glib::ustring baseDir, Glib::ustring userSettingsDi
     Color::init ();
     delete lcmsMutex;
     lcmsMutex = new MyMutex;
+    fftwMutex = new MyMutex;
     return 0;
 }
 
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index b44f1c1b6..0b9914e0f 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -74,6 +74,7 @@
 namespace rtengine {
 
 extern const Settings *settings;
+extern MyMutex *fftwMutex;
 
 using namespace std;
 
@@ -629,6 +630,7 @@ void tmo_fattal02(size_t width,
   Array2Df U(width, height);
   // if (fftsolver)
   {
+      MyMutex::MyLock lock(*fftwMutex);
       solve_pde_fft(&DivG, &U, multithread);//, ph);
   }
   // else

From 54783f6e2f6d7118a8afa65d40b72148a0b3dede Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 15:07:03 +0100
Subject: [PATCH 10/39] Fixed bug (uninitialized values) in
 ImProcFunctions::Median_Denoise

The code was not copying border pixels from medianOut to dst, leading to the
use of uninitialized values. This was not visible in RT because Median_Denoise
was always called with src == dst, so not copying the border caused no harm.
---
 rtengine/FTblockDN.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index fadb74bac..7f0147b4c 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -366,9 +366,8 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 #ifdef _OPENMP
         #pragma omp parallel for num_threads(numThreads) if (numThreads>1)
 #endif
-
-        for (int i = border; i < height - border; ++i) {
-            for (int j = border; j < width - border; ++j) {
+        for (int i = 0; i < height; ++i) {
+            for (int j = 0; j < width; ++j) {
                 dst[i][j] = medianOut[i][j];
             }
         }

From d7136fc6684c7d1763191fb33f54b24d22bdbde6 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 15:09:22 +0100
Subject: [PATCH 11/39] Fattal: apply a median filter on luminance on the deep
 shadows, to avoid boosting noise

The preview now also matches the output for noisy images.
---
 rtengine/tmo_fattal02.cc | 90 ++++++++++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 22 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 0b9914e0f..f17371100 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -73,6 +73,10 @@
 
 namespace rtengine {
 
+/******************************************************************************
+ * RT code
+ ******************************************************************************/
+
 extern const Settings *settings;
 extern MyMutex *fftwMutex;
 
@@ -127,6 +131,15 @@ public:
     }
 };
 
+// upper bound on image dimension used in tmo_fattal02 -- see the comment there
+const int RT_dimension_cap = 1920;
+
+void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread);
+
+
+/******************************************************************************
+ * Luminance HDR code (modifications are marked with an RT comment)
+ ******************************************************************************/
 
 void downSample(const Array2Df& A, Array2Df& B)
 {
@@ -402,7 +415,6 @@ void findMaxMinPercentile(const Array2Df& I,
 }
 
 void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread);
-void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread); // RT
 
 void tmo_fattal02(size_t width,
                   size_t height,
@@ -470,9 +482,10 @@ void tmo_fattal02(size_t width,
    * reasonably close to the actual output image. Intuitively, what we do is
    * to put a cap on the dimension of the image processed, so that it is close
    * in size to the typical preview that you will see on a normal consumer
-   * monitor. (That's where the 1920 comes from here.) However, we can't
-   * simply downscale the input Y array and then upscale it on output, because
-   * that would cause a big loss of sharpness (confirmed by testing).
+   * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
+   * However, we can't simply downscale the input Y array and then upscale it
+   * on output, because that would cause a big loss of sharpness (confirmed by
+   * testing).
    * So, we use a different method: we downscale the H array, so that we
    * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
    * we upscale the FI matrix later on, before it gets combined with the
@@ -484,10 +497,9 @@ void tmo_fattal02(size_t width,
    * knows better... also, we use a quite naive bilinear interpolation
    * algorithm (see rescale_bilinear below), which could definitely be
    * improved */
-  const int RT_dimension_cap = 1920;
   int fullwidth = width;
   int fullheight = height;
-  int dim = std::min(width, height);
+  int dim = std::max(width, height);
   Array2Df *fullH = nullptr;
   if (dim > RT_dimension_cap) {
       float s = float(RT_dimension_cap) / float(dim);
@@ -1042,9 +1054,22 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
     }
 }
 
+} // namespace
 
-void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level, bool multiThread)
+
+void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 {
+    const int detail_level = 3;
+
+    float alpha = 1.f;
+    if (params->fattal.threshold < 0) {
+        alpha += (params->fattal.threshold * 0.9f) / 100.f;
+    } else if (params->fattal.threshold > 0) {
+        alpha += params->fattal.threshold / 100.f;
+    }
+
+    float beta = 1.f - (params->fattal.amount * 0.3f) / 100.f;
+    
     // sanity check
     if (alpha <= 0 || beta <= 0) {
         return;
@@ -1067,6 +1092,42 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
         }
     }
 
+    // median filter on the deep shadows, to avoid boosting noise
+    {
+        const float luminance_noise_floor = 65.535f; // 0.1% -- is this ok?
+        
+#ifdef _OPENMP
+        int num_threads = multiThread ? omp_get_max_threads() : 1;
+#else
+        int num_threads = 1;
+#endif
+        Array2Df Yr_med(w, h);
+        float r = float(std::max(w, h)) / float(RT_dimension_cap);
+        Median med;
+        if (r >= 3) {
+            med = Median::TYPE_7X7;
+        } else if (r >= 2) {
+            med = Median::TYPE_5X5_STRONG;
+        } else if (r >= 1) {
+            med = Median::TYPE_5X5_SOFT;
+        } else {
+            med = Median::TYPE_3X3_STRONG;
+        }
+        Median_Denoise(Yr, Yr_med, w, h, med, 1, num_threads);
+
+#ifdef _OPENMP
+        #pragma omp parallel for if (multiThread)
+#endif
+        for (int y = 0; y < h; y++) {
+            for (int x = 0; x < w; x++) {
+                if (Yr(x, y) <= luminance_noise_floor) {
+                    Yr(x, y) = Yr_med(x, y);
+                }
+            }
+        }
+    }
+    
+
     float noise = alpha * 0.01f;
 
     if (settings->verbose) {
@@ -1089,20 +1150,5 @@ void tmo_fattal02_RT(Imagefloat *rgb, float alpha, float beta, int detail_level,
     rgb->normalizeFloatTo65535();
 }
 
-} // namespace
-
-
-void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
-{
-    const int detail_level = 3;
-    double alpha = 1.;
-    if (params->fattal.threshold < 0) {
-        alpha += (params->fattal.threshold * 0.9) / 100.;
-    } else if (params->fattal.threshold > 0) {
-        alpha += params->fattal.threshold / 100.;
-    }
-    tmo_fattal02_RT(rgb, alpha, 1. - (params->fattal.amount * 0.3) / 100., detail_level, multiThread);
-}
-
 
 } // namespace rtengine

From d187c2e20b7f7ad070b674009277a2177ba1dfcb Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 16:05:50 +0100
Subject: [PATCH 12/39] Fattal: raised threshold on black pixels clipping to
 avoid pixel artifacts

---
 rtengine/tmo_fattal02.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index f17371100..9146fb09b 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1082,20 +1082,20 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     Array2Df L(w, h);
 
     const float epsilon = 1e-4f;
+    const float luminance_noise_floor = 65.535f;
+    const float min_luminance = 1.f;
     
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            Yr(x, y) = std::max(Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x)), epsilon); // clip really black pixels, otherwise it doesn't work at all (not sure why...)
+            Yr(x, y) = std::max(Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x)), min_luminance); // clip really black pixels
         }
     }
 
     // median filter on the deep shadows, to avoid boosting noise
     {
-        const float luminance_noise_floor = 65.535f; // 0.1% -- is this ok?
-        
 #ifdef _OPENMP
         int num_threads = multiThread ? omp_get_max_threads() : 1;
 #else

From 59043cc978cb5cdf2d68637388e62eb151a9d083 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 5 Nov 2017 16:25:13 +0100
Subject: [PATCH 13/39] Speedup for log-loop in tmo_fattal02

---
 rtengine/tmo_fattal02.cc | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index f17371100..281ea702b 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -69,8 +69,10 @@
 #include "improcfun.h"
 #include "settings.h"
 #include "iccstore.h"
-
-
+#define BENCHMARK
+#include "StopWatch.h"
+#include "sleef.c"
+#include "opthelper.h"
 namespace rtengine {
 
 /******************************************************************************
@@ -426,6 +428,7 @@ void tmo_fattal02(size_t width,
                   int detail_level,
                   bool multithread)
 {
+    BENCHFUN
 // #ifdef TIMER_PROFILING
 //     msec_timer stop_watch;
 //     stop_watch.start();
@@ -471,10 +474,35 @@ void tmo_fattal02(size_t width,
   }
   Array2Df* H = new Array2Df(width, height);
   //#pragma omp parallel for private(i) shared(H, Y, maxLum)
-  for ( int i=0 ; i<size ; i++ )
+  StopWatch Stop1("logf");
+  float temp = 100.f / maxLum;
+  #pragma omp parallel
   {
-      (*H)(i) = logf( 100.0f* Y(i)/maxLum + 1e-4 );
+#ifdef __SSE2__
+  vfloat epsv = F2V(1e-4);
+  vfloat tempv = F2V(temp);
+#endif
+  #pragma omp for schedule(dynamic,16)
+  for ( size_t i=0 ; i<height ; i++ ) {
+      size_t j = 0;
+#ifdef __SSE2__
+      for(; j < width - 3; j+=4)
+      {
+          STVFU((*H)[i][j], xlogf(tempv * LVFU(Y[i][j]) + epsv));
+      }
+#endif
+      for(; j < width; j++)
+      {
+          (*H)[i][j] = xlogf( temp * Y[i][j] + 1e-4 );
+      }
   }
+  }
+//  #pragma omp parallel for
+//  for ( int i=0 ; i<size ; i++ )
+//  {
+//      (*H)(i) = xlogf( temp * Y(i) + 1e-4 );
+//  }
+  Stop1.stop();
   // ph.setValue(4);
 
   /** RT - this is also here to reduce the dependency of the results on the
@@ -893,6 +921,7 @@ std::vector<double> get_lambda(int n)
 void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
+BENCHFUN
    // ph.setValue(20);
   //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
   int width = F->getCols();
@@ -1059,6 +1088,7 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
 
 void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 {
+    BENCHFUN
     const int detail_level = 3;
 
     float alpha = 1.f;
@@ -1094,6 +1124,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 
     // median filter on the deep shadows, to avoid boosting noise
     {
+        StopWatch Stop1("Median");
         const float luminance_noise_floor = 65.535f; // 0.1% -- is this ok?
         
 #ifdef _OPENMP

From facb37be91d2e780ddf8d9f550e6b71a0c341411 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 5 Nov 2017 16:47:22 +0100
Subject: [PATCH 14/39] Speedup for exp-loop

---
 rtengine/tmo_fattal02.cc | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 64888bdc9..b69d4fcb1 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -215,6 +215,7 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
 
 void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
 {
+    BENCHFUN
   int width = H->getCols();
   int height = H->getRows();
   const int size = width*height;
@@ -256,6 +257,7 @@ void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
 
 float calculateGradients(Array2Df* H, Array2Df* G, int k)
 {
+    BENCHFUN
   const int width = H->getCols();
   const int height = H->getRows();
   const float divider = pow( 2.0f, k+1 );
@@ -331,6 +333,7 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
                        float avgGrad[], int nlevels, int detail_level,
                        float alfa, float beta, float noise)
 {
+    BENCHFUN
     const bool newfattal = true;
     int width = gradients[nlevels-1]->getCols();
     int height = gradients[nlevels-1]->getRows();
@@ -346,6 +349,7 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
         }
     }
 
+StopWatch Stop1("test");
     for ( int k = nlevels-1; k >= 0 ; k-- )
     {
         width = gradients[k]->getCols();
@@ -375,7 +379,6 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
             }
         }
 
-
         // create next level
         if ( k>1 )
         {
@@ -392,6 +395,7 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
             gaussianBlur(*fi[k-1], *fi[k-1]);
         }
     }
+Stop1.stop();
 
     for ( int k=1 ; k<nlevels ; k++ )
     {
@@ -686,11 +690,34 @@ void tmo_fattal02(size_t width,
   //     return;
   // }
 
-  for ( size_t idx = 0 ; idx < height*width; ++idx )
+StopWatch Stope("expf");
+  #pragma omp parallel
   {
-      L(idx) = expf( gamma * U(idx) );
+#ifdef __SSE2__
+  vfloat gammav = F2V(gamma);
+#endif
+  #pragma omp for schedule(dynamic,16)
+  for ( size_t i=0 ; i<height ; i++ ) {
+      size_t j = 0;
+#ifdef __SSE2__
+      for(; j < width - 3; j+=4)
+      {
+          STVFU(L[i][j], xexpf(gammav * LVFU(U[i][j])));
+      }
+#endif
+      for(; j < width; j++)
+      {
+          L[i][j] = xexpf( gamma * U[i][j]);
+      }
   }
   }
+
+//  for ( size_t idx = 0 ; idx < height*width; ++idx )
+//  {
+//      L(idx) = xexpf( gamma * U(idx) );
+//  }
+Stope.stop();
+  }
   // ph.setValue(95);
 
   // remove percentile of min and max values and renormalize

From 1a2b7d2cde1eec44eb8290b0f57882ee1d46c8a3 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 21:32:06 +0100
Subject: [PATCH 15/39] Fattal: extract luminance using the current working
 space matrix, not the sRGB one

Doesn't seem to make any practical difference though...
---
 rtengine/tmo_fattal02.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index b69d4fcb1..32b93d79b 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1110,6 +1110,12 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
     }
 }
 
+
+inline float luminance(float r, float g, float b, TMatrix ws)
+{
+    return r * ws[1][0] + g * ws[1][1] + b * ws[1][2];
+}
+
 } // namespace
 
 
@@ -1141,13 +1147,14 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     const float epsilon = 1e-4f;
     const float luminance_noise_floor = 65.535f;
     const float min_luminance = 1.f;
+    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.working);
     
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            Yr(x, y) = std::max(Color::rgbLuminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x)), min_luminance); // clip really black pixels
+            Yr(x, y) = std::max(luminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x), ws), min_luminance); // clip really black pixels
         }
     }
 

From b640a37d79a065b08261bf85f6094eddd3b6f25a Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sun, 5 Nov 2017 21:49:38 +0100
Subject: [PATCH 16/39] Fattal GUI: set a lower bound of 0 for amount

---
 rtgui/fattaltonemap.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtgui/fattaltonemap.cc b/rtgui/fattaltonemap.cc
index aefe405d5..d14b76004 100644
--- a/rtgui/fattaltonemap.cc
+++ b/rtgui/fattaltonemap.cc
@@ -29,14 +29,14 @@ FattalToneMapping::FattalToneMapping(): FoldableToolPanel(this, "fattal", M("TP_
 
 //    setEnabledTooltipMarkup(M("TP_EPD_TOOLTIP"));
 
+    amount = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_AMOUNT"), 0., 100., 1., 0.0));
     threshold = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_THRESHOLD"), -100., 100., 1., 0.0));
-    amount = Gtk::manage(new Adjuster (M("TP_TM_FATTAL_AMOUNT"), -100., 100., 1., 0.0));
 
-    threshold->setAdjusterListener(this);
     amount->setAdjusterListener(this);
+    threshold->setAdjusterListener(this);
 
-    threshold->show();
     amount->show();
+    threshold->show();
 
     pack_start(*amount);
     pack_start(*threshold);

From ab061283a6a28360a2e630593af7a7278b180db9 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Mon, 6 Nov 2017 01:12:15 +0100
Subject: [PATCH 17/39] fattal, about 100x speedup for findMaxMinPercentile()

---
 rtengine/tmo_fattal02.cc | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index b69d4fcb1..846518df7 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -409,15 +409,35 @@ void findMaxMinPercentile(const Array2Df& I,
                                  float minPrct, float& minLum,
                                  float maxPrct, float& maxLum)
 {
+    BENCHFUN
     const int size = I.getRows() * I.getCols();
     const float* data = I.data();
-    std::vector<float> vI;
 
-    std::copy(data, data + size, std::back_inserter(vI));
-    std::sort(vI.begin(), vI.end());
+    LUTu histo(65535, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    histo.clear();
+#pragma omp parallel
+{
+    LUTu histothr(65535, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    histothr.clear();
+#pragma omp for nowait
+    for(int i = 0; i< size; ++i) {
+        histothr[(unsigned int)(65535.f * data[i])]++;
+    }
+#pragma omp critical
+    histo += histothr;
+}
+    int k = 0;
+    int count = 0;
+    while(count < minPrct*size) {
+        count += histo[k++];
+    }
+    minLum = k /65535.f;
+
+    while(count < maxPrct*size) {
+        count += histo[k++];
+    }
+    maxLum = k /65535.f;
 
-    minLum = vI.at( int(minPrct*vI.size()) );
-    maxLum = vI.at( int(maxPrct*vI.size()) );
 }
 
 void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread);

From d810de4445bc04a62879c76b808931060f958c4e Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Mon, 6 Nov 2017 19:08:30 +0100
Subject: [PATCH 18/39] Double speed for rescale_bilinear

---
 rtengine/tmo_fattal02.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 37e01ea97..71b47cdaf 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1123,8 +1123,8 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
-    for (int x = 0; x < dst.getCols(); ++x) {
-        for (int y = 0; y < dst.getRows(); ++y) {
+    for (int y = 0; y < dst.getRows(); ++y) {
+        for (int x = 0; x < dst.getCols(); ++x) {
             dst(x, y) = get_bilinear_value(src, x * col_scale, y * row_scale);
         }
     }

From 81fbf0c2e15fa1cd02501d52ed48f52e5c43a18d Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Mon, 6 Nov 2017 19:09:50 +0100
Subject: [PATCH 19/39] Applied patch from @agriggio to check multithreading in
 fftw3

---
 CMakeLists.txt | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d99a4feaa..626da7019 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -343,11 +343,30 @@ if(OPTION_OMP)
 endif()
 
 # check for libfftw3f_omp
+include(CheckCSourceCompiles)
 if(OPENMP_FOUND)
-    find_library(fftw3f_omp fftw3f_omp PATHS ${FFTW3F_LIBRARY_DIRS})
-    if(fftw3f_omp)
+    set(CMAKE_REQUIRED_INCLUDES ${FFTW3F_INCLUDE_DIRS})
+    set(CMAKE_REQUIRED_LIBRARIES)
+    foreach(l ${FFTW3F_LIBRARIES})
+        find_library(_f ${l} PATHS ${FFTW3F_LIBRARY_DIRS})
+        set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${_f})
+    endforeach()
+    check_c_source_compiles(
+"#include <fftw3.h>
+int main()
+{
+    fftwf_init_threads();
+    fftwf_plan_with_nthreads(1);
+    return 0;
+}" _fftw3f_multithread)
+    if(_fftw3f_multithread)
         add_definitions(-DRT_FFTW3F_OMP)
-        set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARIES} ${fftw3f_omp})
+    else()
+        find_library(fftw3f_omp fftw3f_omp PATHS ${FFTW3F_LIBRARY_DIRS})
+        if(fftw3f_omp)
+            add_definitions(-DRT_FFTW3F_OMP)
+            set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARIES} ${fftw3f_omp})
+        endif()
     endif()
 endif()
 

From 11f756239b5156c09aa8970f7b2d61aadf12b320 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Mon, 6 Nov 2017 20:53:36 +0100
Subject: [PATCH 20/39] Added comment to findMaxMinPercentile, interpolated
 minLum and maxLum. Removed some stop watches

---
 rtengine/tmo_fattal02.cc | 89 ++++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 39 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 71b47cdaf..9d7d2c6a4 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -215,7 +215,6 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
 
 void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
 {
-    BENCHFUN
   int width = H->getCols();
   int height = H->getRows();
   const int size = width*height;
@@ -257,7 +256,6 @@ void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
 
 float calculateGradients(Array2Df* H, Array2Df* G, int k)
 {
-    BENCHFUN
   const int width = H->getCols();
   const int height = H->getRows();
   const float divider = pow( 2.0f, k+1 );
@@ -333,7 +331,6 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
                        float avgGrad[], int nlevels, int detail_level,
                        float alfa, float beta, float noise)
 {
-    BENCHFUN
     const bool newfattal = true;
     int width = gradients[nlevels-1]->getCols();
     int height = gradients[nlevels-1]->getRows();
@@ -349,7 +346,6 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
         }
     }
 
-StopWatch Stop1("test");
     for ( int k = nlevels-1; k >= 0 ; k-- )
     {
         width = gradients[k]->getCols();
@@ -395,7 +391,6 @@ StopWatch Stop1("test");
             gaussianBlur(*fi[k-1], *fi[k-1]);
         }
     }
-Stop1.stop();
 
     for ( int k=1 ; k<nlevels ; k++ )
     {
@@ -409,34 +404,69 @@ void findMaxMinPercentile(const Array2Df& I,
                                  float minPrct, float& minLum,
                                  float maxPrct, float& maxLum)
 {
-    BENCHFUN
+    assert(minPcrt <= maxPcrt);
+
     const int size = I.getRows() * I.getCols();
     const float* data = I.data();
 
-    LUTu histo(65535, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    // we need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in I
+    // We use a histogram based search for speed and to reduce memory usage
+    // memory usage of this method is 65536 * sizeof(float) * (t + 1) byte, where t is the number of threads
+
+    // We need one global histogram
+    LUTu histo(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
     histo.clear();
+#ifdef _OPENMP
 #pragma omp parallel
+#endif
 {
-    LUTu histothr(65535, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    // We need one histogram per thread
+    LUTu histothr(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
     histothr.clear();
+
+#ifdef _OPENMP
 #pragma omp for nowait
+#endif
     for(int i = 0; i< size; ++i) {
+        // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index
         histothr[(unsigned int)(65535.f * data[i])]++;
     }
+
+#ifdef _OPENMP
 #pragma omp critical
+#endif
+    // add per thread histogram to global histogram
     histo += histothr;
 }
+
     int k = 0;
     int count = 0;
+
+    // find (minPrct*size) smallest value
     while(count < minPrct*size) {
         count += histo[k++];
     }
-    minLum = k /65535.f;
+    if(k > 0) { // interpolate
+        int count_ = count - histo[k - 1];
+        float c0 = count - minPrct * size;
+        float c1 = minPrct * size - count_;
+        minLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
+    } else {
+        minLum = k /65535.f;
+    }
 
+    // find (maxPrct*size) smallest value
     while(count < maxPrct*size) {
         count += histo[k++];
     }
-    maxLum = k /65535.f;
+    if(k > 0) { // interpolate
+        int count_ = count - histo[k - 1];
+        float c0 = count - maxPrct * size;
+        float c1 = maxPrct * size - count_;
+        maxLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
+    } else {
+        maxLum = k /65535.f;
+    }
 
 }
 
@@ -452,7 +482,6 @@ void tmo_fattal02(size_t width,
                   int detail_level,
                   bool multithread)
 {
-    BENCHFUN
 // #ifdef TIMER_PROFILING
 //     msec_timer stop_watch;
 //     stop_watch.start();
@@ -497,37 +526,28 @@ void tmo_fattal02(size_t width,
       maxLum = ( Y(i) > maxLum ) ? Y(i) : maxLum;
   }
   Array2Df* H = new Array2Df(width, height);
-  //#pragma omp parallel for private(i) shared(H, Y, maxLum)
-  StopWatch Stop1("logf");
+
   float temp = 100.f / maxLum;
+  float eps = 1e-4f;
   #pragma omp parallel
   {
 #ifdef __SSE2__
-  vfloat epsv = F2V(1e-4);
+  vfloat epsv = F2V(eps);
   vfloat tempv = F2V(temp);
 #endif
   #pragma omp for schedule(dynamic,16)
-  for ( size_t i=0 ; i<height ; i++ ) {
+  for (size_t i=0 ; i<height ; ++i) {
       size_t j = 0;
 #ifdef __SSE2__
-      for(; j < width - 3; j+=4)
-      {
+      for(; j < width - 3; j+=4) {
           STVFU((*H)[i][j], xlogf(tempv * LVFU(Y[i][j]) + epsv));
       }
 #endif
-      for(; j < width; j++)
-      {
-          (*H)[i][j] = xlogf( temp * Y[i][j] + 1e-4 );
+      for(; j < width; ++j) {
+          (*H)[i][j] = xlogf(temp * Y[i][j] + eps);
       }
   }
   }
-//  #pragma omp parallel for
-//  for ( int i=0 ; i<size ; i++ )
-//  {
-//      (*H)(i) = xlogf( temp * Y(i) + 1e-4 );
-//  }
-  Stop1.stop();
-  // ph.setValue(4);
 
   /** RT - this is also here to reduce the dependency of the results on the
    * input image size, with the primary aim of having a preview in RT that is
@@ -710,33 +730,25 @@ void tmo_fattal02(size_t width,
   //     return;
   // }
 
-StopWatch Stope("expf");
   #pragma omp parallel
   {
 #ifdef __SSE2__
   vfloat gammav = F2V(gamma);
 #endif
   #pragma omp for schedule(dynamic,16)
-  for ( size_t i=0 ; i<height ; i++ ) {
+  for (size_t i=0 ; i<height ; i++) {
       size_t j = 0;
 #ifdef __SSE2__
-      for(; j < width - 3; j+=4)
-      {
+      for(; j < width - 3; j+=4) {
           STVFU(L[i][j], xexpf(gammav * LVFU(U[i][j])));
       }
 #endif
-      for(; j < width; j++)
-      {
+      for(; j < width; j++) {
           L[i][j] = xexpf( gamma * U[i][j]);
       }
   }
   }
 
-//  for ( size_t idx = 0 ; idx < height*width; ++idx )
-//  {
-//      L(idx) = xexpf( gamma * U(idx) );
-//  }
-Stope.stop();
   }
   // ph.setValue(95);
 
@@ -968,7 +980,6 @@ std::vector<double> get_lambda(int n)
 void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
-BENCHFUN
    // ph.setValue(20);
   //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
   int width = F->getCols();

From d37f82498b32f44965176742b00b6bd444387a89 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Mon, 6 Nov 2017 22:48:40 +0100
Subject: [PATCH 21/39] Fattal: fixed typo in assertion, and added one more
 assertion

---
 rtengine/tmo_fattal02.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 9d7d2c6a4..5b2f88107 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -404,7 +404,7 @@ void findMaxMinPercentile(const Array2Df& I,
                                  float minPrct, float& minLum,
                                  float maxPrct, float& maxLum)
 {
-    assert(minPcrt <= maxPcrt);
+    assert(minPrct <= maxPrct);
 
     const int size = I.getRows() * I.getCols();
     const float* data = I.data();
@@ -1232,6 +1232,9 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 
     tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
+#ifdef _OPENMP
+    #pragma omp parallel for if(multiThread)
+#endif
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
             float Y = Yr(x, y);
@@ -1239,6 +1242,10 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
             rgb->r(y, x) = std::max(rgb->r(y, x)/Y, 0.f) * l;
             rgb->g(y, x) = std::max(rgb->g(y, x)/Y, 0.f) * l;
             rgb->b(y, x) = std::max(rgb->b(y, x)/Y, 0.f) * l;
+            
+            assert(std::isfinite(rgb->r(y, x)));
+            assert(std::isfinite(rgb->g(y, x)));
+            assert(std::isfinite(rgb->b(y, x)));
         }
     }
 

From 6e17e38136a7e46e62addb4200af72bdd1437d04 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Mon, 6 Nov 2017 23:00:52 +0100
Subject: [PATCH 22/39] no need to use a temporary image for Fattal in dcrop

---
 rtengine/dcrop.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 8bcd1e6a3..710890b2c 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -715,11 +715,8 @@ void Crop::update (int todo)
         transCrop = nullptr;
     }
 
-    std::unique_ptr<Imagefloat> fattalCrop;
     if ((todo & M_RGBCURVE) && params.fattal.enabled) {
-        fattalCrop.reset(baseCrop->copy());
-        parent->ipf.ToneMapFattal02(fattalCrop.get());
-        baseCrop = fattalCrop.get();
+        parent->ipf.ToneMapFattal02(baseCrop);
     }
     
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {

From 4d010f8decf277c24694d89a26365515e03ea1ce Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Tue, 7 Nov 2017 10:31:03 +0100
Subject: [PATCH 23/39] do use a temporary image for Fattal in dcrop when it is
 needed

Because, sometimes it is needed indeed...
---
 rtengine/dcrop.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 710890b2c..41867d1c9 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -715,8 +715,15 @@ void Crop::update (int todo)
         transCrop = nullptr;
     }
 
+    std::unique_ptr<Imagefloat> fattalCrop;
     if ((todo & M_RGBCURVE) && params.fattal.enabled) {
-        parent->ipf.ToneMapFattal02(baseCrop);
+        Imagefloat *f = baseCrop;
+        if (f == origCrop) {
+            fattalCrop.reset(baseCrop->copy());
+            f = fattalCrop.get();
+        }
+        parent->ipf.ToneMapFattal02(f);
+        baseCrop = f;
     }
     
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {

From 31520c1a740868858048583a16ff0b86eb27cdfa Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Tue, 7 Nov 2017 11:18:07 +0100
Subject: [PATCH 24/39] remove "(Fattal02)" from the name of the HDR Tone
 Mapping tool

---
 rtdata/languages/default | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index f7c59e633..2c89629d1 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -928,7 +928,7 @@ PARTIALPASTE_SHADOWSHIGHLIGHTS;Shadows/highlights
 PARTIALPASTE_SHARPENEDGE;Edges
 PARTIALPASTE_SHARPENING;Sharpening (USM/RL)
 PARTIALPASTE_SHARPENMICRO;Microcontrast
-PARTIALPASTE_TM_FATTAL;HDR Tone mapping (Fattal02)
+PARTIALPASTE_TM_FATTAL;HDR Tone mapping
 PARTIALPASTE_VIBRANCE;Vibrance
 PARTIALPASTE_VIGNETTING;Vignetting correction
 PARTIALPASTE_WAVELETGROUP;Wavelet Levels
@@ -1929,9 +1929,9 @@ TP_SHARPENMICRO_AMOUNT;Quantity
 TP_SHARPENMICRO_LABEL;Microcontrast
 TP_SHARPENMICRO_MATRIX;3×3 matrix instead of 5×5
 TP_SHARPENMICRO_UNIFORMITY;Uniformity
-TP_TM_FATTAL_LABEL;HDR Tone Mapping (Fattal02)
-TP_TM_FATTAL_THRESHOLD;Threshold (Alpha)
-TP_TM_FATTAL_AMOUNT;Amount (Beta)
+TP_TM_FATTAL_LABEL;HDR Tone Mapping
+TP_TM_FATTAL_THRESHOLD;Threshold
+TP_TM_FATTAL_AMOUNT;Amount
 TP_VIBRANCE_AVOIDCOLORSHIFT;Avoid color shift
 TP_VIBRANCE_CURVEEDITOR_SKINTONES;HH
 TP_VIBRANCE_CURVEEDITOR_SKINTONES_LABEL;Skin-tones

From b1938e272caee4ba7575f7fb521ee72790e88025 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Tue, 7 Nov 2017 18:00:00 +0100
Subject: [PATCH 25/39] added a dedicated Median_Denoise function that applies
 the median filter only on dark pixels

This gives a slight performance improvement for Fattal
---
 rtengine/FTblockDN.cc    | 198 ++++++++++++++++++++++++---------------
 rtengine/improcfun.h     |   1 +
 rtengine/tmo_fattal02.cc |  14 +--
 3 files changed, 122 insertions(+), 91 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 7f0147b4c..839b90a47 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -74,8 +74,13 @@ extern const Settings* settings;
 extern MyMutex *fftwMutex;
 
 
-void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width, const int height, const Median medianType, const int iterations, const int numThreads, float **buffer)
+namespace {
+
+template <bool useUpperBound>
+void do_median_denoise(float **src, float **dst, float upperBound, const int width, const int height, const ImProcFunctions::Median medianType, const int iterations, const int numThreads, float **buffer)
 {
+    typedef ImProcFunctions::Median Median;
+    
     int border = 1;
 
     switch (medianType) {
@@ -156,13 +161,17 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
             switch (medianType) {
                 case Median::TYPE_3X3_SOFT: {
                     for (; j < width - border; ++j) {
-                        medianOut[i][j] = median(
-                                              medianIn[i - 1][j],
-                                              medianIn[i][j - 1],
-                                              medianIn[i][j],
-                                              medianIn[i][j + 1],
-                                              medianIn[i + 1][j]
-                                          );
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            medianOut[i][j] = median(
+                                                  medianIn[i - 1][j],
+                                                  medianIn[i][j - 1],
+                                                  medianIn[i][j],
+                                                  medianIn[i][j + 1],
+                                                  medianIn[i + 1][j]
+                                              );
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
+                        }
                     }
 
                     break;
@@ -170,17 +179,21 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 
                 case Median::TYPE_3X3_STRONG: {
                     for (; j < width - border; ++j) {
-                        medianOut[i][j] = median(
-                                              medianIn[i - 1][j - 1],
-                                              medianIn[i - 1][j],
-                                              medianIn[i - 1][j + 1],
-                                              medianIn[i][j - 1],
-                                              medianIn[i][j],
-                                              medianIn[i][j + 1],
-                                              medianIn[i + 1][j - 1],
-                                              medianIn[i + 1][j],
-                                              medianIn[i + 1][j + 1]
-                                          );
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            medianOut[i][j] = median(
+                                                  medianIn[i - 1][j - 1],
+                                                  medianIn[i - 1][j],
+                                                  medianIn[i - 1][j + 1],
+                                                  medianIn[i][j - 1],
+                                                  medianIn[i][j],
+                                                  medianIn[i][j + 1],
+                                                  medianIn[i + 1][j - 1],
+                                                  medianIn[i + 1][j],
+                                                  medianIn[i + 1][j + 1]
+                                              );
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
+                        }
                     }
 
                     break;
@@ -188,21 +201,25 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 
                 case Median::TYPE_5X5_SOFT: {
                     for (; j < width - border; ++j) {
-                        medianOut[i][j] = median(
-                                              medianIn[i - 2][j],
-                                              medianIn[i - 1][j - 1],
-                                              medianIn[i - 1][j],
-                                              medianIn[i - 1][j + 1],
-                                              medianIn[i][j - 2],
-                                              medianIn[i][j - 1],
-                                              medianIn[i][j],
-                                              medianIn[i][j + 1],
-                                              medianIn[i][j + 2],
-                                              medianIn[i + 1][j - 1],
-                                              medianIn[i + 1][j],
-                                              medianIn[i + 1][j + 1],
-                                              medianIn[i + 2][j]
-                                          );
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            medianOut[i][j] = median(
+                                                  medianIn[i - 2][j],
+                                                  medianIn[i - 1][j - 1],
+                                                  medianIn[i - 1][j],
+                                                  medianIn[i - 1][j + 1],
+                                                  medianIn[i][j - 2],
+                                                  medianIn[i][j - 1],
+                                                  medianIn[i][j],
+                                                  medianIn[i][j + 1],
+                                                  medianIn[i][j + 2],
+                                                  medianIn[i + 1][j - 1],
+                                                  medianIn[i + 1][j],
+                                                  medianIn[i + 1][j + 1],
+                                                  medianIn[i + 2][j]
+                                              );
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
+                        }
                     }
 
                     break;
@@ -210,8 +227,7 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 
                 case Median::TYPE_5X5_STRONG: {
 #ifdef __SSE2__
-
-                    for (; j < width - border - 3; j += 4) {
+                    for (; !useUpperBound && j < width - border - 3; j += 4) {
                         STVFU(
                             medianOut[i][j],
                             median(
@@ -245,35 +261,38 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
                     }
 
 #endif
-
                     for (; j < width - border; ++j) {
-                        medianOut[i][j] = median(
-                                              medianIn[i - 2][j - 2],
-                                              medianIn[i - 2][j - 1],
-                                              medianIn[i - 2][j],
-                                              medianIn[i - 2][j + 1],
-                                              medianIn[i - 2][j + 2],
-                                              medianIn[i - 1][j - 2],
-                                              medianIn[i - 1][j - 1],
-                                              medianIn[i - 1][j],
-                                              medianIn[i - 1][j + 1],
-                                              medianIn[i - 1][j + 2],
-                                              medianIn[i][j - 2],
-                                              medianIn[i][j - 1],
-                                              medianIn[i][j],
-                                              medianIn[i][j + 1],
-                                              medianIn[i][j + 2],
-                                              medianIn[i + 1][j - 2],
-                                              medianIn[i + 1][j - 1],
-                                              medianIn[i + 1][j],
-                                              medianIn[i + 1][j + 1],
-                                              medianIn[i + 1][j + 2],
-                                              medianIn[i + 2][j - 2],
-                                              medianIn[i + 2][j - 1],
-                                              medianIn[i + 2][j],
-                                              medianIn[i + 2][j + 1],
-                                              medianIn[i + 2][j + 2]
-                                          );
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            medianOut[i][j] = median(
+                                                  medianIn[i - 2][j - 2],
+                                                  medianIn[i - 2][j - 1],
+                                                  medianIn[i - 2][j],
+                                                  medianIn[i - 2][j + 1],
+                                                  medianIn[i - 2][j + 2],
+                                                  medianIn[i - 1][j - 2],
+                                                  medianIn[i - 1][j - 1],
+                                                  medianIn[i - 1][j],
+                                                  medianIn[i - 1][j + 1],
+                                                  medianIn[i - 1][j + 2],
+                                                  medianIn[i][j - 2],
+                                                  medianIn[i][j - 1],
+                                                  medianIn[i][j],
+                                                  medianIn[i][j + 1],
+                                                  medianIn[i][j + 2],
+                                                  medianIn[i + 1][j - 2],
+                                                  medianIn[i + 1][j - 1],
+                                                  medianIn[i + 1][j],
+                                                  medianIn[i + 1][j + 1],
+                                                  medianIn[i + 1][j + 2],
+                                                  medianIn[i + 2][j - 2],
+                                                  medianIn[i + 2][j - 1],
+                                                  medianIn[i + 2][j],
+                                                  medianIn[i + 2][j + 1],
+                                                  medianIn[i + 2][j + 2]
+                                              );
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
+                        }
                     }
 
                     break;
@@ -283,7 +302,7 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 #ifdef __SSE2__
                     std::array<vfloat, 49> vpp ALIGNED16;
 
-                    for (; j < width - border - 3; j += 4) {
+                    for (; !useUpperBound && j < width - border - 3; j += 4) {
                         for (int kk = 0, ii = -border; ii <= border; ++ii) {
                             for (int jj = -border; jj <= border; ++jj, ++kk) {
                                 vpp[kk] = LVFU(medianIn[i + ii][j + jj]);
@@ -296,15 +315,19 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 #endif
 
                     std::array<float, 49> pp;
-
+                    
                     for (; j < width - border; ++j) {
-                        for (int kk = 0, ii = -border; ii <= border; ++ii) {
-                            for (int jj = -border; jj <= border; ++jj, ++kk) {
-                                pp[kk] = medianIn[i + ii][j + jj];
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            for (int kk = 0, ii = -border; ii <= border; ++ii) {
+                                for (int jj = -border; jj <= border; ++jj, ++kk) {
+                                    pp[kk] = medianIn[i + ii][j + jj];
+                                }
                             }
+                            
+                            medianOut[i][j] = median(pp);
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
                         }
-
-                        medianOut[i][j] = median(pp);
                     }
 
                     break;
@@ -314,7 +337,7 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 #ifdef __SSE2__
                     std::array<vfloat, 81> vpp ALIGNED16;
 
-                    for (; j < width - border - 3; j += 4) {
+                    for (; !useUpperBound && j < width - border - 3; j += 4) {
                         for (int kk = 0, ii = -border; ii <= border; ++ii) {
                             for (int jj = -border; jj <= border; ++jj, ++kk) {
                                 vpp[kk] = LVFU(medianIn[i + ii][j + jj]);
@@ -327,15 +350,19 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
 #endif
 
                     std::array<float, 81> pp;
-
+                    
                     for (; j < width - border; ++j) {
-                        for (int kk = 0, ii = -border; ii <= border; ++ii) {
-                            for (int jj = -border; jj <= border; ++jj, ++kk) {
-                                pp[kk] = medianIn[i + ii][j + jj];
+                        if (!useUpperBound || medianIn[i][j] <= upperBound) {
+                            for (int kk = 0, ii = -border; ii <= border; ++ii) {
+                                for (int jj = -border; jj <= border; ++jj, ++kk) {
+                                    pp[kk] = medianIn[i + ii][j + jj];
+                                }
                             }
+                        
+                            medianOut[i][j] = median(pp);
+                        } else {
+                            medianOut[i][j] = medianIn[i][j];
                         }
-
-                        medianOut[i][j] = median(pp);
                     }
 
                     for (; j < width; ++j) {
@@ -382,6 +409,21 @@ void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width,
     }
 }
 
+} // namespace
+
+
+void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width, const int height, const Median medianType, const int iterations, const int numThreads, float **buffer)
+{ // Overload without a brightness threshold: forwards to do_median_denoise with useUpperBound = false.
+    do_median_denoise<false>(src, dst, 0.f, width, height, medianType, iterations, numThreads, buffer); // upperBound is the 3rd parameter; 0.f is a placeholder that the useUpperBound=false branches never compare against
+}
+
+
+void ImProcFunctions::Median_Denoise(float **src, float **dst, float upperBound, const int width, const int height, const Median medianType, const int iterations, const int numThreads, float **buffer)
+{
+    do_median_denoise<true>(src, dst, upperBound, width, height, medianType, iterations, numThreads, buffer);
+}
+
+
 void ImProcFunctions::Tile_calc(int tilesize, int overlap, int kall, int imwidth, int imheight, int &numtiles_W, int &numtiles_H, int &tilewidth, int &tileheight, int &tileWskip, int &tileHskip)
 
 {
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 46ad670fa..364713fc1 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -305,6 +305,7 @@ public:
 
 
     void Median_Denoise ( float **src, float **dst, int width, int height, Median medianType, int iterations, int numThreads, float **buffer = nullptr);
+    void Median_Denoise ( float **src, float **dst, float upperBound, int width, int height, Median medianType, int iterations, int numThreads, float **buffer = nullptr);
     void RGB_denoise (int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &nresi, float &highresi);
     void RGB_denoise_infoGamCurve (const procparams::DirPyrDenoiseParams & dnparams, const bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope);
     void RGB_denoise_info (Imagefloat * src, Imagefloat * provicalc, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread = false);
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 5b2f88107..1f415737a 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1196,7 +1196,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 #else
         int num_threads = 1;
 #endif
-        Array2Df Yr_med(w, h);
         float r = float(std::max(w, h)) / float(RT_dimension_cap);
         Median med;
         if (r >= 3) {
@@ -1208,18 +1207,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         } else {
             med = Median::TYPE_3X3_STRONG;
         }
-        Median_Denoise(Yr, Yr_med, w, h, med, 1, num_threads);
-
-#ifdef _OPENMP
-        #pragma omp parallel for if (multiThread)
-#endif
-        for (int y = 0; y < h; y++) {
-            for (int x = 0; x < w; x++) {
-                if (Yr(x, y) <= luminance_noise_floor) {
-                    Yr(x, y) = Yr_med(x, y);
-                }
-            }
-        }
+        Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads);
     }
     
 

From b8ff601efa00c1d2adcf9e57f0d00bcb15904150 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Tue, 7 Nov 2017 21:07:29 +0100
Subject: [PATCH 26/39] fixed silly typo in Median_Denoise, leading to segfault

---
 rtengine/FTblockDN.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 839b90a47..a14446358 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -414,7 +414,7 @@ void do_median_denoise(float **src, float **dst, float upperBound, const int wid
 
 void ImProcFunctions::Median_Denoise(float **src, float **dst, const int width, const int height, const Median medianType, const int iterations, const int numThreads, float **buffer)
 {
-    do_median_denoise<false>(src, dst, width, height, 0.f, medianType, iterations, numThreads, buffer);
+    do_median_denoise<false>(src, dst, 0.f, width, height, medianType, iterations, numThreads, buffer);
 }
 
 

From 75405404a91cade0fb9b4d0e2f06596fc663fd44 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Wed, 8 Nov 2017 18:07:03 +0100
Subject: [PATCH 27/39] Fattal: speed up FFT computation by rescaling the
 images to FFTW-friendly dimensions

---
 rtengine/tmo_fattal02.cc | 81 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 1f415737a..939ceeff8 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1142,11 +1142,81 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
 }
 
 
+void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
+{
+    const int width = src.getCols();
+    const int height = src.getRows();
+    const int nw = dst.getCols();
+    const int nh = dst.getRows();
+
+#ifdef _OPENMP
+    #pragma omp parallel for if (multithread)
+#endif
+    for (int y = 0; y < nh; ++y) {
+        int sy = y * height / nh;
+        for (int x = 0; x < nw; ++x) {
+            int sx = x * width / nw;
+            dst(x, y) = src(sx, sy);
+        }
+    }
+}
+
+
 inline float luminance(float r, float g, float b, TMatrix ws)
 {
     return r * ws[1][0] + g * ws[1][1] + b * ws[1][2];
 }
 
+
+inline int round_up_pow2(int dim)
+{
+    // from https://graphics.stanford.edu/~seander/bithacks.html
+    assert(dim > 0);
+    unsigned int v = dim;
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+    return v;
+}
+
+inline int find_fast_dim(int dim)
+{
+    // as per the FFTW docs:
+    //
+    //   FFTW is generally best at handling sizes of the form 2a 3b 5c 7d 11e
+    //   13f, where e+f is either 0 or 1.
+    // 
+    // Here, we try to round up to the nearest dim that can be expressed in
+    // the above form. This is not exhaustive, but should be ok for pictures
+    // up to 100MPix at least
+    int d1 = round_up_pow2(dim);
+    std::vector<int> d = {
+        d1/128 * 65,
+        d1/64 * 33,
+        d1/512 * 273,
+        d1/16 * 9,
+        d1/8 * 5,
+        d1/16 * 11,
+        d1/128 * 91,
+        d1/4 * 3,
+        d1/64 * 49,
+        d1/16 * 13,
+        d1/8 * 7,
+        d1
+    };
+    for (size_t i = 0; i < d.size(); ++i) {
+        if (d[i] >= dim) {
+            return d[i];
+        }
+    }
+    assert(false);
+    return dim;
+}
+
 } // namespace
 
 
@@ -1209,7 +1279,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         }
         Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads);
     }
-    
 
     float noise = alpha * 0.01f;
 
@@ -1218,7 +1287,15 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
+    //tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
+    {
+        int w2 = find_fast_dim(w) + 1;
+        int h2 = find_fast_dim(h) + 1;
+        Array2Df buf(w2, h2);
+        rescale_nearest(Yr, buf, multiThread);
+        tmo_fattal02(w2, h2, buf, buf, alpha, beta, noise, detail_level, multiThread);
+        rescale_nearest(buf, L, multiThread);
+    }
 
 #ifdef _OPENMP
     #pragma omp parallel for if(multiThread)

From 2ff9ca01451d91c9d1ef4f6cd12dbef4ff151749 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 8 Nov 2017 19:01:52 +0100
Subject: [PATCH 28/39] A lot of small speedups for Fattal

---
 rtengine/tmo_fattal02.cc | 215 +++++++++++++++++++--------------------
 1 file changed, 103 insertions(+), 112 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 939ceeff8..851a88884 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -171,8 +171,6 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
     const int width = I.getCols();
     const int height = I.getRows();
 
-    Array2Df T(width,height);
-
     if (width < 3 || height < 3) {
         if (&I != &L) {
             for (int i = 0, n = width*height; i < n; ++i) {
@@ -182,8 +180,10 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
         return;
     }
 
+    Array2Df T(width,height);
+
     //--- X blur
-    //#pragma omp parallel for shared(I, T)
+    #pragma omp parallel for shared(I, T)
     for ( int y=0 ; y<height ; y++ )
     {
         for ( int x=1 ; x<width-1 ; x++ )
@@ -198,8 +198,24 @@ void gaussianBlur(const Array2Df& I, Array2Df& L)
     }
 
     //--- Y blur
-    //#pragma omp parallel for shared(T, L)
-    for ( int x=0 ; x<width ; x++ )
+    #pragma omp parallel for
+    for ( int x=0 ; x<width-7 ; x+=8 )
+    {
+        for ( int y=1 ; y<height-1 ; y++ )
+        {
+            for(int xx = 0; xx < 8; ++xx) {
+                float t = 2.f * T(x+xx,y);
+                t += T(x+xx,y-1);
+                t += T(x+xx,y+1);
+                L(x+xx,y) = t * 0.25f; // t/4.0f;
+            }
+        }
+        for(int xx = 0; xx < 8; ++xx) {
+            L(x+xx,0) = ( 3.f * T(x+xx,0) + T(x+xx,1) ) * 0.25f; // / 4.0f;
+            L(x+xx,height-1) = ( 3.f * T(x+xx,height-1) + T(x+xx,height-2) ) * 0.25f; // / 4.0f;
+        }
+    }
+    for ( int x = width - (width % 8) ; x<width ; x++ )
     {
         for ( int y=1 ; y<height-1 ; y++ )
         {
@@ -244,9 +260,11 @@ void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
           }
       }
 
-    delete L;
-    L = new Array2Df(width,height);
-    gaussianBlur( *pyramids[k], *L );
+    if(k < nlevels -1) {
+        delete L;
+        L = new Array2Df(width,height);
+        gaussianBlur( *pyramids[k], *L );
+    }
   }
 
   delete L;
@@ -261,27 +279,27 @@ float calculateGradients(Array2Df* H, Array2Df* G, int k)
   const float divider = pow( 2.0f, k+1 );
   float avgGrad = 0.0f;
 
-//#pragma omp parallel for shared(G,H) reduction(+:avgGrad)
+#pragma omp parallel for reduction(+:avgGrad)
   for( int y=0 ; y<height ; y++ )
   {
+    int n = (y == 0 ? 0 : y-1);
+    int s = (y+1 == height ? y : y+1);
     for( int x=0 ; x<width ; x++ )
     {
       float gx, gy;
-      int w, n, e, s;
+      int w, e;
       w = (x == 0 ? 0 : x-1);
-      n = (y == 0 ? 0 : y-1);
-      s = (y+1 == height ? y : y+1);
       e = (x+1 == width ? x : x+1);
 
-      gx = ((*H)(w,y)-(*H)(e,y)) / divider;
+      gx = ((*H)(w,y)-(*H)(e,y));
 
-      gy = ((*H)(x,s)-(*H)(x,n)) / divider;
+      gy = ((*H)(x,s)-(*H)(x,n));
       // note this implicitely assumes that H(-1)=H(0)
       // for the fft-pde slover this would need adjustment as H(-1)=H(1)
       // is assumed, which means gx=0.0, gy=0.0 at the boundaries
       // however, the impact is not visible so we ignore this here
 
-      (*G)(x,y) = sqrt(gx*gx+gy*gy);
+      (*G)(x,y) = sqrt(gx*gx+gy*gy) / divider;
       avgGrad += (*G)(x,y);
     }
   }
@@ -357,7 +375,7 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
              || newfattal == false)
         {
             //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
-            //#pragma omp parallel for shared(fi,avgGrad)
+            #pragma omp parallel for shared(fi,avgGrad)
             for ( int y = 0; y < height; y++ )
             {
                 for ( int x = 0; x < width; x++ )
@@ -365,7 +383,7 @@ void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
                     float grad = ((*gradients[k])(x,y) < 1e-4f) ? 1e-4 : (*gradients[k])(x,y);
                     float a = alfa * avgGrad[k];
 
-                    float value = powf((grad+noise)/a, beta - 1.0f);
+                    float value = pow((grad+noise)/a, beta - 1.0f);
 
                     if (newfattal)
                         (*fi[k])(x,y) *= value;
@@ -514,19 +532,18 @@ void tmo_fattal02(size_t width,
         
 
   int size = width*height;
-  // unsigned int x,y;
-  // int i, k;
 
-  // find max & min values, normalize to range 0..100 and take logarithm
+  // find max value, normalize to range 0..100 and take logarithm
   float minLum = Y(0,0);
   float maxLum = Y(0,0);
+
+  #pragma omp parallel for reduction(max:maxLum)
   for ( int i=0 ; i<size ; i++ )
   {
-      minLum = ( Y(i) < minLum ) ? Y(i) : minLum;
-      maxLum = ( Y(i) > maxLum ) ? Y(i) : maxLum;
+      maxLum = std::max(maxLum, Y(i));
   }
-  Array2Df* H = new Array2Df(width, height);
 
+  Array2Df* H = new Array2Df(width, height);
   float temp = 100.f / maxLum;
   float eps = 1e-4f;
   #pragma omp parallel
@@ -652,71 +669,50 @@ void tmo_fattal02(size_t width,
   // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
   // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
   // if (fftsolver)
-    for ( size_t y=0 ; y<height ; y++ )
+#pragma omp parallel for
+    for ( size_t y=0 ; y<height ; y++ ) {
+      // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+      unsigned int yp1 = (y+1 >= height ? height-2 : y+1);
       for ( size_t x=0 ; x<width ; x++ )
       {
         // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
-        unsigned int yp1 = (y+1 >= height ? height-2 : y+1);
         unsigned int xp1 = (x+1 >= width ?  width-2  : x+1);
         // forward differences in H, so need to use between-points approx of FI
-        (*Gx)(x,y) = ((*H)(xp1,y)-(*H)(x,y)) * 0.5*((*FI)(xp1,y)+(*FI)(x,y));
-        (*Gy)(x,y) = ((*H)(x,yp1)-(*H)(x,y)) * 0.5*((*FI)(x,yp1)+(*FI)(x,y));
+        (*Gx)(x,y) = ((*H)(xp1,y)-(*H)(x,y)) * 0.5 * ((*FI)(xp1,y)+(*FI)(x,y));
+        (*Gy)(x,y) = ((*H)(x,yp1)-(*H)(x,y)) * 0.5 * ((*FI)(x,yp1)+(*FI)(x,y));
       }
-  // else
-  //   for ( size_t y=0 ; y<height ; y++ )
-  //     for ( size_t x=0 ; x<width ; x++ )
-  //     {
-  //       int s, e;
-  //       s = (y+1 == height ? y : y+1);
-  //       e = (x+1 == width ? x : x+1);
-
-  //       (*Gx)(x,y) = ((*H)(e,y)-(*H)(x,y)) * (*FI)(x,y);
-  //       (*Gy)(x,y) = ((*H)(x,s)-(*H)(x,y)) * (*FI)(x,y);
-  //     }
+    }
   delete H;
-  delete FI;
-  // ph.setValue(18);
-
-
-//   dumpPFS( "Gx.pfs", Gx, "Y" );
-//   dumpPFS( "Gy.pfs", Gy, "Y" );
 
   // calculate divergence
-  Array2Df DivG(width, height);
+#pragma omp parallel for
   for ( size_t y = 0; y < height; ++y )
   {
       for ( size_t x = 0; x < width; ++x )
       {
-          DivG(x,y) = (*Gx)(x,y) + (*Gy)(x,y);
-          if ( x > 0 ) DivG(x,y) -= (*Gx)(x-1,y);
-          if ( y > 0 ) DivG(x,y) -= (*Gy)(x,y-1);
+          (*FI)(x,y) = (*Gx)(x,y) + (*Gy)(x,y);
+          if ( x > 0 ) (*FI)(x,y) -= (*Gx)(x-1,y);
+          if ( y > 0 ) (*FI)(x,y) -= (*Gy)(x,y-1);
 
           // if (fftsolver)
           {
-              if (x==0) DivG(x,y) += (*Gx)(x,y);
-              if (y==0) DivG(x,y) += (*Gy)(x,y);
+              if (x==0) (*FI)(x,y) += (*Gx)(x,y);
+              if (y==0) (*FI)(x,y) += (*Gy)(x,y);
           }
 
       }
   }
   delete Gx;
   delete Gy;
-  // ph.setValue(20);
-  // if (ph.canceled())
-  // {
-  //     return;
-  // }
-
-//  dumpPFS( "DivG.pfs", DivG, "Y" );
 
   // solve pde and exponentiate (ie recover compressed image)
   {
-  Array2Df U(width, height);
   // if (fftsolver)
   {
       MyMutex::MyLock lock(*fftwMutex);
-      solve_pde_fft(&DivG, &U, multithread);//, ph);
+      solve_pde_fft(FI, &L, multithread);//, ph);
   }
+  delete FI;
   // else
   // {
   //     solve_pde_multigrid(&DivG, &U, ph);
@@ -729,7 +725,6 @@ void tmo_fattal02(size_t width,
   // {
   //     return;
   // }
-
   #pragma omp parallel
   {
 #ifdef __SSE2__
@@ -740,15 +735,14 @@ void tmo_fattal02(size_t width,
       size_t j = 0;
 #ifdef __SSE2__
       for(; j < width - 3; j+=4) {
-          STVFU(L[i][j], xexpf(gammav * LVFU(U[i][j])));
+          STVFU(L[i][j], xexpf(gammav * LVFU(L[i][j])));
       }
 #endif
       for(; j < width; j++) {
-          L[i][j] = xexpf( gamma * U[i][j]);
+          L[i][j] = xexpf( gamma * L[i][j]);
       }
   }
   }
-
   }
   // ph.setValue(95);
 
@@ -757,22 +751,15 @@ void tmo_fattal02(size_t width,
   float cut_max = 1.0f - 0.01f * white_point;
   assert(cut_min>=0.0f && (cut_max<=1.0f) && (cut_min<cut_max));
   findMaxMinPercentile(L, cut_min, minLum, cut_max, maxLum);
-  for ( size_t idx = 0; idx < height*width; ++idx )
-  {
-      L(idx) = (L(idx) - minLum) / (maxLum - minLum);
-      if ( L(idx) <= 0.0f )
-      {
-          L(idx) = 0.0;
-      }
-      // note, we intentionally do not cut off values > 1.0
-  }
-// #ifdef TIMER_PROFILING
-//     stop_watch.stop_and_update();
-//     cout << endl;
-//     cout << "tmo_fattal02 = " << stop_watch.get_time() << " msec" << endl;
-// #endif
+  float dividor = (maxLum - minLum);
 
-  // ph.setValue(96);
+#pragma omp parallel for
+  for (size_t i = 0; i < height; ++i) {
+    for (size_t j = 0; j < width; ++j) {
+        L[i][j] = std::max((L[i][j] - minLum) / dividor, 0.f);
+        // note, we intentionally do not cut off values > 1.0
+    }
+  }
 }
 
 
@@ -856,6 +843,7 @@ void transform_ev2normal(Array2Df *A, Array2Df *T)
 
   // the discrete cosine transform is not exactly the transform needed
   // need to scale input values to get the right transformation
+  #pragma omp parallel for
   for(int y=1 ; y<height-1 ; y++ )
     for(int x=1 ; x<width-1 ; x++ )
       (*A)(x,y)*=0.25f;
@@ -904,10 +892,11 @@ void transform_normal2ev(Array2Df *A, Array2Df *T)
   fftwf_destroy_plan(p);
 
   // need to scale the output matrix to get the right transform
+  float factor = (1.0f/((height-1)*(width-1)));
+#pragma omp parallel for
   for(int y=0 ; y<height ; y++ )
     for(int x=0 ; x<width ; x++ )
-      (*T)(x,y)*=(1.0f/((height-1)*(width-1)));
-
+      (*T)(x,y)*= factor;
   for(int x=0 ; x<width ; x++ )
   {
     (*T)(x,0)*=0.5f;
@@ -1024,28 +1013,24 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
 
   // in the eigenvector space the solution is very simple
   //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
-  Array2Df* U_tr = new Array2Df(width,height);
+//  Array2Df* U_tr = new Array2Df(width,height);
   std::vector<double> l1=get_lambda(height);
   std::vector<double> l2=get_lambda(width);
+
+#pragma omp parallel for
   for(int y=0 ; y<height ; y++ )
   {
     for(int x=0 ; x<width ; x++ )
     {
-      if(x==0 && y==0)
-        (*U_tr)(x,y)=0.0; // any value ok, only adds a const to the solution
-      else
-        (*U_tr)(x,y)=(*F_tr)(x,y)/(l1[y]+l2[x]);
+        (*F_tr)(x,y)=(*F_tr)(x,y)/(l1[y]+l2[x]);
     }
   }
-  delete F_tr;    // no longer needed so release memory
-  // ph.setValue(55);
-
+  (*F_tr)(0,0)=0.f; // any value ok, only adds a const to the solution
 
   // transforms U_tr back to the normal space
   //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
-  transform_ev2normal(U_tr, U);
-  delete U_tr;    // no longer needed so release memory
-  // ph.setValue(85);
+  transform_ev2normal(F_tr, U);
+  delete F_tr;    // no longer needed so release memory
 
   // the solution U as calculated will satisfy something like int U = 0
   // since for any constant c, U-c is also a solution and we are mainly
@@ -1053,14 +1038,16 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
   // a solution which has no positive values: U_new(x,y)=U(x,y)-max
   // (not really needed but good for numerics as we later take exp(U))
   //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
-  double max=0.0;
-  for(int i=0; i<width*height; i++)
-    if(max<(*U)(i))
-      max=(*U)(i);
+  float max=0.f;
+  #pragma omp parallel for reduction(max:max)
+  for(int i=0; i<width*height; i++) {
+    max = std::max(max, (*U)(i));
+  }
 
-  for(int i=0; i<width*height; i++)
+  #pragma omp parallel for
+  for(int i=0; i<width*height; i++) {
     (*U)(i)-=max;
-
+  }
 
   // fft parallel threads cleanup, better handled outside this function?
 #ifdef RT_FFTW3F_OMP
@@ -1135,13 +1122,13 @@ void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
     #pragma omp parallel for if (multithread)
 #endif
     for (int y = 0; y < dst.getRows(); ++y) {
+        float ymrs = y * row_scale;
         for (int x = 0; x < dst.getCols(); ++x) {
-            dst(x, y) = get_bilinear_value(src, x * col_scale, y * row_scale);
+            dst(x, y) = get_bilinear_value(src, x * col_scale, ymrs);
         }
     }
 }
 
-
 void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
 {
     const int width = src.getCols();
@@ -1189,10 +1176,11 @@ inline int find_fast_dim(int dim)
     //
     //   FFTW is generally best at handling sizes of the form 2a 3b 5c 7d 11e
     //   13f, where e+f is either 0 or 1.
-    // 
+    //
     // Here, we try to round up to the nearest dim that can be expressed in
     // the above form. This is not exhaustive, but should be ok for pictures
     // up to 100MPix at least
+
     int d1 = round_up_pow2(dim);
     std::vector<int> d = {
         d1/128 * 65,
@@ -1238,18 +1226,17 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     if (alpha <= 0 || beta <= 0) {
         return;
     }
-    
+
     int w = rgb->getWidth();
     int h = rgb->getHeight();
     
     Array2Df Yr(w, h);
-    Array2Df L(w, h);
 
     const float epsilon = 1e-4f;
     const float luminance_noise_floor = 65.535f;
     const float min_luminance = 1.f;
     TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.working);
-    
+
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
@@ -1258,7 +1245,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
             Yr(x, y) = std::max(luminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x), ws), min_luminance); // clip really black pixels
         }
     }
-
     // median filter on the deep shadows, to avoid boosting noise
     {
 #ifdef _OPENMP
@@ -1280,6 +1266,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads);
     }
 
+
     float noise = alpha * 0.01f;
 
     if (settings->verbose) {
@@ -1287,34 +1274,38 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    //tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
-    {
         int w2 = find_fast_dim(w) + 1;
         int h2 = find_fast_dim(h) + 1;
         Array2Df buf(w2, h2);
         rescale_nearest(Yr, buf, multiThread);
         tmo_fattal02(w2, h2, buf, buf, alpha, beta, noise, detail_level, multiThread);
+        Array2Df L(w, h);
         rescale_nearest(buf, L, multiThread);
-    }
 
+//    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
+
+StopWatch Stopx("second Last part");
 #ifdef _OPENMP
     #pragma omp parallel for if(multiThread)
 #endif
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
             float Y = Yr(x, y);
-            float l = std::max(L(x, y), epsilon);
-            rgb->r(y, x) = std::max(rgb->r(y, x)/Y, 0.f) * l;
-            rgb->g(y, x) = std::max(rgb->g(y, x)/Y, 0.f) * l;
-            rgb->b(y, x) = std::max(rgb->b(y, x)/Y, 0.f) * l;
+            float l = std::max(L(x, y), epsilon) * (65535.f / Y);
+            rgb->r(y, x) = std::max(rgb->r(y, x), 0.f) * l;
+            rgb->g(y, x) = std::max(rgb->g(y, x), 0.f) * l;
+            rgb->b(y, x) = std::max(rgb->b(y, x), 0.f) * l;
             
             assert(std::isfinite(rgb->r(y, x)));
             assert(std::isfinite(rgb->g(y, x)));
             assert(std::isfinite(rgb->b(y, x)));
         }
     }
+Stopx.stop();
+StopWatch Stopy("Last part");
 
-    rgb->normalizeFloatTo65535();
+//    rgb->normalizeFloatTo65535();
+Stopy.stop();
 }
 
 

From cb735125a266905f52083280960d4dfe7e2b2b8d Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 8 Nov 2017 19:09:06 +0100
Subject: [PATCH 29/39] Removed two stopwatches

---
 rtengine/tmo_fattal02.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index 851a88884..f8e647802 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1284,7 +1284,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 
 //    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
-StopWatch Stopx("second Last part");
 #ifdef _OPENMP
     #pragma omp parallel for if(multiThread)
 #endif
@@ -1301,11 +1300,6 @@ StopWatch Stopx("second Last part");
             assert(std::isfinite(rgb->b(y, x)));
         }
     }
-Stopx.stop();
-StopWatch Stopy("Last part");
-
-//    rgb->normalizeFloatTo65535();
-Stopy.stop();
 }
 
 

From b0ebcc30eb2912116f6522d9126048a3d8128483 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Wed, 8 Nov 2017 21:49:50 +0100
Subject: [PATCH 30/39] Fattal: reduced number of temporary buffers needed

---
 rtengine/tmo_fattal02.cc | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index f8e647802..c1b0a4a5f 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -488,7 +488,7 @@ void findMaxMinPercentile(const Array2Df& I,
 
 }
 
-void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread);
+void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
 
 void tmo_fattal02(size_t width,
                   size_t height,
@@ -625,7 +625,9 @@ void tmo_fattal02(size_t width,
   {
     gradients[k] = new Array2Df(pyramids[k]->getCols(), pyramids[k]->getRows());
     avgGrad[k] = calculateGradients(pyramids[k],gradients[k], k);
+    delete pyramids[k];
   }
+  delete[] pyramids;
   // ph.setValue(12);
 
   // calculate fi matrix
@@ -634,10 +636,8 @@ void tmo_fattal02(size_t width,
 //  dumpPFS( "FI.pfs", FI, "Y" );
   for ( int i=0 ; i<nlevels ; i++ )
   {
-    delete pyramids[i];
     delete gradients[i];
   }
-  delete[] pyramids;
   delete[] gradients;
   delete[] avgGrad;
   // ph.setValue(16);
@@ -702,7 +702,7 @@ void tmo_fattal02(size_t width,
 
       }
   }
-  delete Gx;
+  //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
   delete Gy;
 
   // solve pde and exponentiate (ie recover compressed image)
@@ -710,8 +710,9 @@ void tmo_fattal02(size_t width,
   // if (fftsolver)
   {
       MyMutex::MyLock lock(*fftwMutex);
-      solve_pde_fft(FI, &L, multithread);//, ph);
+      solve_pde_fft(FI, &L, Gx, multithread);//, ph);
   }
+  delete Gx;
   delete FI;
   // else
   // {
@@ -966,7 +967,7 @@ std::vector<double> get_lambda(int n)
 // not modified and the equation might not have a solution but an
 // approximate solution with a minimum error is then calculated
 // double precision version
-void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress &ph,
+void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
    // ph.setValue(20);
@@ -974,6 +975,7 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
   int width = F->getCols();
   int height = F->getRows();
   assert((int)U->getCols()==width && (int)U->getRows()==height);
+  assert(buf->getCols()==width && buf->getRows()==height);
 
   // activate parallel execution of fft routines
 #ifdef RT_FFTW3F_OMP
@@ -997,7 +999,7 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
 
   // transforms F into eigenvector space: Ftr =
   //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
-  Array2Df* F_tr = new Array2Df(width,height);
+  Array2Df* F_tr = buf; //new Array2Df(width,height);
   transform_normal2ev(F, F_tr);
   // TODO: F no longer needed so could release memory, but as it is an
   // input parameter we won't do that
@@ -1030,7 +1032,7 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, bool multithread)/*, pfs::Progress
   // transforms U_tr back to the normal space
   //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
   transform_ev2normal(F_tr, U);
-  delete F_tr;    // no longer needed so release memory
+//  delete F_tr;    // no longer needed so release memory
 
   // the solution U as calculated will satisfy something like int U = 0
   // since for any constant c, U-c is also a solution and we are mainly
@@ -1266,7 +1268,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads);
     }
 
-
     float noise = alpha * 0.01f;
 
     if (settings->verbose) {
@@ -1274,13 +1275,11 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-        int w2 = find_fast_dim(w) + 1;
-        int h2 = find_fast_dim(h) + 1;
-        Array2Df buf(w2, h2);
-        rescale_nearest(Yr, buf, multiThread);
-        tmo_fattal02(w2, h2, buf, buf, alpha, beta, noise, detail_level, multiThread);
-        Array2Df L(w, h);
-        rescale_nearest(buf, L, multiThread);
+    int w2 = find_fast_dim(w) + 1;
+    int h2 = find_fast_dim(h) + 1;
+    Array2Df L(w2, h2);
+    rescale_nearest(Yr, L, multiThread);
+    tmo_fattal02(w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
 
 //    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
@@ -1288,9 +1287,11 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     #pragma omp parallel for if(multiThread)
 #endif
     for (int y = 0; y < h; y++) {
+        int yy = y * h2 / h;
         for (int x = 0; x < w; x++) {
+            int xx = x * w2 / w;
             float Y = Yr(x, y);
-            float l = std::max(L(x, y), epsilon) * (65535.f / Y);
+            float l = std::max(L(xx, yy), epsilon) * (65535.f / Y);
             rgb->r(y, x) = std::max(rgb->r(y, x), 0.f) * l;
             rgb->g(y, x) = std::max(rgb->g(y, x), 0.f) * l;
             rgb->b(y, x) = std::max(rgb->b(y, x), 0.f) * l;

From 67b6aec64eaf4a7ec270b6d736ad60471e65fdcd Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 9 Nov 2017 13:06:12 +0100
Subject: [PATCH 31/39] Fattal: one less buffer alloc/dealloc

---
 rtengine/FTblockDN.cc    | 2 +-
 rtengine/tmo_fattal02.cc | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index a14446358..6cb503f6f 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -117,7 +117,7 @@ void do_median_denoise(float **src, float **dst, float upperBound, const int wid
 
     // we need a buffer if src == dst or if (src != dst && iterations > 1)
     if (src == dst || iterations > 1) {
-        if (buffer == nullptr) { // we didn't get a buufer => create one
+        if (buffer == nullptr) { // we didn't get a buffer => create one
             allocBuffer = new float*[height];
 
             for (int i = 0; i < height; ++i) {
diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index c1b0a4a5f..d537cce3f 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -1248,6 +1248,10 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         }
     }
     // median filter on the deep shadows, to avoid boosting noise
+    // because w2 >= w and h2 >= h, we can use the L buffer as temporary buffer for Median_Denoise()
+    int w2 = find_fast_dim(w) + 1;
+    int h2 = find_fast_dim(h) + 1;
+    Array2Df L(w2, h2);
     {
 #ifdef _OPENMP
         int num_threads = multiThread ? omp_get_max_threads() : 1;
@@ -1265,7 +1269,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         } else {
             med = Median::TYPE_3X3_STRONG;
         }
-        Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads);
+        Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
     }
 
     float noise = alpha * 0.01f;
@@ -1275,9 +1279,6 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    int w2 = find_fast_dim(w) + 1;
-    int h2 = find_fast_dim(h) + 1;
-    Array2Df L(w2, h2);
     rescale_nearest(Yr, L, multiThread);
     tmo_fattal02(w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
 

From 2cdc5fc69ac281978da665e100abbfda6b6044ee Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sat, 11 Nov 2017 16:18:38 +0100
Subject: [PATCH 32/39] move Fattal before transform

---
 rtengine/dcrop.cc             | 83 ++++++++++++++++++++++++++---------
 rtengine/improccoordinator.cc | 47 ++++++++++----------
 rtengine/rtthumbnail.cc       |  8 ++--
 rtengine/simpleprocess.cc     |  8 ++--
 4 files changed, 93 insertions(+), 53 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 41867d1c9..62f755aee 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -690,14 +690,46 @@ void Crop::update (int todo)
     // has to be called after setCropSizes! Tools prior to this point can't handle the Edit mechanism, but that shouldn't be a problem.
     createBuffer (cropw, croph);
 
+    std::unique_ptr<Imagefloat> fattalCrop;
+    bool need_cropping = false;
+    bool has_fattal = false;
+    
+    if ((todo & (M_TRANSFORM | M_RGBCURVE)) && params.fattal.enabled) {
+        has_fattal = true;
+        Imagefloat *f = baseCrop;
+        if (f == origCrop) {
+            fattalCrop.reset(baseCrop->copy());
+            f = fattalCrop.get();
+        }
+        parent->ipf.ToneMapFattal02(f);
+        need_cropping = (cropx || cropy || trafw != cropw || trafh != croph);
+        baseCrop = f;
+    }
+    
     // transform
     if (needstransform || ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled)) {
+        int tx = cropx;
+        int ty = cropy;
+        int tw = cropw;
+        int th = croph;
+        
+        if (has_fattal) {
+            tx = 0;
+            ty = 0;
+            tw = trafw;
+            th = trafh;
+            if (transCrop) {
+                delete transCrop;
+                transCrop = nullptr;
+            }
+        }
+        
         if (!transCrop) {
-            transCrop = new Imagefloat (cropw, croph);
+            transCrop = new Imagefloat (tw, th);
         }
 
         if (needstransform)
-            parent->ipf.transform (baseCrop, transCrop, cropx / skip, cropy / skip, trafx / skip, trafy / skip, skips (parent->fw, skip), skips (parent->fh, skip), parent->getFullWidth(), parent->getFullHeight(),
+            parent->ipf.transform (baseCrop, transCrop, tx / skip, ty / skip, trafx / skip, trafy / skip, skips (parent->fw, skip), skips (parent->fh, skip), parent->getFullWidth(), parent->getFullHeight(),
                                    parent->imgsrc->getMetaData(),
                                    parent->imgsrc->getRotateDegree(), false);
         else {
@@ -715,17 +747,28 @@ void Crop::update (int todo)
         transCrop = nullptr;
     }
 
-    std::unique_ptr<Imagefloat> fattalCrop;
-    if ((todo & M_RGBCURVE) && params.fattal.enabled) {
-        Imagefloat *f = baseCrop;
-        if (f == origCrop) {
-            fattalCrop.reset(baseCrop->copy());
-            f = fattalCrop.get();
+    if (need_cropping) {
+        Imagefloat *c = new Imagefloat(cropw, croph);
+
+        int oy = skips(cropy, skip);
+        int ox = skips(cropx, skip);
+#ifdef _OPENMP
+        #pragma omp parallel for
+#endif
+        for (int y = 0; y < croph; ++y) {
+            int cy = y + oy;
+            for (int x = 0; x < cropw; ++x) {
+                int cx = x + ox;
+                c->r(y, x) = baseCrop->r(cy, cx);
+                c->g(y, x) = baseCrop->g(cy, cx);
+                c->b(y, x) = baseCrop->b(cy, cx);
+            }
         }
-        parent->ipf.ToneMapFattal02(f);
-        baseCrop = f;
+        fattalCrop.reset(c);
+        baseCrop = c;
     }
     
+
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
 
         const int W = baseCrop->getWidth();
@@ -1156,13 +1199,6 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
     orw = bw;
     orh = bh;
 
-    if (check_need_full_image(parent->params)) {
-        orx = bx1 = 0;
-        ory = by1 = 0;
-        orw = bw = parent->fullw;
-        orh = bh = parent->fullh;
-    }
-    
     ProcParams& params = parent->params;
 
     parent->ipf.transCoord (parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
@@ -1202,6 +1238,16 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
         orh = min (y2 - y1, parent->fh - ory);
     }
 
+    leftBorder  = skips (rqx1 - bx1, skip);
+    upperBorder = skips (rqy1 - by1, skip);
+
+    if (check_need_full_image(parent->params)) {
+        orx = 0;
+        ory = 0;
+        orw = parent->fullw;
+        orh = parent->fullh;
+    }    
+    
     PreviewProps cp (orx, ory, orw, orh, skip);
     int orW, orH;
     parent->imgsrc->getSize (cp, orW, orH);
@@ -1212,9 +1258,6 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
     int cw = skips (bw, skip);
     int ch = skips (bh, skip);
 
-    leftBorder  = skips (rqx1 - bx1, skip);
-    upperBorder = skips (rqy1 - by1, skip);
-
     if (settings->verbose) {
         printf ("setsizes starts (%d, %d, %d, %d, %d, %d)\n", orW, orH, trafw, trafh, cw, ch);
     }
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index dffd9572e..f456cc4c6 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -385,36 +385,33 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
 
     readyphase++;
 
-    progress ("Rotate / Distortion...", 100 * readyphase / numofphases);
-    // Remove transformation if unneeded
-    bool needstransform = ipf.needsTransform();
-
-    if (!needstransform && ! ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) && orig_prev != oprevi) {
-        delete oprevi;
-        oprevi = orig_prev;
-    }
-
-    if ((needstransform || ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled)) ) {
-        if (!oprevi || oprevi == orig_prev) {
-            oprevi = new Imagefloat (pW, pH);
-        }
-
-        if (needstransform)
-            ipf.transform (orig_prev, oprevi, 0, 0, 0, 0, pW, pH, fw, fh, 
-                           imgsrc->getMetaData(), imgsrc->getRotateDegree(), false);
-        else {
-            orig_prev->copyData (oprevi);
-        }
-    }
-
-    if ((todo & M_RGBCURVE) && params.fattal.enabled) {
-        Imagefloat *fattalprev = oprevi->copy();
+    if ((todo & (M_TRANSFORM | M_RGBCURVE)) && params.fattal.enabled) {
+        Imagefloat *fattalprev = orig_prev->copy();
         ipf.ToneMapFattal02(fattalprev);
         if (oprevi != orig_prev) {
             delete oprevi;
         }
         oprevi = fattalprev;
-    } 
+    } else {
+        oprevi = orig_prev;
+    }
+
+    progress ("Rotate / Distortion...", 100 * readyphase / numofphases);
+    // Remove transformation if unneeded
+    bool needstransform = ipf.needsTransform();
+    
+    if ((needstransform || ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled)) ) {
+        assert(oprevi);
+        Imagefloat *op = oprevi;
+        oprevi = new Imagefloat (pW, pH);
+
+        if (needstransform)
+            ipf.transform (op, oprevi, 0, 0, 0, 0, pW, pH, fw, fh, 
+                           imgsrc->getMetaData(), imgsrc->getRotateDegree(), false);
+        else {
+            op->copyData (oprevi);
+        }
+    }
 
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
         const int W = oprevi->getWidth();
diff --git a/rtengine/rtthumbnail.cc b/rtengine/rtthumbnail.cc
index efe5d7868..0d5fcf574 100644
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@@ -1090,6 +1090,10 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
 
     ipf.firstAnalysis (baseImg, params, hist16);
 
+    if (params.fattal.enabled) {
+        ipf.ToneMapFattal02(baseImg);
+    }
+    
     // perform transform
     if (ipf.needsTransform()) {
         Imagefloat* trImg = new Imagefloat (fw, fh);
@@ -1102,10 +1106,6 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, eSensorT
         baseImg = trImg;
     }
 
-    if (params.fattal.enabled) {
-        ipf.ToneMapFattal02(baseImg);
-    }
-
     // update blurmap
     SHMap* shmap = nullptr;
 
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index c8d45acf8..cb1ea9c45 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -810,6 +810,10 @@ private:
 
         ipf.firstAnalysis (baseImg, params, hist16);
 
+        if (params.fattal.enabled) {
+            ipf.ToneMapFattal02(baseImg);
+        }
+                
         // perform transform (excepted resizing)
         if (ipf.needsTransform()) {
             Imagefloat* trImg = nullptr;
@@ -833,10 +837,6 @@ private:
         //ImProcFunctions ipf (&params, true);
         ImProcFunctions &ipf = * (ipf_p.get());
 
-        if (params.fattal.enabled) {
-            ipf.ToneMapFattal02(baseImg);
-        }
-        
         if (params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
             const int W = baseImg->getWidth();
             const int H = baseImg->getHeight();

From 619b3e9c6370ffb2689bb144ae40e4c50bcb0864 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Thu, 16 Nov 2017 17:35:21 +0100
Subject: [PATCH 33/39] fixed off-by-one error leading to segfault

---
 rtengine/dcrop.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 62f755aee..0fe9402f1 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -750,8 +750,8 @@ void Crop::update (int todo)
     if (need_cropping) {
         Imagefloat *c = new Imagefloat(cropw, croph);
 
-        int oy = skips(cropy, skip);
-        int ox = skips(cropx, skip);
+        int oy = cropy / skip;
+        int ox = cropx / skip;
 #ifdef _OPENMP
         #pragma omp parallel for
 #endif

From b25bac8c532844961225467d4baf7ad1ad6cd62e Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 17 Nov 2017 15:27:56 +0100
Subject: [PATCH 34/39] improved performance of Fattal in dcrop

Now only Fattal works on the full image; the rest of the pipeline (including denoising) always operates only on the visible crop.
---
 rtengine/dcrop.cc | 116 +++++++++++++++++++++++-----------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 0fe9402f1..4cf98859e 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -691,45 +691,79 @@ void Crop::update (int todo)
     createBuffer (cropw, croph);
 
     std::unique_ptr<Imagefloat> fattalCrop;
-    bool need_cropping = false;
-    bool has_fattal = false;
-    
     if ((todo & (M_TRANSFORM | M_RGBCURVE)) && params.fattal.enabled) {
-        has_fattal = true;
         Imagefloat *f = baseCrop;
+        int fw = skips(parent->fw, skip);
+        int fh = skips(parent->fh, skip);
+        bool need_cropping = false;
+
+        if (cropx || cropy || trafw != fw || trafh != fh) {
+            need_cropping = true;
+            // fattal needs to work on the full image. So here we get the full
+            // image from imgsrc, and replace the denoised crop in case
+            f = new Imagefloat(fw, fh);
+            PreviewProps pp (0, 0, parent->fw, parent->fh, skip);
+            int tr = getCoarseBitMask(params.coarse);
+            parent->imgsrc->getImage(parent->currWB, tr, f, pp, params.toneCurve, params.icm, params.raw);
+            parent->imgsrc->convertColorSpace(f, params.icm, parent->currWB);
+
+            if (params.dirpyrDenoise.enabled) {
+                // copy the denoised crop
+                int oy = cropy / skip;
+                int ox = cropx / skip;
+#ifdef _OPENMP
+                #pragma omp parallel for
+#endif
+                for (int y = 0; y < baseCrop->getHeight(); ++y) {
+                    int dy = oy + y;
+                    for (int x = 0; x < baseCrop->getWidth(); ++x) {
+                        int dx = ox + x;
+                        f->r(dy, dx) = baseCrop->r(y, x);
+                        f->g(dy, dx) = baseCrop->g(y, x);
+                        f->b(dy, dx) = baseCrop->b(y, x);
+                    }
+                }
+            }
+        }
         if (f == origCrop) {
             fattalCrop.reset(baseCrop->copy());
             f = fattalCrop.get();
         }
         parent->ipf.ToneMapFattal02(f);
-        need_cropping = (cropx || cropy || trafw != cropw || trafh != croph);
-        baseCrop = f;
+
+        // crop back to the size expected by the rest of the pipeline
+        if (need_cropping) {
+            Imagefloat *c = new Imagefloat(cropw, croph);
+
+            int oy = cropy / skip;
+            int ox = cropx / skip;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+            for (int y = 0; y < croph; ++y) {
+                int cy = y + oy;
+                for (int x = 0; x < cropw; ++x) {
+                    int cx = x + ox;
+                    c->r(y, x) = f->r(cy, cx);
+                    c->g(y, x) = f->g(cy, cx);
+                    c->b(y, x) = f->b(cy, cx);
+                }
+            }
+            fattalCrop.reset(c);
+            baseCrop = c;
+        } else {
+            baseCrop = f;
+        }
     }
     
     // transform
     if (needstransform || ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled)) {
-        int tx = cropx;
-        int ty = cropy;
-        int tw = cropw;
-        int th = croph;
-        
-        if (has_fattal) {
-            tx = 0;
-            ty = 0;
-            tw = trafw;
-            th = trafh;
-            if (transCrop) {
-                delete transCrop;
-                transCrop = nullptr;
-            }
-        }
-        
         if (!transCrop) {
-            transCrop = new Imagefloat (tw, th);
+            transCrop = new Imagefloat (cropw, croph);
         }
 
         if (needstransform)
-            parent->ipf.transform (baseCrop, transCrop, tx / skip, ty / skip, trafx / skip, trafy / skip, skips (parent->fw, skip), skips (parent->fh, skip), parent->getFullWidth(), parent->getFullHeight(),
+            parent->ipf.transform (baseCrop, transCrop, cropx / skip, cropy / skip, trafx / skip, trafy / skip, skips (parent->fw, skip), skips (parent->fh, skip), parent->getFullWidth(), parent->getFullHeight(),
                                    parent->imgsrc->getMetaData(),
                                    parent->imgsrc->getRotateDegree(), false);
         else {
@@ -747,28 +781,6 @@ void Crop::update (int todo)
         transCrop = nullptr;
     }
 
-    if (need_cropping) {
-        Imagefloat *c = new Imagefloat(cropw, croph);
-
-        int oy = cropy / skip;
-        int ox = cropx / skip;
-#ifdef _OPENMP
-        #pragma omp parallel for
-#endif
-        for (int y = 0; y < croph; ++y) {
-            int cy = y + oy;
-            for (int x = 0; x < cropw; ++x) {
-                int cx = x + ox;
-                c->r(y, x) = baseCrop->r(cy, cx);
-                c->g(y, x) = baseCrop->g(cy, cx);
-                c->b(y, x) = baseCrop->b(cy, cx);
-            }
-        }
-        fattalCrop.reset(c);
-        baseCrop = c;
-    }
-    
-
     if ((todo & (M_TRANSFORM | M_RGBCURVE))  && params.dirpyrequalizer.cbdlMethod == "bef" && params.dirpyrequalizer.enabled && !params.colorappearance.enabled) {
 
         const int W = baseCrop->getWidth();
@@ -1131,11 +1143,6 @@ void Crop::freeAll ()
 namespace
 {
 
-bool check_need_full_image(const ProcParams &params)
-{
-    return params.fattal.enabled; // agriggio - maybe we can do this for wavelets too?
-}
-
 bool check_need_larger_crop_for_lcp_distortion (int fw, int fh, int x, int y, int w, int h, const ProcParams &params)
 {
     if (x == 0 && y == 0 && w == fw && h == fh) {
@@ -1241,13 +1248,6 @@ bool Crop::setCropSizes (int rcx, int rcy, int rcw, int rch, int skip, bool inte
     leftBorder  = skips (rqx1 - bx1, skip);
     upperBorder = skips (rqy1 - by1, skip);
 
-    if (check_need_full_image(parent->params)) {
-        orx = 0;
-        ory = 0;
-        orw = parent->fullw;
-        orh = parent->fullh;
-    }    
-    
     PreviewProps cp (orx, ory, orw, orh, skip);
     int orW, orH;
     parent->imgsrc->getSize (cp, orW, orH);

From 1467b858c5e009ac66b66dc34d9b6ff6ff5f06b1 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 17 Nov 2017 15:34:48 +0100
Subject: [PATCH 35/39] fixed missing memory deallocation

---
 rtengine/dcrop.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 4cf98859e..c8baf42df 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -702,6 +702,7 @@ void Crop::update (int todo)
             // fattal needs to work on the full image. So here we get the full
             // image from imgsrc, and replace the denoised crop in case
             f = new Imagefloat(fw, fh);
+            fattalCrop.reset(f);
             PreviewProps pp (0, 0, parent->fw, parent->fh, skip);
             int tr = getCoarseBitMask(params.coarse);
             parent->imgsrc->getImage(parent->currWB, tr, f, pp, params.toneCurve, params.icm, params.raw);

From 6212d6e0c6366df28296a042bdd547126a4ab3b7 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 17 Nov 2017 17:46:13 +0100
Subject: [PATCH 36/39] better (local) caching of fattal results in dcrop

Use new M_HDR todo code instead of using M_RGBCURVE for fattal
(ported from Hombre's commit a9d02a7dca0b5dcd31f642046d94e1663b17c9ff)
---
 rtengine/dcrop.cc             | 17 ++++--------
 rtengine/improccoordinator.cc | 11 +++-----
 rtengine/refreshmap.cc        |  6 ++--
 rtengine/refreshmap.h         | 52 ++++++++++++++++++-----------------
 4 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index c8baf42df..c6b888a08 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -168,7 +168,7 @@ void Crop::update (int todo)
 
     bool needstransform  = parent->ipf.needsTransform();
 
-    if (todo & (M_INIT | M_LINDENOISE)) {
+    if (todo & (M_INIT | M_LINDENOISE | M_HDR)) {
         MyMutex::MyLock lock (parent->minit); // Also used in improccoord
 
         int tr = getCoarseBitMask (params.coarse);
@@ -691,8 +691,8 @@ void Crop::update (int todo)
     createBuffer (cropw, croph);
 
     std::unique_ptr<Imagefloat> fattalCrop;
-    if ((todo & (M_TRANSFORM | M_RGBCURVE)) && params.fattal.enabled) {
-        Imagefloat *f = baseCrop;
+    if ((todo & M_HDR) && params.fattal.enabled) {
+        Imagefloat *f = origCrop;
         int fw = skips(parent->fw, skip);
         int fh = skips(parent->fh, skip);
         bool need_cropping = false;
@@ -726,18 +726,14 @@ void Crop::update (int todo)
                 }
             }
         }
-        if (f == origCrop) {
-            fattalCrop.reset(baseCrop->copy());
-            f = fattalCrop.get();
-        }
         parent->ipf.ToneMapFattal02(f);
 
         // crop back to the size expected by the rest of the pipeline
         if (need_cropping) {
-            Imagefloat *c = new Imagefloat(cropw, croph);
+            Imagefloat *c = origCrop;
 
-            int oy = cropy / skip;
-            int ox = cropx / skip;
+            int oy = trafy / skip;
+            int ox = trafx / skip;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@@ -750,7 +746,6 @@ void Crop::update (int todo)
                     c->b(y, x) = f->b(cy, cx);
                 }
             }
-            fattalCrop.reset(c);
             baseCrop = c;
         } else {
             baseCrop = f;
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index f456cc4c6..9172fbf96 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -280,7 +280,7 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
         }
     }
 
-    if (todo & (M_INIT | M_LINDENOISE)) {
+    if (todo & (M_INIT | M_LINDENOISE | M_HDR)) {
         MyMutex::MyLock initLock (minit); // Also used in crop window
 
         imgsrc->HLRecovery_Global ( params.toneCurve); // this handles Color HLRecovery
@@ -385,16 +385,13 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
 
     readyphase++;
 
-    if ((todo & (M_TRANSFORM | M_RGBCURVE)) && params.fattal.enabled) {
-        Imagefloat *fattalprev = orig_prev->copy();
-        ipf.ToneMapFattal02(fattalprev);
+    if ((todo & M_HDR) && params.fattal.enabled) {
+        ipf.ToneMapFattal02(orig_prev);
         if (oprevi != orig_prev) {
             delete oprevi;
         }
-        oprevi = fattalprev;
-    } else {
-        oprevi = orig_prev;
     }
+    oprevi = orig_prev;
 
     progress ("Rotate / Distortion...", 100 * readyphase / numofphases);
     // Remove transformation if unneeded
diff --git a/rtengine/refreshmap.cc b/rtengine/refreshmap.cc
index fa6b52c4e..c95b53c0a 100644
--- a/rtengine/refreshmap.cc
+++ b/rtengine/refreshmap.cc
@@ -514,9 +514,9 @@ int refreshmap[rtengine::NUMOFEVENTS] = {
     DARKFRAME,        // EvLensCorrMode
     DARKFRAME,        // EvLensCorrLensfunCamera
     DARKFRAME,        // EvLensCorrLensfunLens
-    RGBCURVE,         // EvTMFattalEnabled
-    RGBCURVE,         // EvTMFattalThreshold
-    RGBCURVE          // EvTMFattalAmount
+    ALLNORAW,         // EvTMFattalEnabled
+    HDR,              // EvTMFattalThreshold
+    HDR               // EvTMFattalAmount
 
 };
 
diff --git a/rtengine/refreshmap.h b/rtengine/refreshmap.h
index e262c9394..cea6b3c8e 100644
--- a/rtengine/refreshmap.h
+++ b/rtengine/refreshmap.h
@@ -20,22 +20,23 @@
 #define __REFRESHMAP__
 
 // Use M_VOID if you wish to update the proc params without updating the preview at all !
-#define M_VOID       (1<<16)
+#define M_VOID       (1<<17)
 // Use M_MINUPDATE if you wish to update the preview without modifying the image (think about it like a "refreshPreview")
 // Must NOT be used with other event (i.e. will be used for MINUPDATE only)
-#define M_MINUPDATE  (1<<15)
+#define M_MINUPDATE  (1<<16)
 // Force high quality
-#define M_HIGHQUAL   (1<<14)
+#define M_HIGHQUAL   (1<<15)
 
 // Elementary functions that can be done to
 // the preview image when an event occurs
-#define M_MONITOR     (1<<13)
-#define M_RETINEX     (1<<12)
-#define M_CROP        (1<<11)
-#define M_PREPROC     (1<<10)
-#define M_RAW         (1<<9)
-#define M_INIT        (1<<8)
-#define M_LINDENOISE  (1<<7)
+#define M_MONITOR     (1<<14)
+#define M_RETINEX     (1<<13)
+#define M_CROP        (1<<12)
+#define M_PREPROC     (1<<11)
+#define M_RAW         (1<<10)
+#define M_INIT        (1<<9)
+#define M_LINDENOISE  (1<<8)
+#define M_HDR         (1<<7)
 #define M_TRANSFORM   (1<<6)
 #define M_BLURMAP     (1<<5)
 #define M_AUTOEXP     (1<<4)
@@ -46,21 +47,22 @@
 
 // Bitfield of functions to do to the preview image when an event occurs
 // Use those or create new ones for your new events
-#define FIRST            (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR|M_MONITOR)  // without HIGHQUAL
-#define ALL              (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)  // without HIGHQUAL
-#define DARKFRAME        (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define FLATFIELD        (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define DEMOSAIC                   (M_RAW|M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define ALLNORAW                         (M_INIT|M_LINDENOISE|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define TRANSFORM                                            (M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define AUTOEXP                                                                    (M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define RGBCURVE                                                                             (M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define LUMINANCECURVE                                                                                  (M_LUMACURVE|M_LUMINANCE|M_COLOR)
-#define SHARPENING                                                                                                  (M_LUMINANCE|M_COLOR)
-#define IMPULSEDENOISE                                                                                              (M_LUMINANCE|M_COLOR)
-#define DEFRINGE                                                                                                    (M_LUMINANCE|M_COLOR)
-#define DIRPYRDENOISE                                                                                               (M_LUMINANCE|M_COLOR)
-#define DIRPYREQUALIZER                                                                                             (M_LUMINANCE|M_COLOR)
+#define FIRST            (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR|M_MONITOR)  // without HIGHQUAL
+#define ALL              (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)  // without HIGHQUAL
+#define DARKFRAME        (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define FLATFIELD        (M_PREPROC|M_RAW|M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define DEMOSAIC                   (M_RAW|M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define ALLNORAW                         (M_INIT|M_LINDENOISE|M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define HDR                                                  (M_HDR|M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define TRANSFORM                                                  (M_TRANSFORM|M_BLURMAP|M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define AUTOEXP                                                                          (M_AUTOEXP|M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define RGBCURVE                                                                                   (M_RGBCURVE|M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define LUMINANCECURVE                                                                                        (M_LUMACURVE|M_LUMINANCE|M_COLOR)
+#define SHARPENING                                                                                                        (M_LUMINANCE|M_COLOR)
+#define IMPULSEDENOISE                                                                                                    (M_LUMINANCE|M_COLOR)
+#define DEFRINGE                                                                                                          (M_LUMINANCE|M_COLOR)
+#define DIRPYRDENOISE                                                                                                     (M_LUMINANCE|M_COLOR)
+#define DIRPYREQUALIZER                                                                                                   (M_LUMINANCE|M_COLOR)
 #define GAMMA             M_MONITOR
 #define CROP              M_CROP
 #define RESIZE            M_VOID

From 7b9252be3363a930b3308d510cd1923a49aefcc3 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 17 Nov 2017 21:02:19 +0100
Subject: [PATCH 37/39] fattal: correctly crop the image to the dimensions
 required for distortion correction

Fix for #4187
---
 rtengine/dcrop.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index c6b888a08..71030f35b 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -697,7 +697,7 @@ void Crop::update (int todo)
         int fh = skips(parent->fh, skip);
         bool need_cropping = false;
 
-        if (cropx || cropy || trafw != fw || trafh != fh) {
+        if (trafx || trafy || trafw != fw || trafh != fh) {
             need_cropping = true;
             // fattal needs to work on the full image. So here we get the full
             // image from imgsrc, and replace the denoised crop in case
@@ -710,8 +710,8 @@ void Crop::update (int todo)
 
             if (params.dirpyrDenoise.enabled) {
                 // copy the denoised crop
-                int oy = cropy / skip;
-                int ox = cropx / skip;
+                int oy = trafy / skip;
+                int ox = trafx / skip;
 #ifdef _OPENMP
                 #pragma omp parallel for
 #endif
@@ -737,9 +737,9 @@ void Crop::update (int todo)
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-            for (int y = 0; y < croph; ++y) {
+            for (int y = 0; y < trafh; ++y) {
                 int cy = y + oy;
-                for (int x = 0; x < cropw; ++x) {
+                for (int x = 0; x < trafw; ++x) {
                     int cx = x + ox;
                     c->r(y, x) = f->r(cy, cx);
                     c->g(y, x) = f->g(cy, cx);

From e384edba55bf9c9700aa425cbf8e6e284af4a793 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Fri, 17 Nov 2017 21:28:17 +0100
Subject: [PATCH 38/39] fattal: use a common cache for 1:1 detail crops when
 denoise is turned off

---
 rtengine/dcrop.cc             | 51 +++++++++++++++++++++--------------
 rtengine/improccoordinator.cc | 10 ++++++-
 rtengine/improccoordinator.h  |  1 +
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 71030f35b..513039ad6 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -696,37 +696,48 @@ void Crop::update (int todo)
         int fw = skips(parent->fw, skip);
         int fh = skips(parent->fh, skip);
         bool need_cropping = false;
+        bool need_fattal = true;
 
         if (trafx || trafy || trafw != fw || trafh != fh) {
             need_cropping = true;
             // fattal needs to work on the full image. So here we get the full
             // image from imgsrc, and replace the denoised crop in case
-            f = new Imagefloat(fw, fh);
-            fattalCrop.reset(f);
-            PreviewProps pp (0, 0, parent->fw, parent->fh, skip);
-            int tr = getCoarseBitMask(params.coarse);
-            parent->imgsrc->getImage(parent->currWB, tr, f, pp, params.toneCurve, params.icm, params.raw);
-            parent->imgsrc->convertColorSpace(f, params.icm, parent->currWB);
+            if (!params.dirpyrDenoise.enabled && skip == 1 && parent->fattal_11_dcrop_cache) {
+                f = parent->fattal_11_dcrop_cache;
+                need_fattal = false;
+            } else {
+                f = new Imagefloat(fw, fh);
+                fattalCrop.reset(f);
+                PreviewProps pp (0, 0, parent->fw, parent->fh, skip);
+                int tr = getCoarseBitMask(params.coarse);
+                parent->imgsrc->getImage(parent->currWB, tr, f, pp, params.toneCurve, params.icm, params.raw);
+                parent->imgsrc->convertColorSpace(f, params.icm, parent->currWB);
 
-            if (params.dirpyrDenoise.enabled) {
-                // copy the denoised crop
-                int oy = trafy / skip;
-                int ox = trafx / skip;
+                if (params.dirpyrDenoise.enabled) {
+                    // copy the denoised crop
+                    int oy = trafy / skip;
+                    int ox = trafx / skip;
 #ifdef _OPENMP
-                #pragma omp parallel for
+                    #pragma omp parallel for
 #endif
-                for (int y = 0; y < baseCrop->getHeight(); ++y) {
-                    int dy = oy + y;
-                    for (int x = 0; x < baseCrop->getWidth(); ++x) {
-                        int dx = ox + x;
-                        f->r(dy, dx) = baseCrop->r(y, x);
-                        f->g(dy, dx) = baseCrop->g(y, x);
-                        f->b(dy, dx) = baseCrop->b(y, x);
+                    for (int y = 0; y < baseCrop->getHeight(); ++y) {
+                        int dy = oy + y;
+                        for (int x = 0; x < baseCrop->getWidth(); ++x) {
+                            int dx = ox + x;
+                            f->r(dy, dx) = baseCrop->r(y, x);
+                            f->g(dy, dx) = baseCrop->g(y, x);
+                            f->b(dy, dx) = baseCrop->b(y, x);
+                        }
                     }
+                } else if (skip == 1) {
+                    parent->fattal_11_dcrop_cache = f; // cache this globally
+                    fattalCrop.release();
                 }
             }
         }
-        parent->ipf.ToneMapFattal02(f);
+        if (need_fattal) {
+            parent->ipf.ToneMapFattal02(f);
+        }
 
         // crop back to the size expected by the rest of the pipeline
         if (need_cropping) {
@@ -735,7 +746,7 @@ void Crop::update (int todo)
             int oy = trafy / skip;
             int ox = trafx / skip;
 #ifdef _OPENMP
-#pragma omp parallel for
+            #pragma omp parallel for
 #endif
             for (int y = 0; y < trafh; ++y) {
                 int cy = y + oy;
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index 9172fbf96..532068eee 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -33,7 +33,7 @@ namespace rtengine
 extern const Settings* settings;
 
 ImProcCoordinator::ImProcCoordinator ()
-    : orig_prev (nullptr), oprevi (nullptr), oprevl (nullptr), nprevl (nullptr), previmg (nullptr), workimg (nullptr),
+    : orig_prev (nullptr), oprevi (nullptr), oprevl (nullptr), nprevl (nullptr), fattal_11_dcrop_cache(nullptr), previmg (nullptr), workimg (nullptr),
       ncie (nullptr), imgsrc (nullptr), shmap (nullptr), lastAwbEqual (0.), lastAwbTempBias (0.0), ipf (&params, true), monitorIntent (RI_RELATIVE),
       softProof (false), gamutCheck (false), scale (10), highDetailPreprocessComputed (false), highDetailRawComputed (false),
       allocated (false), bwAutoR (-9000.f), bwAutoG (-9000.f), bwAutoB (-9000.f), CAMMean (NAN),
@@ -111,6 +111,10 @@ ImProcCoordinator::~ImProcCoordinator ()
     mProcessing.lock();
     mProcessing.unlock();
     freeAll ();
+    if (fattal_11_dcrop_cache) {
+        delete fattal_11_dcrop_cache;
+        fattal_11_dcrop_cache = nullptr;
+    }
 
     std::vector<Crop*> toDel = crops;
 
@@ -386,6 +390,10 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
     readyphase++;
 
     if ((todo & M_HDR) && params.fattal.enabled) {
+        if (fattal_11_dcrop_cache) {
+            delete fattal_11_dcrop_cache;
+            fattal_11_dcrop_cache = nullptr;
+        }
         ipf.ToneMapFattal02(orig_prev);
         if (oprevi != orig_prev) {
             delete oprevi;
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 41b901e93..2f5fe52e5 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -57,6 +57,7 @@ protected:
     Imagefloat *oprevi;
     LabImage *oprevl;
     LabImage *nprevl;
+    Imagefloat *fattal_11_dcrop_cache; // global cache for ToneMapFattal02 used in 1:1 detail windows (except when denoise is active)
     Image8 *previmg;  // displayed image in monitor color space, showing the output profile as well (soft-proofing enabled, which then correspond to workimg) or not
     Image8 *workimg;  // internal image in output color space for analysis
     CieImage *ncie;

From e9d30532654055f1c6cc289c52092812920cae13 Mon Sep 17 00:00:00 2001
From: Alberto Griggio <agriggio@users.noreply.github.com>
Date: Sat, 18 Nov 2017 17:03:57 +0100
Subject: [PATCH 39/39] run tmo_fattal02.cc through astyle

---
 rtengine/tmo_fattal02.cc | 1311 ++++++++++++++++++++------------------
 1 file changed, 683 insertions(+), 628 deletions(-)

diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc
index d537cce3f..6e4b45ccb 100644
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -3,7 +3,7 @@
  *  This file is part of RawTherapee.
  *
  *  Ported from LuminanceHDR by Alberto Griggio <alberto.griggio@gmail.com>
- *  
+ *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
@@ -73,7 +73,8 @@
 #include "StopWatch.h"
 #include "sleef.c"
 #include "opthelper.h"
-namespace rtengine {
+namespace rtengine
+{
 
 /******************************************************************************
  * RT code
@@ -84,66 +85,68 @@ extern MyMutex *fftwMutex;
 
 using namespace std;
 
-namespace {
+namespace
+{
 
-class Array2Df: public array2D<float> {
+class Array2Df: public array2D<float>
+{
     typedef array2D<float> Super;
 public:
     Array2Df(): Super() {}
-    Array2Df(int w, int h): Super(w, h) {}
+    Array2Df (int w, int h): Super (w, h) {}
 
-    float &operator()(int w, int h)
+    float &operator() (int w, int h)
     {
         return (*this)[h][w];
     }
 
-    const float &operator()(int w, int h) const
+    const float &operator() (int w, int h) const
     {
         return (*this)[h][w];
     }
 
-    float &operator()(int i)
+    float &operator() (int i)
     {
-        return static_cast<float *>(*this)[i];
+        return static_cast<float *> (*this)[i];
     }
 
-    const float &operator()(int i) const
+    const float &operator() (int i) const
     {
-        return const_cast<Array2Df &>(*this).operator()(i);
+        return const_cast<Array2Df &> (*this).operator() (i);
     }
 
     int getRows() const
     {
-        return const_cast<Array2Df &>(*this).height();
+        return const_cast<Array2Df &> (*this).height();
     }
 
     int getCols() const
     {
-        return const_cast<Array2Df &>(*this).width();
+        return const_cast<Array2Df &> (*this).width();
     }
 
     float *data()
     {
-        return static_cast<float *>(*this);
+        return static_cast<float *> (*this);
     }
 
     const float *data() const
     {
-        return const_cast<Array2Df &>(*this).data();
+        return const_cast<Array2Df &> (*this).data();
     }
 };
 
 // upper bound on image dimension used in tmo_fattal02 -- see the comment there
 const int RT_dimension_cap = 1920;
 
-void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread);
+void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread);
 
 
 /******************************************************************************
  * Luminance HDR code (modifications are marked with an RT comment)
  ******************************************************************************/
 
-void downSample(const Array2Df& A, Array2Df& B)
+void downSample (const Array2Df& A, Array2Df& B)
 {
     const int width = B.getCols();
     const int height = B.getRows();
@@ -153,163 +156,164 @@ void downSample(const Array2Df& A, Array2Df& B)
     // speed improvements. The main issue is the pde solver and in case of the
     // fft solver uses optimised threaded fftw routines.
     //#pragma omp parallel for
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=0 ; x<width ; x++ )
-        {
-            float p = A(2*x,2*y);
-            p += A(2*x+1,2*y);
-            p += A(2*x,2*y+1);
-            p += A(2*x+1,2*y+1);
-            B(x,y) = p * 0.25f; // p / 4.0f;
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 0 ; x < width ; x++ ) {
+            float p = A (2 * x, 2 * y);
+            p += A (2 * x + 1, 2 * y);
+            p += A (2 * x, 2 * y + 1);
+            p += A (2 * x + 1, 2 * y + 1);
+            B (x, y) = p * 0.25f; // p / 4.0f;
         }
     }
 }
 
-void gaussianBlur(const Array2Df& I, Array2Df& L)
+void gaussianBlur (const Array2Df& I, Array2Df& L)
 {
     const int width = I.getCols();
     const int height = I.getRows();
 
     if (width < 3 || height < 3) {
         if (&I != &L) {
-            for (int i = 0, n = width*height; i < n; ++i) {
-                L(i) = I(i);
+            for (int i = 0, n = width * height; i < n; ++i) {
+                L (i) = I (i);
             }
         }
+
         return;
     }
 
-    Array2Df T(width,height);
+    Array2Df T (width, height);
 
     //--- X blur
     #pragma omp parallel for shared(I, T)
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=1 ; x<width-1 ; x++ )
-        {
-            float t = 2.f * I(x,y);
-            t += I(x-1,y);
-            t += I(x+1,y);
-            T(x,y) = t * 0.25f; // t / 4.f;
+
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 1 ; x < width - 1 ; x++ ) {
+            float t = 2.f * I (x, y);
+            t += I (x - 1, y);
+            t += I (x + 1, y);
+            T (x, y) = t * 0.25f; // t / 4.f;
         }
-        T(0,y) = ( 3.f * I(0,y)+ I(1,y) ) * 0.25f; // / 4.f;
-        T(width-1,y) = ( 3.f * I(width-1,y) + I(width-2,y) ) * 0.25f; // / 4.f;
+
+        T (0, y) = ( 3.f * I (0, y) + I (1, y) ) * 0.25f; // / 4.f;
+        T (width - 1, y) = ( 3.f * I (width - 1, y) + I (width - 2, y) ) * 0.25f; // / 4.f;
     }
 
     //--- Y blur
     #pragma omp parallel for
-    for ( int x=0 ; x<width-7 ; x+=8 )
-    {
-        for ( int y=1 ; y<height-1 ; y++ )
-        {
-            for(int xx = 0; xx < 8; ++xx) {
-                float t = 2.f * T(x+xx,y);
-                t += T(x+xx,y-1);
-                t += T(x+xx,y+1);
-                L(x+xx,y) = t * 0.25f; // t/4.0f;
+
+    for ( int x = 0 ; x < width - 7 ; x += 8 ) {
+        for ( int y = 1 ; y < height - 1 ; y++ ) {
+            for (int xx = 0; xx < 8; ++xx) {
+                float t = 2.f * T (x + xx, y);
+                t += T (x + xx, y - 1);
+                t += T (x + xx, y + 1);
+                L (x + xx, y) = t * 0.25f; // t/4.0f;
             }
         }
-        for(int xx = 0; xx < 8; ++xx) {
-            L(x+xx,0) = ( 3.f * T(x+xx,0) + T(x+xx,1) ) * 0.25f; // / 4.0f;
-            L(x+xx,height-1) = ( 3.f * T(x+xx,height-1) + T(x+xx,height-2) ) * 0.25f; // / 4.0f;
+
+        for (int xx = 0; xx < 8; ++xx) {
+            L (x + xx, 0) = ( 3.f * T (x + xx, 0) + T (x + xx, 1) ) * 0.25f; // / 4.0f;
+            L (x + xx, height - 1) = ( 3.f * T (x + xx, height - 1) + T (x + xx, height - 2) ) * 0.25f; // / 4.0f;
         }
     }
-    for ( int x = width - (width % 8) ; x<width ; x++ )
-    {
-        for ( int y=1 ; y<height-1 ; y++ )
-        {
-            float t = 2.f * T(x,y);
-            t += T(x,y-1);
-            t += T(x,y+1);
-            L(x,y) = t * 0.25f; // t/4.0f;
+
+    for ( int x = width - (width % 8) ; x < width ; x++ ) {
+        for ( int y = 1 ; y < height - 1 ; y++ ) {
+            float t = 2.f * T (x, y);
+            t += T (x, y - 1);
+            t += T (x, y + 1);
+            L (x, y) = t * 0.25f; // t/4.0f;
         }
-        L(x,0) = ( 3.f * T(x,0) + T(x,1) ) * 0.25f; // / 4.0f;
-        L(x,height-1) = ( 3.f * T(x,height-1) + T(x,height-2) ) * 0.25f; // / 4.0f;
+
+        L (x, 0) = ( 3.f * T (x, 0) + T (x, 1) ) * 0.25f; // / 4.0f;
+        L (x, height - 1) = ( 3.f * T (x, height - 1) + T (x, height - 2) ) * 0.25f; // / 4.0f;
     }
 }
 
-void createGaussianPyramids( Array2Df* H, Array2Df** pyramids, int nlevels)
+void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)
 {
-  int width = H->getCols();
-  int height = H->getRows();
-  const int size = width*height;
+    int width = H->getCols();
+    int height = H->getRows();
+    const int size = width * height;
+
+    pyramids[0] = new Array2Df (width, height);
 
-  pyramids[0] = new Array2Df(width,height);
 //#pragma omp parallel for shared(pyramids, H)
-  for( int i=0 ; i<size ; i++ )
-    (*pyramids[0])(i) = (*H)(i);
-
-  Array2Df* L = new Array2Df(width,height);
-  gaussianBlur( *pyramids[0], *L );
-
-  for ( int k=1 ; k<nlevels ; k++ )
-  {
-      if (width > 2 && height > 2) {
-          width /= 2;
-          height /= 2;
-          pyramids[k] = new Array2Df(width,height);
-          downSample(*L, *pyramids[k]);
-      } else {
-          // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
-          // there), so it might happen that we have to add some padding to
-          // the gaussian pyramids
-          pyramids[k] = new Array2Df(width,height);
-          for (int j = 0, n = width*height; j < n; ++j) {
-              (*pyramids[k])(j) = (*L)(j);
-          }
-      }
-
-    if(k < nlevels -1) {
-        delete L;
-        L = new Array2Df(width,height);
-        gaussianBlur( *pyramids[k], *L );
+    for ( int i = 0 ; i < size ; i++ ) {
+        (*pyramids[0]) (i) = (*H) (i);
     }
-  }
 
-  delete L;
+    Array2Df* L = new Array2Df (width, height);
+    gaussianBlur ( *pyramids[0], *L );
+
+    for ( int k = 1 ; k < nlevels ; k++ ) {
+        if (width > 2 && height > 2) {
+            width /= 2;
+            height /= 2;
+            pyramids[k] = new Array2Df (width, height);
+            downSample (*L, *pyramids[k]);
+        } else {
+            // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
+            // there), so it might happen that we have to add some padding to
+            // the gaussian pyramids
+            pyramids[k] = new Array2Df (width, height);
+
+            for (int j = 0, n = width * height; j < n; ++j) {
+                (*pyramids[k]) (j) = (*L) (j);
+            }
+        }
+
+        if (k < nlevels - 1) {
+            delete L;
+            L = new Array2Df (width, height);
+            gaussianBlur ( *pyramids[k], *L );
+        }
+    }
+
+    delete L;
 }
 
 //--------------------------------------------------------------------
 
-float calculateGradients(Array2Df* H, Array2Df* G, int k)
+float calculateGradients (Array2Df* H, Array2Df* G, int k)
 {
-  const int width = H->getCols();
-  const int height = H->getRows();
-  const float divider = pow( 2.0f, k+1 );
-  float avgGrad = 0.0f;
+    const int width = H->getCols();
+    const int height = H->getRows();
+    const float divider = pow ( 2.0f, k + 1 );
+    float avgGrad = 0.0f;
 
-#pragma omp parallel for reduction(+:avgGrad)
-  for( int y=0 ; y<height ; y++ )
-  {
-    int n = (y == 0 ? 0 : y-1);
-    int s = (y+1 == height ? y : y+1);
-    for( int x=0 ; x<width ; x++ )
-    {
-      float gx, gy;
-      int w, e;
-      w = (x == 0 ? 0 : x-1);
-      e = (x+1 == width ? x : x+1);
+    #pragma omp parallel for reduction(+:avgGrad)
 
-      gx = ((*H)(w,y)-(*H)(e,y));
+    for ( int y = 0 ; y < height ; y++ ) {
+        int n = (y == 0 ? 0 : y - 1);
+        int s = (y + 1 == height ? y : y + 1);
 
-      gy = ((*H)(x,s)-(*H)(x,n));
-      // note this implicitely assumes that H(-1)=H(0)
-      // for the fft-pde slover this would need adjustment as H(-1)=H(1)
-      // is assumed, which means gx=0.0, gy=0.0 at the boundaries
-      // however, the impact is not visible so we ignore this here
+        for ( int x = 0 ; x < width ; x++ ) {
+            float gx, gy;
+            int w, e;
+            w = (x == 0 ? 0 : x - 1);
+            e = (x + 1 == width ? x : x + 1);
 
-      (*G)(x,y) = sqrt(gx*gx+gy*gy) / divider;
-      avgGrad += (*G)(x,y);
+            gx = ((*H) (w, y) - (*H) (e, y));
+
+            gy = ((*H) (x, s) - (*H) (x, n));
+            // note this implicitly assumes that H(-1)=H(0)
+            // for the fft-pde solver this would need adjustment as H(-1)=H(1)
+            // is assumed, which means gx=0.0, gy=0.0 at the boundaries
+            // however, the impact is not visible so we ignore this here
+
+            (*G) (x, y) = sqrt (gx * gx + gy * gy) / divider;
+            avgGrad += (*G) (x, y);
+        }
     }
-  }
 
-  return avgGrad / (width*height);
+    return avgGrad / (width * height);
 }
 
 //--------------------------------------------------------------------
 
-void upSample(const Array2Df& A, Array2Df& B)
+void upSample (const Array2Df& A, Array2Df& B)
 {
     const int width = B.getCols();
     const int height = B.getRows();
@@ -317,18 +321,17 @@ void upSample(const Array2Df& A, Array2Df& B)
     const int aheight = A.getRows();
 
     //#pragma omp parallel for shared(A, B)
-    for ( int y=0 ; y<height ; y++ )
-    {
-        for ( int x=0 ; x<width ; x++ )
-        {
-            int ax = static_cast<int>(x * 0.5f); //x / 2.f;
-            int ay = static_cast<int>(y * 0.5f); //y / 2.f;
-            ax = (ax<awidth) ? ax : awidth-1;
-            ay = (ay<aheight) ? ay : aheight-1;
+    for ( int y = 0 ; y < height ; y++ ) {
+        for ( int x = 0 ; x < width ; x++ ) {
+            int ax = static_cast<int> (x * 0.5f); //x / 2.f;
+            int ay = static_cast<int> (y * 0.5f); //y / 2.f;
+            ax = (ax < awidth) ? ax : awidth - 1;
+            ay = (ay < aheight) ? ay : aheight - 1;
 
-            B(x,y) = A(ax,ay);
+            B (x, y) = A (ax, ay);
         }
     }
+
 //--- this code below produces 'use of uninitialized value error'
 //   int width = A->getCols();
 //   int height = A->getRows();
@@ -345,84 +348,78 @@ void upSample(const Array2Df& A, Array2Df& B)
 }
 
 
-void calculateFiMatrix(Array2Df* FI, Array2Df* gradients[],
-                       float avgGrad[], int nlevels, int detail_level,
-                       float alfa, float beta, float noise)
+void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
+                        float avgGrad[], int nlevels, int detail_level,
+                        float alfa, float beta, float noise)
 {
     const bool newfattal = true;
-    int width = gradients[nlevels-1]->getCols();
-    int height = gradients[nlevels-1]->getRows();
+    int width = gradients[nlevels - 1]->getCols();
+    int height = gradients[nlevels - 1]->getRows();
     Array2Df** fi = new Array2Df*[nlevels];
 
-    fi[nlevels-1] = new Array2Df(width,height);
-    if (newfattal)
-    {
+    fi[nlevels - 1] = new Array2Df (width, height);
+
+    if (newfattal) {
         //#pragma omp parallel for shared(fi)
-        for ( int k = 0 ; k < width*height ; k++ )
-        {
-            (*fi[nlevels-1])(k) = 1.0f;
+        for ( int k = 0 ; k < width * height ; k++ ) {
+            (*fi[nlevels - 1]) (k) = 1.0f;
         }
     }
 
-    for ( int k = nlevels-1; k >= 0 ; k-- )
-    {
+    for ( int k = nlevels - 1; k >= 0 ; k-- ) {
         width = gradients[k]->getCols();
         height = gradients[k]->getRows();
 
         // only apply gradients to levels>=detail_level but at least to the coarsest
         if ( k >= detail_level
-             ||k==nlevels-1
-             || newfattal == false)
-        {
+                || k == nlevels - 1
+                || newfattal == false) {
             //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
             #pragma omp parallel for shared(fi,avgGrad)
-            for ( int y = 0; y < height; y++ )
-            {
-                for ( int x = 0; x < width; x++ )
-                {
-                    float grad = ((*gradients[k])(x,y) < 1e-4f) ? 1e-4 : (*gradients[k])(x,y);
+            for ( int y = 0; y < height; y++ ) {
+                for ( int x = 0; x < width; x++ ) {
+                    float grad = ((*gradients[k]) (x, y) < 1e-4f) ? 1e-4 : (*gradients[k]) (x, y);
                     float a = alfa * avgGrad[k];
 
-                    float value = pow((grad+noise)/a, beta - 1.0f);
+                    float value = pow ((grad + noise) / a, beta - 1.0f);
 
-                    if (newfattal)
-                        (*fi[k])(x,y) *= value;
-                    else
-                        (*fi[k])(x,y) = value;
+                    if (newfattal) {
+                        (*fi[k]) (x, y) *= value;
+                    } else {
+                        (*fi[k]) (x, y) = value;
+                    }
                 }
             }
         }
 
         // create next level
-        if ( k>1 )
-        {
-            width = gradients[k-1]->getCols();
-            height = gradients[k-1]->getRows();
-            fi[k-1] = new Array2Df(width,height);
+        if ( k > 1 ) {
+            width = gradients[k - 1]->getCols();
+            height = gradients[k - 1]->getRows();
+            fi[k - 1] = new Array2Df (width, height);
+        } else {
+            fi[0] = FI;    // highest level -> result
         }
-        else
-            fi[0] = FI;                         // highest level -> result
 
-        if ( k>0  && newfattal )
-        {
-            upSample(*fi[k], *fi[k-1]);           // upsample to next level
-            gaussianBlur(*fi[k-1], *fi[k-1]);
+        if ( k > 0  && newfattal ) {
+            upSample (*fi[k], *fi[k - 1]);        // upsample to next level
+            gaussianBlur (*fi[k - 1], *fi[k - 1]);
         }
     }
 
-    for ( int k=1 ; k<nlevels ; k++ )
-    {
+    for ( int k = 1 ; k < nlevels ; k++ ) {
         delete fi[k];
     }
+
     delete[] fi;
 }
 
 inline
-void findMaxMinPercentile(const Array2Df& I,
-                                 float minPrct, float& minLum,
-                                 float maxPrct, float& maxLum)
+void findMaxMinPercentile (const Array2Df& I,
+                           float minPrct, float& minLum,
+                           float maxPrct, float& maxLum)
 {
-    assert(minPrct <= maxPrct);
+    assert (minPrct <= maxPrct);
 
     const int size = I.getRows() * I.getCols();
     const float* data = I.data();
@@ -432,73 +429,76 @@ void findMaxMinPercentile(const Array2Df& I,
     // memory usage of this method is 65536 * sizeof(float) * (t + 1) byte, where t is the number of threads
 
     // We need one global histogram
-    LUTu histo(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+    LUTu histo (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
     histo.clear();
 #ifdef _OPENMP
-#pragma omp parallel
+    #pragma omp parallel
 #endif
-{
-    // We need one histogram per thread
-    LUTu histothr(65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
-    histothr.clear();
+    {
+        // We need one histogram per thread
+        LUTu histothr (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
+        histothr.clear();
 
 #ifdef _OPENMP
-#pragma omp for nowait
+        #pragma omp for nowait
 #endif
-    for(int i = 0; i< size; ++i) {
-        // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index
-        histothr[(unsigned int)(65535.f * data[i])]++;
+
+        for (int i = 0; i < size; ++i) {
+            // values are in the [0;1] range, so we multiply by 65535 to get the histogram index
+            histothr[ (unsigned int) (65535.f * data[i])]++;
+        }
+
+#ifdef _OPENMP
+        #pragma omp critical
+#endif
+        // add per thread histogram to global histogram
+        histo += histothr;
     }
 
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-    // add per thread histogram to global histogram
-    histo += histothr;
-}
-
     int k = 0;
     int count = 0;
 
     // find (minPrct*size) smallest value
-    while(count < minPrct*size) {
+    while (count < minPrct * size) {
         count += histo[k++];
     }
-    if(k > 0) { // interpolate
+
+    if (k > 0) { // interpolate
         int count_ = count - histo[k - 1];
         float c0 = count - minPrct * size;
         float c1 = minPrct * size - count_;
         minLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
     } else {
-        minLum = k /65535.f;
+        minLum = k / 65535.f;
     }
 
     // find (maxPrct*size) smallest value
-    while(count < maxPrct*size) {
+    while (count < maxPrct * size) {
         count += histo[k++];
     }
-    if(k > 0) { // interpolate
+
+    if (k > 0) { // interpolate
         int count_ = count - histo[k - 1];
         float c0 = count - maxPrct * size;
         float c1 = maxPrct * size - count_;
         maxLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
     } else {
-        maxLum = k /65535.f;
+        maxLum = k / 65535.f;
     }
 
 }
 
-void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
+void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
 
-void tmo_fattal02(size_t width,
-                  size_t height,
-                  const Array2Df& Y,
-                  Array2Df& L,
-                  float alfa,
-                  float beta,
-                  float noise,
-                  int detail_level,
-                  bool multithread)
+void tmo_fattal02 (size_t width,
+                   size_t height,
+                   const Array2Df& Y,
+                   Array2Df& L,
+                   float alfa,
+                   float beta,
+                   float noise,
+                   int detail_level,
+                   bool multithread)
 {
 // #ifdef TIMER_PROFILING
 //     msec_timer stop_watch;
@@ -507,12 +507,18 @@ void tmo_fattal02(size_t width,
     static const float black_point = 0.1f;
     static const float white_point = 0.5f;
     static const float gamma = 1.0f; // 0.8f;
-    // static const int   detail_level = 3;
-    if ( detail_level < 0 ) detail_level = 0;
-    if ( detail_level > 3 ) detail_level = 3;
 
-  // ph.setValue(2);
-  // if (ph.canceled()) return;
+    // static const int   detail_level = 3;
+    if ( detail_level < 0 ) {
+        detail_level = 0;
+    }
+
+    if ( detail_level > 3 ) {
+        detail_level = 3;
+    }
+
+    // ph.setValue(2);
+    // if (ph.canceled()) return;
 
     /* RT -- we use a hardcoded value for nlevels, to limit the
      * dependency of the result on the image size. When using an auto computed
@@ -520,247 +526,274 @@ void tmo_fattal02(size_t width,
      * image sizes, making it essentially impossible to preview the tool
      * inside RT. With a hardcoded value, the results for the preview are much
      * closer to those for the final image */
-  // int MSIZE = 32;         // minimum size of gaussian pyramid
-  // // I believe a smaller value than 32 results in slightly better overall
-  // // quality but I'm only applying this if the newly implemented fft solver
-  // // is used in order not to change behaviour of the old version
-  // // TODO: best let the user decide this value
-  // // if (fftsolver)
-  // {
-  //    MSIZE = 8;
-  // }
-        
+    // int MSIZE = 32;         // minimum size of gaussian pyramid
+    // // I believe a smaller value than 32 results in slightly better overall
+    // // quality but I'm only applying this if the newly implemented fft solver
+    // // is used in order not to change behaviour of the old version
+    // // TODO: best let the user decide this value
+    // // if (fftsolver)
+    // {
+    //    MSIZE = 8;
+    // }
 
-  int size = width*height;
 
-  // find max value, normalize to range 0..100 and take logarithm
-  float minLum = Y(0,0);
-  float maxLum = Y(0,0);
+    int size = width * height;
 
-  #pragma omp parallel for reduction(max:maxLum)
-  for ( int i=0 ; i<size ; i++ )
-  {
-      maxLum = std::max(maxLum, Y(i));
-  }
+    // find max value, normalize to range 0..100 and take logarithm
+    float minLum = Y (0, 0);
+    float maxLum = Y (0, 0);
 
-  Array2Df* H = new Array2Df(width, height);
-  float temp = 100.f / maxLum;
-  float eps = 1e-4f;
-  #pragma omp parallel
-  {
-#ifdef __SSE2__
-  vfloat epsv = F2V(eps);
-  vfloat tempv = F2V(temp);
-#endif
-  #pragma omp for schedule(dynamic,16)
-  for (size_t i=0 ; i<height ; ++i) {
-      size_t j = 0;
-#ifdef __SSE2__
-      for(; j < width - 3; j+=4) {
-          STVFU((*H)[i][j], xlogf(tempv * LVFU(Y[i][j]) + epsv));
-      }
-#endif
-      for(; j < width; ++j) {
-          (*H)[i][j] = xlogf(temp * Y[i][j] + eps);
-      }
-  }
-  }
+    #pragma omp parallel for reduction(max:maxLum)
 
-  /** RT - this is also here to reduce the dependency of the results on the
-   * input image size, with the primary aim of having a preview in RT that is
-   * reasonably close to the actual output image. Intuitively, what we do is
-   * to put a cap on the dimension of the image processed, so that it is close
-   * in size to the typical preview that you will see on a normal consumer
-   * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
-   * However, we can't simply downscale the input Y array and then upscale it
-   * on output, because that would cause a big loss of sharpness (confirmed by
-   * testing).
-   * So, we use a different method: we downscale the H array, so that we
-   * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
-   * we upscale the FI matrix later on, before it gets combined with the
-   * original input luminance array H. This seems to preserve the input
-   * sharpness and at the same time significantly reduce the dependency of the
-   * result on the input size. Clearly this is a hack, and keep in mind that I
-   * do not really know how Fattal works (it comes from LuminanceHDR almost
-   * verbatim), so this should probably be revised/reviewed by someone who
-   * knows better... also, we use a quite naive bilinear interpolation
-   * algorithm (see rescale_bilinear below), which could definitely be
-   * improved */
-  int fullwidth = width;
-  int fullheight = height;
-  int dim = std::max(width, height);
-  Array2Df *fullH = nullptr;
-  if (dim > RT_dimension_cap) {
-      float s = float(RT_dimension_cap) / float(dim);
-      Array2Df *HH = new Array2Df(width * s, height * s);
-      rescale_bilinear(*H, *HH, multithread);
-      fullH = H;
-      H = HH;
-      width = H->getCols();
-      height = H->getRows();
-  }
-  /** RT */
-
-  // create gaussian pyramids
-  // int mins = (width<height) ? width : height;    // smaller dimension
-  // int nlevels = 0;
-  // while ( mins >= MSIZE )
-  // {
-  //   nlevels++;
-  //   mins /= 2;
-  // }
-  // // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
-  // // The following lines solves a bug with images particularly small
-  // if (nlevels == 0) nlevels = 1;
-  const int nlevels = 7; // RT -- see above
-
-  Array2Df** pyramids = new Array2Df*[nlevels];
-  createGaussianPyramids(H, pyramids, nlevels);
-  // ph.setValue(8);
-
-  // calculate gradients and its average values on pyramid levels
-  Array2Df** gradients = new Array2Df*[nlevels];
-  float* avgGrad = new float[nlevels];
-  for ( int k=0 ; k<nlevels ; k++ )
-  {
-    gradients[k] = new Array2Df(pyramids[k]->getCols(), pyramids[k]->getRows());
-    avgGrad[k] = calculateGradients(pyramids[k],gradients[k], k);
-    delete pyramids[k];
-  }
-  delete[] pyramids;
-  // ph.setValue(12);
-
-  // calculate fi matrix
-  Array2Df* FI = new Array2Df(width, height);
-  calculateFiMatrix(FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
-//  dumpPFS( "FI.pfs", FI, "Y" );
-  for ( int i=0 ; i<nlevels ; i++ )
-  {
-    delete gradients[i];
-  }
-  delete[] gradients;
-  delete[] avgGrad;
-  // ph.setValue(16);
-  // if (ph.canceled()){
-  //   delete FI;
-  //   delete H;
-  //   return;
-  // }
-
-  /** - RT - bring back the FI image to the input size if it was downscaled */
-  if (fullH) {
-      Array2Df *FI2 = new Array2Df(fullwidth, fullheight);
-      rescale_bilinear(*FI, *FI2, multithread);
-      delete FI;
-      FI = FI2;
-      width = fullwidth;
-      height = fullheight;
-      delete H;
-      H = fullH;
-  }
-  /** RT */
-
-  // attenuate gradients
-  Array2Df* Gx = new Array2Df(width, height);
-  Array2Df* Gy = new Array2Df(width, height);
-
-  // the fft solver solves the Poisson pde but with slightly different
-  // boundary conditions, so we need to adjust the assembly of the right hand
-  // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
-  // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
-  // if (fftsolver)
-#pragma omp parallel for
-    for ( size_t y=0 ; y<height ; y++ ) {
-      // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
-      unsigned int yp1 = (y+1 >= height ? height-2 : y+1);
-      for ( size_t x=0 ; x<width ; x++ )
-      {
-        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
-        unsigned int xp1 = (x+1 >= width ?  width-2  : x+1);
-        // forward differences in H, so need to use between-points approx of FI
-        (*Gx)(x,y) = ((*H)(xp1,y)-(*H)(x,y)) * 0.5 * ((*FI)(xp1,y)+(*FI)(x,y));
-        (*Gy)(x,y) = ((*H)(x,yp1)-(*H)(x,y)) * 0.5 * ((*FI)(x,yp1)+(*FI)(x,y));
-      }
+    for ( int i = 0 ; i < size ; i++ ) {
+        maxLum = std::max (maxLum, Y (i));
     }
-  delete H;
 
-  // calculate divergence
-#pragma omp parallel for
-  for ( size_t y = 0; y < height; ++y )
-  {
-      for ( size_t x = 0; x < width; ++x )
-      {
-          (*FI)(x,y) = (*Gx)(x,y) + (*Gy)(x,y);
-          if ( x > 0 ) (*FI)(x,y) -= (*Gx)(x-1,y);
-          if ( y > 0 ) (*FI)(x,y) -= (*Gy)(x,y-1);
+    Array2Df* H = new Array2Df (width, height);
+    float temp = 100.f / maxLum;
+    float eps = 1e-4f;
+    #pragma omp parallel
+    {
+#ifdef __SSE2__
+        vfloat epsv = F2V (eps);
+        vfloat tempv = F2V (temp);
+#endif
+        #pragma omp for schedule(dynamic,16)
 
-          // if (fftsolver)
-          {
-              if (x==0) (*FI)(x,y) += (*Gx)(x,y);
-              if (y==0) (*FI)(x,y) += (*Gy)(x,y);
-          }
+        for (size_t i = 0 ; i < height ; ++i) {
+            size_t j = 0;
+#ifdef __SSE2__
 
-      }
-  }
-  //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
-  delete Gy;
+            for (; j < width - 3; j += 4) {
+                STVFU ((*H)[i][j], xlogf (tempv * LVFU (Y[i][j]) + epsv));
+            }
 
-  // solve pde and exponentiate (ie recover compressed image)
-  {
-  // if (fftsolver)
-  {
-      MyMutex::MyLock lock(*fftwMutex);
-      solve_pde_fft(FI, &L, Gx, multithread);//, ph);
-  }
-  delete Gx;
-  delete FI;
-  // else
-  // {
-  //     solve_pde_multigrid(&DivG, &U, ph);
-  // }
+#endif
+
+            for (; j < width; ++j) {
+                (*H)[i][j] = xlogf (temp * Y[i][j] + eps);
+            }
+        }
+    }
+
+    /** RT - this is also here to reduce the dependency of the results on the
+     * input image size, with the primary aim of having a preview in RT that is
+     * reasonably close to the actual output image. Intuitively, what we do is
+     * to put a cap on the dimension of the image processed, so that it is close
+     * in size to the typical preview that you will see on a normal consumer
+     * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
+     * However, we can't simply downscale the input Y array and then upscale it
+     * on output, because that would cause a big loss of sharpness (confirmed by
+     * testing).
+     * So, we use a different method: we downscale the H array, so that we
+     * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
+     * we upscale the FI matrix later on, before it gets combined with the
+     * original input luminance array H. This seems to preserve the input
+     * sharpness and at the same time significantly reduce the dependency of the
+     * result on the input size. Clearly this is a hack, and keep in mind that I
+     * do not really know how Fattal works (it comes from LuminanceHDR almost
+     * verbatim), so this should probably be revised/reviewed by someone who
+     * knows better... also, we use a quite naive bilinear interpolation
+     * algorithm (see rescale_bilinear below), which could definitely be
+     * improved */
+    int fullwidth = width;
+    int fullheight = height;
+    int dim = std::max (width, height);
+    Array2Df *fullH = nullptr;
+
+    if (dim > RT_dimension_cap) {
+        float s = float (RT_dimension_cap) / float (dim);
+        Array2Df *HH = new Array2Df (width * s, height * s);
+        rescale_bilinear (*H, *HH, multithread);
+        fullH = H;
+        H = HH;
+        width = H->getCols();
+        height = H->getRows();
+    }
+
+    /** RT */
+
+    // create gaussian pyramids
+    // int mins = (width<height) ? width : height;    // smaller dimension
+    // int nlevels = 0;
+    // while ( mins >= MSIZE )
+    // {
+    //   nlevels++;
+    //   mins /= 2;
+    // }
+    // // std::cout << "DEBUG: nlevels = " << nlevels << ", mins = " << mins << std::endl;
+    // // The following lines solves a bug with images particularly small
+    // if (nlevels == 0) nlevels = 1;
+    const int nlevels = 7; // RT -- see above
+
+    Array2Df** pyramids = new Array2Df*[nlevels];
+    createGaussianPyramids (H, pyramids, nlevels);
+    // ph.setValue(8);
+
+    // calculate gradients and its average values on pyramid levels
+    Array2Df** gradients = new Array2Df*[nlevels];
+    float* avgGrad = new float[nlevels];
+
+    for ( int k = 0 ; k < nlevels ; k++ ) {
+        gradients[k] = new Array2Df (pyramids[k]->getCols(), pyramids[k]->getRows());
+        avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k);
+        delete pyramids[k];
+    }
+
+    delete[] pyramids;
+    // ph.setValue(12);
+
+    // calculate fi matrix
+    Array2Df* FI = new Array2Df (width, height);
+    calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
+
+//  dumpPFS( "FI.pfs", FI, "Y" );
+    for ( int i = 0 ; i < nlevels ; i++ ) {
+        delete gradients[i];
+    }
+
+    delete[] gradients;
+    delete[] avgGrad;
+    // ph.setValue(16);
+    // if (ph.canceled()){
+    //   delete FI;
+    //   delete H;
+    //   return;
+    // }
+
+    /** - RT - bring back the FI image to the input size if it was downscaled */
+    if (fullH) {
+        Array2Df *FI2 = new Array2Df (fullwidth, fullheight);
+        rescale_bilinear (*FI, *FI2, multithread);
+        delete FI;
+        FI = FI2;
+        width = fullwidth;
+        height = fullheight;
+        delete H;
+        H = fullH;
+    }
+
+    /** RT */
+
+    // attenuate gradients
+    Array2Df* Gx = new Array2Df (width, height);
+    Array2Df* Gy = new Array2Df (width, height);
+
+    // the fft solver solves the Poisson pde but with slightly different
+    // boundary conditions, so we need to adjust the assembly of the right hand
+    // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
+    // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
+    // if (fftsolver)
+    #pragma omp parallel for
+
+    for ( size_t y = 0 ; y < height ; y++ ) {
+        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+        unsigned int yp1 = (y + 1 >= height ? height - 2 : y + 1);
+
+        for ( size_t x = 0 ; x < width ; x++ ) {
+            // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
+            unsigned int xp1 = (x + 1 >= width ?  width - 2  : x + 1);
+            // forward differences in H, so need to use between-points approx of FI
+            (*Gx) (x, y) = ((*H) (xp1, y) - (*H) (x, y)) * 0.5 * ((*FI) (xp1, y) + (*FI) (x, y));
+            (*Gy) (x, y) = ((*H) (x, yp1) - (*H) (x, y)) * 0.5 * ((*FI) (x, yp1) + (*FI) (x, y));
+        }
+    }
+
+    delete H;
+
+    // calculate divergence
+    #pragma omp parallel for
+
+    for ( size_t y = 0; y < height; ++y ) {
+        for ( size_t x = 0; x < width; ++x ) {
+            (*FI) (x, y) = (*Gx) (x, y) + (*Gy) (x, y);
+
+            if ( x > 0 ) {
+                (*FI) (x, y) -= (*Gx) (x - 1, y);
+            }
+
+            if ( y > 0 ) {
+                (*FI) (x, y) -= (*Gy) (x, y - 1);
+            }
+
+            // if (fftsolver)
+            {
+                if (x == 0) {
+                    (*FI) (x, y) += (*Gx) (x, y);
+                }
+
+                if (y == 0) {
+                    (*FI) (x, y) += (*Gy) (x, y);
+                }
+            }
+
+        }
+    }
+
+    //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
+    delete Gy;
+
+    // solve pde and exponentiate (ie recover compressed image)
+    {
+        // if (fftsolver)
+        {
+            MyMutex::MyLock lock (*fftwMutex);
+            solve_pde_fft (FI, &L, Gx, multithread); //, ph);
+        }
+        delete Gx;
+        delete FI;
+        // else
+        // {
+        //     solve_pde_multigrid(&DivG, &U, ph);
+        // }
 // #ifndef NDEBUG
 //   printf("\npde residual error: %f\n", residual_pde(&U, &DivG));
 // #endif
-  // ph.setValue(90);
-  // if ( ph.canceled() )
-  // {
-  //     return;
-  // }
-  #pragma omp parallel
-  {
+        // ph.setValue(90);
+        // if ( ph.canceled() )
+        // {
+        //     return;
+        // }
+        #pragma omp parallel
+        {
 #ifdef __SSE2__
-  vfloat gammav = F2V(gamma);
+            vfloat gammav = F2V (gamma);
 #endif
-  #pragma omp for schedule(dynamic,16)
-  for (size_t i=0 ; i<height ; i++) {
-      size_t j = 0;
+            #pragma omp for schedule(dynamic,16)
+
+            for (size_t i = 0 ; i < height ; i++) {
+                size_t j = 0;
 #ifdef __SSE2__
-      for(; j < width - 3; j+=4) {
-          STVFU(L[i][j], xexpf(gammav * LVFU(L[i][j])));
-      }
+
+                for (; j < width - 3; j += 4) {
+                    STVFU (L[i][j], xexpf (gammav * LVFU (L[i][j])));
+                }
+
 #endif
-      for(; j < width; j++) {
-          L[i][j] = xexpf( gamma * L[i][j]);
-      }
-  }
-  }
-  }
-  // ph.setValue(95);
 
-  // remove percentile of min and max values and renormalize
-  float cut_min = 0.01f * black_point;
-  float cut_max = 1.0f - 0.01f * white_point;
-  assert(cut_min>=0.0f && (cut_max<=1.0f) && (cut_min<cut_max));
-  findMaxMinPercentile(L, cut_min, minLum, cut_max, maxLum);
-  float dividor = (maxLum - minLum);
-
-#pragma omp parallel for
-  for (size_t i = 0; i < height; ++i) {
-    for (size_t j = 0; j < width; ++j) {
-        L[i][j] = std::max((L[i][j] - minLum) / dividor, 0.f);
-        // note, we intentionally do not cut off values > 1.0
+                for (; j < width; j++) {
+                    L[i][j] = xexpf ( gamma * L[i][j]);
+                }
+            }
+        }
+    }
+    // ph.setValue(95);
+
+    // remove percentile of min and max values and renormalize
+    float cut_min = 0.01f * black_point;
+    float cut_max = 1.0f - 0.01f * white_point;
+    assert (cut_min >= 0.0f && (cut_max <= 1.0f) && (cut_min < cut_max));
+    findMaxMinPercentile (L, cut_min, minLum, cut_max, maxLum);
+    float dividor = (maxLum - minLum);
+
+    #pragma omp parallel for
+
+    for (size_t i = 0; i < height; ++i) {
+        for (size_t j = 0; j < width; ++j) {
+            L[i][j] = std::max ((L[i][j] - minLum) / dividor, 0.f);
+            // note, we intentionally do not cut off values > 1.0
+        }
     }
-  }
 }
 
 
@@ -836,91 +869,94 @@ void tmo_fattal02(size_t width,
 
 // returns T = EVy A EVx^tr
 // note, modifies input data
-void transform_ev2normal(Array2Df *A, Array2Df *T)
+void transform_ev2normal (Array2Df *A, Array2Df *T)
 {
-  int width = A->getCols();
-  int height = A->getRows();
-  assert((int)T->getCols()==width && (int)T->getRows()==height);
+    int width = A->getCols();
+    int height = A->getRows();
+    assert ((int)T->getCols() == width && (int)T->getRows() == height);
 
-  // the discrete cosine transform is not exactly the transform needed
-  // need to scale input values to get the right transformation
-  #pragma omp parallel for
-  for(int y=1 ; y<height-1 ; y++ )
-    for(int x=1 ; x<width-1 ; x++ )
-      (*A)(x,y)*=0.25f;
+    // the discrete cosine transform is not exactly the transform needed
+    // need to scale input values to get the right transformation
+    #pragma omp parallel for
 
-  for(int x=1 ; x<width-1 ; x++ )
-  {
-    (*A)(x,0)*=0.5f;
-    (*A)(x,height-1)*=0.5f;
-  }
-  for(int y=1 ; y<height-1 ; y++ )
-  {
-    (*A)(0,y)*=0.5;
-    (*A)(width-1,y)*=0.5f;
-  }
+    for (int y = 1 ; y < height - 1 ; y++ )
+        for (int x = 1 ; x < width - 1 ; x++ ) {
+            (*A) (x, y) *= 0.25f;
+        }
 
-  // note, fftw provides its own memory allocation routines which
-  // ensure that memory is properly 16/32 byte aligned so it can
-  // use SSE/AVX operations (2/4 double ops in parallel), if our
-  // data is not properly aligned fftw won't use SSE/AVX
-  // (I believe new() aligns memory to 16 byte so avoid overhead here)
-  //
-  // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
-  // fftwf_free(in);
+    for (int x = 1 ; x < width - 1 ; x++ ) {
+        (*A) (x, 0) *= 0.5f;
+        (*A) (x, height - 1) *= 0.5f;
+    }
 
-  // executes 2d discrete cosine transform
-  fftwf_plan p;
-  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
-                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
-  fftwf_execute(p);
-  fftwf_destroy_plan(p);
+    for (int y = 1 ; y < height - 1 ; y++ ) {
+        (*A) (0, y) *= 0.5;
+        (*A) (width - 1, y) *= 0.5f;
+    }
+
+    // note, fftw provides its own memory allocation routines which
+    // ensure that memory is properly 16/32 byte aligned so it can
+    // use SSE/AVX operations (2/4 double ops in parallel), if our
+    // data is not properly aligned fftw won't use SSE/AVX
+    // (I believe new() aligns memory to 16 byte so avoid overhead here)
+    //
+    // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
+    // fftwf_free(in);
+
+    // executes 2d discrete cosine transform
+    fftwf_plan p;
+    p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
+                           FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+    fftwf_execute (p);
+    fftwf_destroy_plan (p);
 }
 
 
 // returns T = EVy^-1 * A * (EVx^-1)^tr
-void transform_normal2ev(Array2Df *A, Array2Df *T)
+void transform_normal2ev (Array2Df *A, Array2Df *T)
 {
-  int width = A->getCols();
-  int height = A->getRows();
-  assert((int)T->getCols()==width && (int)T->getRows()==height);
+    int width = A->getCols();
+    int height = A->getRows();
+    assert ((int)T->getCols() == width && (int)T->getRows() == height);
 
-  // executes 2d discrete cosine transform
-  fftwf_plan p;
-  p=fftwf_plan_r2r_2d(height, width, A->data(), T->data(),
-                        FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
-  fftwf_execute(p);
-  fftwf_destroy_plan(p);
+    // executes 2d discrete cosine transform
+    fftwf_plan p;
+    p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
+                           FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
+    fftwf_execute (p);
+    fftwf_destroy_plan (p);
 
-  // need to scale the output matrix to get the right transform
-  float factor = (1.0f/((height-1)*(width-1)));
-#pragma omp parallel for
-  for(int y=0 ; y<height ; y++ )
-    for(int x=0 ; x<width ; x++ )
-      (*T)(x,y)*= factor;
-  for(int x=0 ; x<width ; x++ )
-  {
-    (*T)(x,0)*=0.5f;
-    (*T)(x,height-1)*=0.5f;
-  }
-  for(int y=0 ; y<height ; y++ )
-  {
-    (*T)(0,y)*=0.5f;
-    (*T)(width-1,y)*=0.5f;
-  }
+    // need to scale the output matrix to get the right transform
+    float factor = (1.0f / ((height - 1) * (width - 1)));
+    #pragma omp parallel for
+
+    for (int y = 0 ; y < height ; y++ )
+        for (int x = 0 ; x < width ; x++ ) {
+            (*T) (x, y) *= factor;
+        }
+
+    for (int x = 0 ; x < width ; x++ ) {
+        (*T) (x, 0) *= 0.5f;
+        (*T) (x, height - 1) *= 0.5f;
+    }
+
+    for (int y = 0 ; y < height ; y++ ) {
+        (*T) (0, y) *= 0.5f;
+        (*T) (width - 1, y) *= 0.5f;
+    }
 }
 
 // returns the eigenvalues of the 1d laplace operator
-std::vector<double> get_lambda(int n)
+std::vector<double> get_lambda (int n)
 {
-  assert(n>1);
-  std::vector<double> v(n);
-  for (int i=0; i<n; i++)
-  {
-    v[i]=-4.0*SQR(sin((double)i/(2*(n-1))*RT_PI));
-  }
+    assert (n > 1);
+    std::vector<double> v (n);
 
-  return v;
+    for (int i = 0; i < n; i++) {
+        v[i] = -4.0 * SQR (sin ((double)i / (2 * (n - 1)) * RT_PI));
+    }
+
+    return v;
 }
 
 // // makes boundary conditions compatible so that a solution exists
@@ -967,99 +1003,105 @@ std::vector<double> get_lambda(int n)
 // not modified and the equation might not have a solution but an
 // approximate solution with a minimum error is then calculated
 // double precision version
-void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
+void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
                                               bool adjust_bound)*/
 {
-   // ph.setValue(20);
-  //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
-  int width = F->getCols();
-  int height = F->getRows();
-  assert((int)U->getCols()==width && (int)U->getRows()==height);
-  assert(buf->getCols()==width && buf->getRows()==height);
+    // ph.setValue(20);
+    //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
+    int width = F->getCols();
+    int height = F->getRows();
+    assert ((int)U->getCols() == width && (int)U->getRows() == height);
+    assert (buf->getCols() == width && buf->getRows() == height);
 
-  // activate parallel execution of fft routines
+    // activate parallel execution of fft routines
 #ifdef RT_FFTW3F_OMP
-  if (multithread) {
-      fftwf_init_threads();
-      fftwf_plan_with_nthreads( omp_get_max_threads() );
-  }
+
+    if (multithread) {
+        fftwf_init_threads();
+        fftwf_plan_with_nthreads ( omp_get_max_threads() );
+    }
+
 // #else
 //   fftwf_plan_with_nthreads( 2 );
 #endif
 
-  // in general there might not be a solution to the Poisson pde
-  // with Neumann boundary conditions unless the boundary satisfies
-  // an integral condition, this function modifies the boundary so that
-  // the condition is exactly satisfied
-  // if(adjust_bound)
-  // {
-  //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
-  //   make_compatible_boundary(F);
-  // }
+    // in general there might not be a solution to the Poisson pde
+    // with Neumann boundary conditions unless the boundary satisfies
+    // an integral condition, this function modifies the boundary so that
+    // the condition is exactly satisfied
+    // if(adjust_bound)
+    // {
+    //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
+    //   make_compatible_boundary(F);
+    // }
 
-  // transforms F into eigenvector space: Ftr =
-  //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
-  Array2Df* F_tr = buf; //new Array2Df(width,height);
-  transform_normal2ev(F, F_tr);
-  // TODO: F no longer needed so could release memory, but as it is an
-  // input parameter we won't do that
-  // ph.setValue(50);
-  // if (ph.canceled())
-  // {
-  //   delete F_tr;
-  //   return;
-  // }
+    // transforms F into eigenvector space: Ftr =
+    //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
+    Array2Df* F_tr = buf; //new Array2Df(width,height);
+    transform_normal2ev (F, F_tr);
+    // TODO: F no longer needed so could release memory, but as it is an
+    // input parameter we won't do that
+    // ph.setValue(50);
+    // if (ph.canceled())
+    // {
+    //   delete F_tr;
+    //   return;
+    // }
 
-  //DEBUG_STR << "solve_pde_fft: F_tr(0,0) = " << (*F_tr)(0,0);
-  //DEBUG_STR << " (must be 0 for solution to exist)" << std::endl;
+    //DEBUG_STR << "solve_pde_fft: F_tr(0,0) = " << (*F_tr)(0,0);
+    //DEBUG_STR << " (must be 0 for solution to exist)" << std::endl;
 
-  // in the eigenvector space the solution is very simple
-  //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
+    // in the eigenvector space the solution is very simple
+    //DEBUG_STR << "solve_pde_fft: solve in eigenvector space" << std::endl;
 //  Array2Df* U_tr = new Array2Df(width,height);
-  std::vector<double> l1=get_lambda(height);
-  std::vector<double> l2=get_lambda(width);
+    std::vector<double> l1 = get_lambda (height);
+    std::vector<double> l2 = get_lambda (width);
 
-#pragma omp parallel for
-  for(int y=0 ; y<height ; y++ )
-  {
-    for(int x=0 ; x<width ; x++ )
-    {
-        (*F_tr)(x,y)=(*F_tr)(x,y)/(l1[y]+l2[x]);
+    #pragma omp parallel for
+
+    for (int y = 0 ; y < height ; y++ ) {
+        for (int x = 0 ; x < width ; x++ ) {
+            (*F_tr) (x, y) = (*F_tr) (x, y) / (l1[y] + l2[x]);
+        }
     }
-  }
-  (*F_tr)(0,0)=0.f; // any value ok, only adds a const to the solution
 
-  // transforms U_tr back to the normal space
-  //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
-  transform_ev2normal(F_tr, U);
+    (*F_tr) (0, 0) = 0.f; // any value ok, only adds a const to the solution
+
+    // transforms U_tr back to the normal space
+    //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
+    transform_ev2normal (F_tr, U);
 //  delete F_tr;    // no longer needed so release memory
 
-  // the solution U as calculated will satisfy something like int U = 0
-  // since for any constant c, U-c is also a solution and we are mainly
-  // working in the logspace of (0,1) data we prefer to have
-  // a solution which has no positive values: U_new(x,y)=U(x,y)-max
-  // (not really needed but good for numerics as we later take exp(U))
-  //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
-  float max=0.f;
-  #pragma omp parallel for reduction(max:max)
-  for(int i=0; i<width*height; i++) {
-    max = std::max(max, (*U)(i));
-  }
+    // the solution U as calculated will satisfy something like int U = 0
+    // since for any constant c, U-c is also a solution and we are mainly
+    // working in the logspace of (0,1) data we prefer to have
+    // a solution which has no positive values: U_new(x,y)=U(x,y)-max
+    // (not really needed but good for numerics as we later take exp(U))
+    //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
+    float max = 0.f;
+    #pragma omp parallel for reduction(max:max)
 
-  #pragma omp parallel for
-  for(int i=0; i<width*height; i++) {
-    (*U)(i)-=max;
-  }
+    for (int i = 0; i < width * height; i++) {
+        max = std::max (max, (*U) (i));
+    }
 
-  // fft parallel threads cleanup, better handled outside this function?
+    #pragma omp parallel for
+
+    for (int i = 0; i < width * height; i++) {
+        (*U) (i) -= max;
+    }
+
+    // fft parallel threads cleanup, better handled outside this function?
 #ifdef RT_FFTW3F_OMP
-  if (multithread) {
-      fftwf_cleanup_threads();
-  }
+
+    if (multithread) {
+        fftwf_cleanup_threads();
+    }
+
 #endif
 
-  // ph.setValue(90);
-  //DEBUG_STR << "solve_pde_fft: done" << std::endl;
+    // ph.setValue(90);
+    //DEBUG_STR << "solve_pde_fft: done" << std::endl;
 }
 
 
@@ -1092,21 +1134,21 @@ void solve_pde_fft(Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*,
  * RT code from here on
  *****************************************************************************/
 
-inline float get_bilinear_value(const Array2Df &src, float x, float y)
+inline float get_bilinear_value (const Array2Df &src, float x, float y)
 {
     // Get integer and fractional parts of numbers
     int xi = x;
     int yi = y;
     float xf = x - xi;
     float yf = y - yi;
-    int xi1 = std::min(xi+1, src.getCols()-1);
-    int yi1 = std::min(yi+1, src.getRows()-1);
- 
-    float bl = src(xi, yi);
-    float br = src(xi1, yi);
-    float tl = src(xi, yi1);
-    float tr = src(xi1, yi1);
- 
+    int xi1 = std::min (xi + 1, src.getCols() - 1);
+    int yi1 = std::min (yi + 1, src.getRows() - 1);
+
+    float bl = src (xi, yi);
+    float br = src (xi1, yi);
+    float tl = src (xi, yi1);
+    float tr = src (xi1, yi1);
+
     // interpolate
     float b = xf * br + (1.f - xf) * bl;
     float t = xf * tr + (1.f - xf) * tl;
@@ -1115,23 +1157,25 @@ inline float get_bilinear_value(const Array2Df &src, float x, float y)
 }
 
 
-void rescale_bilinear(const Array2Df &src, Array2Df &dst, bool multithread)
+void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread)
 {
-    float col_scale = float(src.getCols())/float(dst.getCols());
-    float row_scale = float(src.getRows())/float(dst.getRows());
+    float col_scale = float (src.getCols()) / float (dst.getCols());
+    float row_scale = float (src.getRows()) / float (dst.getRows());
 
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
+
     for (int y = 0; y < dst.getRows(); ++y) {
         float ymrs = y * row_scale;
+
         for (int x = 0; x < dst.getCols(); ++x) {
-            dst(x, y) = get_bilinear_value(src, x * col_scale, ymrs);
+            dst (x, y) = get_bilinear_value (src, x * col_scale, ymrs);
         }
     }
 }
 
-void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
+void rescale_nearest (const Array2Df &src, Array2Df &dst, bool multithread)
 {
     const int width = src.getCols();
     const int height = src.getRows();
@@ -1141,26 +1185,28 @@ void rescale_nearest(const Array2Df &src, Array2Df &dst, bool multithread)
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
+
     for (int y = 0; y < nh; ++y) {
         int sy = y * height / nh;
+
         for (int x = 0; x < nw; ++x) {
             int sx = x * width / nw;
-            dst(x, y) = src(sx, sy);
+            dst (x, y) = src (sx, sy);
         }
     }
 }
 
 
-inline float luminance(float r, float g, float b, TMatrix ws)
+inline float luminance (float r, float g, float b, TMatrix ws)
 {
     return r * ws[1][0] + g * ws[1][1] + b * ws[1][2];
 }
 
 
-inline int round_up_pow2(int dim)
+inline int round_up_pow2 (int dim)
 {
     // from https://graphics.stanford.edu/~seander/bithacks.html
-    assert(dim > 0);
+    assert (dim > 0);
     unsigned int v = dim;
     v--;
     v |= v >> 1;
@@ -1172,7 +1218,7 @@ inline int round_up_pow2(int dim)
     return v;
 }
 
-inline int find_fast_dim(int dim)
+inline int find_fast_dim (int dim)
 {
     // as per the FFTW docs:
     //
@@ -1183,39 +1229,42 @@ inline int find_fast_dim(int dim)
     // the above form. This is not exhaustive, but should be ok for pictures
     // up to 100MPix at least
 
-    int d1 = round_up_pow2(dim);
+    int d1 = round_up_pow2 (dim);
     std::vector<int> d = {
-        d1/128 * 65,
-        d1/64 * 33,
-        d1/512 * 273,
-        d1/16 * 9,
-        d1/8 * 5,
-        d1/16 * 11,
-        d1/128 * 91,
-        d1/4 * 3,
-        d1/64 * 49,
-        d1/16 * 13,
-        d1/8 * 7,
+        d1 / 128 * 65,
+        d1 / 64 * 33,
+        d1 / 512 * 273,
+        d1 / 16 * 9,
+        d1 / 8 * 5,
+        d1 / 16 * 11,
+        d1 / 128 * 91,
+        d1 / 4 * 3,
+        d1 / 64 * 49,
+        d1 / 16 * 13,
+        d1 / 8 * 7,
         d1
     };
+
     for (size_t i = 0; i < d.size(); ++i) {
         if (d[i] >= dim) {
             return d[i];
         }
     }
-    assert(false);
+
+    assert (false);
     return dim;
 }
 
 } // namespace
 
 
-void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
+void ImProcFunctions::ToneMapFattal02 (Imagefloat *rgb)
 {
     BENCHFUN
     const int detail_level = 3;
 
     float alpha = 1.f;
+
     if (params->fattal.threshold < 0) {
         alpha += (params->fattal.threshold * 0.9f) / 100.f;
     } else if (params->fattal.threshold > 0) {
@@ -1223,7 +1272,7 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
     }
 
     float beta = 1.f - (params->fattal.amount * 0.3f) / 100.f;
-    
+
     // sanity check
     if (alpha <= 0 || beta <= 0) {
         return;
@@ -1231,35 +1280,38 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
 
     int w = rgb->getWidth();
     int h = rgb->getHeight();
-    
-    Array2Df Yr(w, h);
+
+    Array2Df Yr (w, h);
 
     const float epsilon = 1e-4f;
     const float luminance_noise_floor = 65.535f;
     const float min_luminance = 1.f;
-    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.working);
+    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix (params->icm.working);
 
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
+
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
-            Yr(x, y) = std::max(luminance(rgb->r(y, x), rgb->g(y, x), rgb->b(y, x), ws), min_luminance); // clip really black pixels
+            Yr (x, y) = std::max (luminance (rgb->r (y, x), rgb->g (y, x), rgb->b (y, x), ws), min_luminance); // clip really black pixels
         }
     }
+
     // median filter on the deep shadows, to avoid boosting noise
     // because w2 >= w and h2 >= h, we can use the L buffer as temporary buffer for Median_Denoise()
-    int w2 = find_fast_dim(w) + 1;
-    int h2 = find_fast_dim(h) + 1;
-    Array2Df L(w2, h2);
+    int w2 = find_fast_dim (w) + 1;
+    int h2 = find_fast_dim (h) + 1;
+    Array2Df L (w2, h2);
     {
 #ifdef _OPENMP
         int num_threads = multiThread ? omp_get_max_threads() : 1;
 #else
         int num_threads = 1;
 #endif
-        float r = float(std::max(w, h)) / float(RT_dimension_cap);
+        float r = float (std::max (w, h)) / float (RT_dimension_cap);
         Median med;
+
         if (r >= 3) {
             med = Median::TYPE_7X7;
         } else if (r >= 2) {
@@ -1269,7 +1321,8 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
         } else {
             med = Median::TYPE_3X3_STRONG;
         }
-        Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
+
+        Median_Denoise (Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
     }
 
     float noise = alpha * 0.01f;
@@ -1279,27 +1332,29 @@ void ImProcFunctions::ToneMapFattal02(Imagefloat *rgb)
                   << ", detail_level = " << detail_level << std::endl;
     }
 
-    rescale_nearest(Yr, L, multiThread);
-    tmo_fattal02(w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
+    rescale_nearest (Yr, L, multiThread);
+    tmo_fattal02 (w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
 
 //    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
 
 #ifdef _OPENMP
     #pragma omp parallel for if(multiThread)
 #endif
+
     for (int y = 0; y < h; y++) {
         int yy = y * h2 / h;
+
         for (int x = 0; x < w; x++) {
             int xx = x * w2 / w;
-            float Y = Yr(x, y);
-            float l = std::max(L(xx, yy), epsilon) * (65535.f / Y);
-            rgb->r(y, x) = std::max(rgb->r(y, x), 0.f) * l;
-            rgb->g(y, x) = std::max(rgb->g(y, x), 0.f) * l;
-            rgb->b(y, x) = std::max(rgb->b(y, x), 0.f) * l;
-            
-            assert(std::isfinite(rgb->r(y, x)));
-            assert(std::isfinite(rgb->g(y, x)));
-            assert(std::isfinite(rgb->b(y, x)));
+            float Y = Yr (x, y);
+            float l = std::max (L (xx, yy), epsilon) * (65535.f / Y);
+            rgb->r (y, x) = std::max (rgb->r (y, x), 0.f) * l;
+            rgb->g (y, x) = std::max (rgb->g (y, x), 0.f) * l;
+            rgb->b (y, x) = std::max (rgb->b (y, x), 0.f) * l;
+
+            assert (std::isfinite (rgb->r (y, x)));
+            assert (std::isfinite (rgb->g (y, x)));
+            assert (std::isfinite (rgb->b (y, x)));
         }
     }
 }