Moved findMinMaxPercentile() to rt_algo.*, use bool multiThread in fattal tonemapper, fixes #4195

2017-11-25 13:56:42 +01:00
parent dc248860a1
commit 0950963f84
4 changed files with 230 additions and 122 deletions
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@@ -106,6 +106,7 @@ set(RTENGINESOURCEFILES
    rawimage.cc
    rawimagesource.cc
    refreshmap.cc
+    rt_algo.cc
    rtthumbnail.cc
    shmap.cc
    simpleprocess.cc
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -0,0 +1,163 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2017 Ingo Weyrich <heckflosse67@gmx.de>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cstddef>
+#include <cmath>
+#include <cassert>
+#include <algorithm>
+#include <vector>
+#include <cstdint>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace rtengine
+{
+void findMinMaxPercentile (const float *data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multithread)
+{
+    // we need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in data
+    // We use a histogram based search for speed and to reduce memory usage
+    // memory usage of this method is histoSize * sizeof(uint32_t) * (t + 1) byte,
+    // where t is the number of threads and histoSize is in [1;65536]
+    // The current implementation is not guaranteed to work correctly if size > 2^32 (4294967296)
+    assert (minPrct <= maxPrct);
+
+    if(size == 0) {
+        return;
+    }
+
+    size_t numThreads = 1;
+#ifdef _OPENMP
+    // Because we have an overhead in the critical region of the main loop for each thread
+    // we make a rough calculation to reduce the number of threads for small data size
+    // This also works fine for the minmax loop
+    if(multithread) {
+        size_t maxThreads = omp_get_max_threads();
+        while (size > numThreads * numThreads * 16384 && numThreads < maxThreads) {
+            ++numThreads;
+        }
+    }
+#endif
+
+    // We need min and max value of data to calculate the scale factor for the histogram
+    float minVal = data[0];
+    float maxVal = data[0];
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(min:minVal) reduction(max:maxVal) num_threads(numThreads)
+#endif
+    for (size_t i = 1; i < size; ++i) {
+        minVal = std::min(minVal, data[i]);
+        maxVal = std::max(maxVal, data[i]);
+    }
+
+    if(std::fabs(maxVal - minVal) == 0.f) { // fast exit, also avoids division by zero in calculation of scale factor
+        minOut = maxOut = minVal;
+        return;
+    }
+
+    // caution: currently this works correctly only for histoSize in range[1;65536]
+    // for small data size (i.e. thumbnails) we reduce the size of the histogram to the size of data
+    const unsigned int histoSize = std::min(static_cast<size_t>(65536), size);
+
+    // calculate scale factor to use full range of histogram
+    const float scale = (histoSize - 1) / (maxVal - minVal);
+
+    // We need one main histogram
+    std::vector<uint32_t> histo(histoSize, 0);
+
+    if(numThreads == 1) {
+        // just one thread => use main histogram
+        for (size_t i = 0; i < size; ++i) {
+            // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
+            histo[ (uint16_t) (scale * (data[i] - minVal))]++;
+        }
+    } else {
+#ifdef _OPENMP
+    #pragma omp parallel num_threads(numThreads)
+#endif
+        {
+            // We need one histogram per thread
+            std::vector<uint32_t> histothr(histoSize, 0);
+
+#ifdef _OPENMP
+            #pragma omp for nowait
+#endif
+
+            for (size_t i = 0; i < size; ++i) {
+                // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
+                histothr[ (uint16_t) (scale * (data[i] - minVal))]++;
+            }
+
+#ifdef _OPENMP
+            #pragma omp critical
+#endif
+            {
+                // add per thread histogram to main histogram
+#ifdef _OPENMP
+                #pragma omp simd
+#endif
+
+                for(size_t i = 0; i < histoSize; ++i) {
+                    histo[i] += histothr[i];
+                }
+            }
+        }
+    }
+
+    size_t k = 0;
+    size_t count = 0;
+
+    // find (minPrct*size) smallest value
+    const float threshmin = minPrct * size;
+    while (count < threshmin) {
+        count += histo[k++];
+    }
+
+    if (k > 0) { // interpolate
+        size_t count_ = count - histo[k - 1];
+        float c0 = count - threshmin;
+        float c1 = threshmin - count_;
+        minOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
+    } else {
+        minOut = k;
+    }
+    // go back to original range
+    minOut /= scale;
+    minOut += minVal;
+
+    // find (maxPrct*size) smallest value
+    const float threshmax = maxPrct * size;
+    while (count < threshmax) {
+        count += histo[k++];
+    }
+
+    if (k > 0) { // interpolate
+        size_t count_ = count - histo[k - 1];
+        float c0 = count - threshmax;
+        float c1 = threshmax - count_;
+        maxOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
+    } else {
+        maxOut = k;
+    }
+    // go back to original range
+    maxOut /= scale;
+    maxOut += minVal;
+
+}
+}
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -0,0 +1,28 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2017 Ingo Weyrich <heckflosse67@gmx.de>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace rtengine
+{
+void findMinMaxPercentile (const float *data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true);
+
+}
--- a/rtengine/tmo_fattal02.cc
+++ b/rtengine/tmo_fattal02.cc
@@ -73,6 +73,8 @@
 #include "StopWatch.h"
 #include "sleef.c"
 #include "opthelper.h"
+#include "rt_algo.h"
+
 namespace rtengine
 {

@@ -167,7 +169,7 @@ void downSample (const Array2Df& A, Array2Df& B)
    }
 }

-void gaussianBlur (const Array2Df& I, Array2Df& L)
+void gaussianBlur (const Array2Df& I, Array2Df& L, bool multithread)
 {
    const int width = I.getCols();
    const int height = I.getRows();
@@ -185,7 +187,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L)
    Array2Df T (width, height);

    //--- X blur
-    #pragma omp parallel for shared(I, T)
+    #pragma omp parallel for shared(I, T) if(multithread)

    for ( int y = 0 ; y < height ; y++ ) {
        for ( int x = 1 ; x < width - 1 ; x++ ) {
@@ -200,7 +202,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L)
    }

    //--- Y blur
-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for ( int x = 0 ; x < width - 7 ; x += 8 ) {
        for ( int y = 1 ; y < height - 1 ; y++ ) {
@@ -231,7 +233,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L)
    }
 }

-void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)
+void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels, bool multithread)
 {
    int width = H->getCols();
    int height = H->getRows();
@@ -245,7 +247,7 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)
    }

    Array2Df* L = new Array2Df (width, height);
-    gaussianBlur ( *pyramids[0], *L );
+    gaussianBlur ( *pyramids[0], *L, multithread );

    for ( int k = 1 ; k < nlevels ; k++ ) {
        if (width > 2 && height > 2) {
@@ -267,7 +269,7 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)
        if (k < nlevels - 1) {
            delete L;
            L = new Array2Df (width, height);
-            gaussianBlur ( *pyramids[k], *L );
+            gaussianBlur ( *pyramids[k], *L, multithread );
        }
    }

@@ -276,14 +278,14 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels)

 //--------------------------------------------------------------------

-float calculateGradients (Array2Df* H, Array2Df* G, int k)
+float calculateGradients (Array2Df* H, Array2Df* G, int k, bool multithread)
 {
    const int width = H->getCols();
    const int height = H->getRows();
    const float divider = pow ( 2.0f, k + 1 );
    float avgGrad = 0.0f;

-    #pragma omp parallel for reduction(+:avgGrad)
+    #pragma omp parallel for reduction(+:avgGrad) if(multithread)

    for ( int y = 0 ; y < height ; y++ ) {
        int n = (y == 0 ? 0 : y - 1);
@@ -350,20 +352,17 @@ void upSample (const Array2Df& A, Array2Df& B)

 void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
                        float avgGrad[], int nlevels, int detail_level,
-                        float alfa, float beta, float noise)
+                        float alfa, float beta, float noise, bool multithread)
 {
-    const bool newfattal = true;
    int width = gradients[nlevels - 1]->getCols();
    int height = gradients[nlevels - 1]->getRows();
    Array2Df** fi = new Array2Df*[nlevels];

    fi[nlevels - 1] = new Array2Df (width, height);

-    if (newfattal) {
-        //#pragma omp parallel for shared(fi)
-        for ( int k = 0 ; k < width * height ; k++ ) {
-            (*fi[nlevels - 1]) (k) = 1.0f;
-        }
+    #pragma omp parallel for shared(fi) if(multithread)
+    for ( int k = 0 ; k < width * height ; k++ ) {
+        (*fi[nlevels - 1]) (k) = 1.0f;
    }

    for ( int k = nlevels - 1; k >= 0 ; k-- ) {
@@ -371,23 +370,16 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
        height = gradients[k]->getRows();

        // only apply gradients to levels>=detail_level but at least to the coarsest
-        if ( k >= detail_level
-                || k == nlevels - 1
-                || newfattal == false) {
+        if ((k >= detail_level || k == nlevels - 1) && beta != 1.f)  {
            //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
-            #pragma omp parallel for shared(fi,avgGrad)
+            #pragma omp parallel for shared(fi,avgGrad) if(multithread)
            for ( int y = 0; y < height; y++ ) {
                for ( int x = 0; x < width; x++ ) {
                    float grad = ((*gradients[k]) (x, y) < 1e-4f) ? 1e-4 : (*gradients[k]) (x, y);
                    float a = alfa * avgGrad[k];
-
                    float value = pow ((grad + noise) / a, beta - 1.0f);

-                    if (newfattal) {
-                        (*fi[k]) (x, y) *= value;
-                    } else {
-                        (*fi[k]) (x, y) = value;
-                    }
+                    (*fi[k]) (x, y) *= value;
                }
            }
        }
@@ -401,9 +393,9 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
            fi[0] = FI;    // highest level -> result
        }

-        if ( k > 0  && newfattal ) {
+        if (k > 0) {
            upSample (*fi[k], *fi[k - 1]);        // upsample to next level
-            gaussianBlur (*fi[k - 1], *fi[k - 1]);
+            gaussianBlur (*fi[k - 1], *fi[k - 1], multithread);
        }
    }

@@ -414,80 +406,6 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
    delete[] fi;
 }

-inline
-void findMaxMinPercentile (const Array2Df& I,
-                           float minPrct, float& minLum,
-                           float maxPrct, float& maxLum)
-{
-    assert (minPrct <= maxPrct);
-
-    const int size = I.getRows() * I.getCols();
-    const float* data = I.data();
-
-    // we need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in I
-    // We use a histogram based search for speed and to reduce memory usage
-    // memory usage of this method is 65536 * sizeof(float) * (t + 1) byte, where t is the number of threads
-
-    // We need one global histogram
-    LUTu histo (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
-    histo.clear();
-#ifdef _OPENMP
-    #pragma omp parallel
-#endif
-    {
-        // We need one histogram per thread
-        LUTu histothr (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
-        histothr.clear();
-
-#ifdef _OPENMP
-        #pragma omp for nowait
-#endif
-
-        for (int i = 0; i < size; ++i) {
-            // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index
-            histothr[ (unsigned int) (65535.f * data[i])]++;
-        }
-
-#ifdef _OPENMP
-        #pragma omp critical
-#endif
-        // add per thread histogram to global histogram
-        histo += histothr;
-    }
-
-    int k = 0;
-    int count = 0;
-
-    // find (minPrct*size) smallest value
-    while (count < minPrct * size) {
-        count += histo[k++];
-    }
-
-    if (k > 0) { // interpolate
-        int count_ = count - histo[k - 1];
-        float c0 = count - minPrct * size;
-        float c1 = minPrct * size - count_;
-        minLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
-    } else {
-        minLum = k / 65535.f;
-    }
-
-    // find (maxPrct*size) smallest value
-    while (count < maxPrct * size) {
-        count += histo[k++];
-    }
-
-    if (k > 0) { // interpolate
-        int count_ = count - histo[k - 1];
-        float c0 = count - maxPrct * size;
-        float c1 = maxPrct * size - count_;
-        maxLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f);
-    } else {
-        maxLum = k / 65535.f;
-    }
-
-}
-
 void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);

 void tmo_fattal02 (size_t width,
@@ -543,7 +461,7 @@ void tmo_fattal02 (size_t width,
    float minLum = Y (0, 0);
    float maxLum = Y (0, 0);

-    #pragma omp parallel for reduction(max:maxLum)
+    #pragma omp parallel for reduction(max:maxLum) if(multithread)

    for ( int i = 0 ; i < size ; i++ ) {
        maxLum = std::max (maxLum, Y (i));
@@ -552,7 +470,7 @@ void tmo_fattal02 (size_t width,
    Array2Df* H = new Array2Df (width, height);
    float temp = 100.f / maxLum;
    float eps = 1e-4f;
-    #pragma omp parallel
+    #pragma omp parallel if(multithread)
    {
 #ifdef __SSE2__
        vfloat epsv = F2V (eps);
@@ -627,7 +545,7 @@ void tmo_fattal02 (size_t width,
    const int nlevels = 7; // RT -- see above

    Array2Df** pyramids = new Array2Df*[nlevels];
-    createGaussianPyramids (H, pyramids, nlevels);
+    createGaussianPyramids (H, pyramids, nlevels, multithread);
    // ph.setValue(8);

    // calculate gradients and its average values on pyramid levels
@@ -636,7 +554,7 @@ void tmo_fattal02 (size_t width,

    for ( int k = 0 ; k < nlevels ; k++ ) {
        gradients[k] = new Array2Df (pyramids[k]->getCols(), pyramids[k]->getRows());
-        avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k);
+        avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k, multithread);
        delete pyramids[k];
    }

@@ -645,7 +563,7 @@ void tmo_fattal02 (size_t width,

    // calculate fi matrix
    Array2Df* FI = new Array2Df (width, height);
-    calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise);
+    calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise, multithread);

 //  dumpPFS( "FI.pfs", FI, "Y" );
    for ( int i = 0 ; i < nlevels ; i++ ) {
@@ -684,7 +602,7 @@ void tmo_fattal02 (size_t width,
    // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
    // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
    // if (fftsolver)
-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for ( size_t y = 0 ; y < height ; y++ ) {
        // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
@@ -702,7 +620,7 @@ void tmo_fattal02 (size_t width,
    delete H;

    // calculate divergence
-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for ( size_t y = 0; y < height; ++y ) {
        for ( size_t x = 0; x < width; ++x ) {
@@ -754,7 +672,7 @@ void tmo_fattal02 (size_t width,
        // {
        //     return;
        // }
-        #pragma omp parallel
+        #pragma omp parallel if(multithread)
        {
 #ifdef __SSE2__
            vfloat gammav = F2V (gamma);
@@ -783,10 +701,10 @@ void tmo_fattal02 (size_t width,
    float cut_min = 0.01f * black_point;
    float cut_max = 1.0f - 0.01f * white_point;
    assert (cut_min >= 0.0f && (cut_max <= 1.0f) && (cut_min < cut_max));
-    findMaxMinPercentile (L, cut_min, minLum, cut_max, maxLum);
+    findMinMaxPercentile (L.data(), L.getRows() * L.getCols(), cut_min, minLum, cut_max, maxLum, multithread);
    float dividor = (maxLum - minLum);

-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for (size_t i = 0; i < height; ++i) {
        for (size_t j = 0; j < width; ++j) {
@@ -869,7 +787,7 @@ void tmo_fattal02 (size_t width,

 // returns T = EVy A EVx^tr
 // note, modifies input data
-void transform_ev2normal (Array2Df *A, Array2Df *T)
+void transform_ev2normal (Array2Df *A, Array2Df *T, bool multithread)
 {
    int width = A->getCols();
    int height = A->getRows();
@@ -877,7 +795,7 @@ void transform_ev2normal (Array2Df *A, Array2Df *T)

    // the discrete cosine transform is not exactly the transform needed
    // need to scale input values to get the right transformation
-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for (int y = 1 ; y < height - 1 ; y++ )
        for (int x = 1 ; x < width - 1 ; x++ ) {
@@ -913,7 +831,7 @@ void transform_ev2normal (Array2Df *A, Array2Df *T)


 // returns T = EVy^-1 * A * (EVx^-1)^tr
-void transform_normal2ev (Array2Df *A, Array2Df *T)
+void transform_normal2ev (Array2Df *A, Array2Df *T, bool multithread)
 {
    int width = A->getCols();
    int height = A->getRows();
@@ -928,7 +846,7 @@ void transform_normal2ev (Array2Df *A, Array2Df *T)

    // need to scale the output matrix to get the right transform
    float factor = (1.0f / ((height - 1) * (width - 1)));
-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for (int y = 0 ; y < height ; y++ )
        for (int x = 0 ; x < width ; x++ ) {
@@ -1038,7 +956,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*
    // transforms F into eigenvector space: Ftr =
    //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
    Array2Df* F_tr = buf; //new Array2Df(width,height);
-    transform_normal2ev (F, F_tr);
+    transform_normal2ev (F, F_tr, multithread);
    // TODO: F no longer needed so could release memory, but as it is an
    // input parameter we won't do that
    // ph.setValue(50);
@@ -1057,7 +975,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*
    std::vector<double> l1 = get_lambda (height);
    std::vector<double> l2 = get_lambda (width);

-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for (int y = 0 ; y < height ; y++ ) {
        for (int x = 0 ; x < width ; x++ ) {
@@ -1069,7 +987,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*

    // transforms U_tr back to the normal space
    //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl;
-    transform_ev2normal (F_tr, U);
+    transform_ev2normal (F_tr, U, multithread);
 //  delete F_tr;    // no longer needed so release memory

    // the solution U as calculated will satisfy something like int U = 0
@@ -1079,13 +997,13 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*
    // (not really needed but good for numerics as we later take exp(U))
    //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
    float max = 0.f;
-    #pragma omp parallel for reduction(max:max)
+    #pragma omp parallel for reduction(max:max) if(multithread)

    for (int i = 0; i < width * height; i++) {
        max = std::max (max, (*U) (i));
    }

-    #pragma omp parallel for
+    #pragma omp parallel for if(multithread)

    for (int i = 0; i < width * height; i++) {
        (*U) (i) -= max;
@@ -1335,8 +1253,6 @@ void ImProcFunctions::ToneMapFattal02 (Imagefloat *rgb)
    rescale_nearest (Yr, L, multiThread);
    tmo_fattal02 (w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);

-//    tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread);
-
 #ifdef _OPENMP
    #pragma omp parallel for if(multiThread)
 #endif