From 0950963f8421a4858bb8d6b225ea52b95e919f61 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Sat, 25 Nov 2017 13:56:42 +0100 Subject: [PATCH] Moved findMinMaxPercentile() to rt_algo.*, use bool multiThread in fattal tonemapper, fixes #4195 --- rtengine/CMakeLists.txt | 1 + rtengine/rt_algo.cc | 163 +++++++++++++++++++++++++++++++++++++++ rtengine/rt_algo.h | 28 +++++++ rtengine/tmo_fattal02.cc | 160 +++++++++----------------------------- 4 files changed, 230 insertions(+), 122 deletions(-) create mode 100644 rtengine/rt_algo.cc create mode 100644 rtengine/rt_algo.h diff --git a/rtengine/CMakeLists.txt b/rtengine/CMakeLists.txt index 53b374388..70416af62 100644 --- a/rtengine/CMakeLists.txt +++ b/rtengine/CMakeLists.txt @@ -106,6 +106,7 @@ set(RTENGINESOURCEFILES rawimage.cc rawimagesource.cc refreshmap.cc + rt_algo.cc rtthumbnail.cc shmap.cc simpleprocess.cc diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc new file mode 100644 index 000000000..92a3a59b6 --- /dev/null +++ b/rtengine/rt_algo.cc @@ -0,0 +1,163 @@ +/* + * This file is part of RawTherapee. + * + * Copyright (c) 2017 Ingo Weyrich + * + * RawTherapee is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RawTherapee is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RawTherapee. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#ifdef _OPENMP +#include +#endif + +namespace rtengine +{ +void findMinMaxPercentile (const float *data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multithread) +{ + // we need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in data + // We use a histogram based search for speed and to reduce memory usage + // memory usage of this method is histoSize * sizeof(uint32_t) * (t + 1) byte, + // where t is the number of threads and histoSize is in [1;65536] + // The current implementation is not guaranteed to work correctly if size > 2^32 (4294967296) + assert (minPrct <= maxPrct); + + if(size == 0) { + return; + } + + size_t numThreads = 1; +#ifdef _OPENMP + // Because we have an overhead in the critical region of the main loop for each thread + // we make a rough calculation to reduce the number of threads for small data size + // This also works fine for the minmax loop + if(multithread) { + size_t maxThreads = omp_get_max_threads(); + while (size > numThreads * numThreads * 16384 && numThreads < maxThreads) { + ++numThreads; + } + } +#endif + + // We need min and max value of data to calculate the scale factor for the histogram + float minVal = data[0]; + float maxVal = data[0]; +#ifdef _OPENMP + #pragma omp parallel for reduction(min:minVal) reduction(max:maxVal) num_threads(numThreads) +#endif + for (size_t i = 1; i < size; ++i) { + minVal = std::min(minVal, data[i]); + maxVal = std::max(maxVal, data[i]); + } + + if(std::fabs(maxVal - minVal) == 0.f) { // fast exit, also avoids division by zero in calculation of scale factor + minOut = maxOut = minVal; + return; + } + + // caution: currently this works correctly only for histoSize in range[1;65536] + // for small data size (i.e. thumbnails) we reduce the size of the histogram to the size of data + const unsigned int histoSize = std::min(static_cast(65536), size); + + // calculate scale factor to use full range of histogram + const float scale = (histoSize - 1) / (maxVal - minVal); + + // We need one main histogram + std::vector histo(histoSize, 0); + + if(numThreads == 1) { + // just one thread => use main histogram + for (size_t i = 0; i < size; ++i) { + // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range + histo[ (uint16_t) (scale * (data[i] - minVal))]++; + } + } else { +#ifdef _OPENMP + #pragma omp parallel num_threads(numThreads) +#endif + { + // We need one histogram per thread + std::vector histothr(histoSize, 0); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (size_t i = 0; i < size; ++i) { + // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range + histothr[ (uint16_t) (scale * (data[i] - minVal))]++; + } + +#ifdef _OPENMP + #pragma omp critical +#endif + { + // add per thread histogram to main histogram +#ifdef _OPENMP + #pragma omp simd +#endif + + for(size_t i = 0; i < histoSize; ++i) { + histo[i] += histothr[i]; + } + } + } + } + + size_t k = 0; + size_t count = 0; + + // find (minPrct*size) smallest value + const float threshmin = minPrct * size; + while (count < threshmin) { + count += histo[k++]; + } + + if (k > 0) { // interpolate + size_t count_ = count - histo[k - 1]; + float c0 = count - threshmin; + float c1 = threshmin - count_; + minOut = (c1 * k + c0 * (k - 1)) / (c0 + c1); + } else { + minOut = k; + } + // go back to original range + minOut /= scale; + minOut += minVal; + + // find (maxPrct*size) smallest value + const float threshmax = maxPrct * size; + while (count < threshmax) { + count += histo[k++]; + } + + if (k > 0) { // interpolate + size_t count_ = count - histo[k - 1]; + float c0 = count - threshmax; + float c1 = threshmax - count_; + maxOut = (c1 * k + c0 * (k - 1)) / (c0 + c1); + } else { + maxOut = k; + } + // go back to original range + maxOut /= scale; + maxOut += minVal; + +} +} \ No newline at end of file diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h new file mode 100644 index 000000000..1e2d11bbb --- /dev/null +++ b/rtengine/rt_algo.h @@ -0,0 +1,28 @@ +/* + * This file is part of RawTherapee. + * + * Copyright (c) 2017 Ingo Weyrich + * + * RawTherapee is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RawTherapee is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RawTherapee. If not, see . + */ + +#pragma once + +#include + +namespace rtengine +{ +void findMinMaxPercentile (const float *data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true); + +} \ No newline at end of file diff --git a/rtengine/tmo_fattal02.cc b/rtengine/tmo_fattal02.cc index 0dff22a54..1b301123a 100644 --- a/rtengine/tmo_fattal02.cc +++ b/rtengine/tmo_fattal02.cc @@ -73,6 +73,8 @@ #include "StopWatch.h" #include "sleef.c" #include "opthelper.h" +#include "rt_algo.h" + namespace rtengine { @@ -167,7 +169,7 @@ void downSample (const Array2Df& A, Array2Df& B) } } -void gaussianBlur (const Array2Df& I, Array2Df& L) +void gaussianBlur (const Array2Df& I, Array2Df& L, bool multithread) { const int width = I.getCols(); const int height = I.getRows(); @@ -185,7 +187,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L) Array2Df T (width, height); //--- X blur - #pragma omp parallel for shared(I, T) + #pragma omp parallel for shared(I, T) if(multithread) for ( int y = 0 ; y < height ; y++ ) { for ( int x = 1 ; x < width - 1 ; x++ ) { @@ -200,7 +202,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L) } //--- Y blur - #pragma omp parallel for + #pragma omp parallel for if(multithread) for ( int x = 0 ; x < width - 7 ; x += 8 ) { for ( int y = 1 ; y < height - 1 ; y++ ) { @@ -231,7 +233,7 @@ void gaussianBlur (const Array2Df& I, Array2Df& L) } } -void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels) +void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels, bool multithread) { int width = H->getCols(); int height = H->getRows(); @@ -245,7 +247,7 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels) } Array2Df* L = new Array2Df (width, height); - gaussianBlur ( *pyramids[0], *L ); + gaussianBlur ( *pyramids[0], *L, multithread ); for ( int k = 1 ; k < nlevels ; k++ ) { if (width > 2 && height > 2) { @@ -267,7 +269,7 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels) if (k < nlevels - 1) { delete L; L = new Array2Df (width, height); - gaussianBlur ( *pyramids[k], *L ); + gaussianBlur ( *pyramids[k], *L, multithread ); } } @@ -276,14 +278,14 @@ void createGaussianPyramids ( Array2Df* H, Array2Df** pyramids, int nlevels) //-------------------------------------------------------------------- -float calculateGradients (Array2Df* H, Array2Df* G, int k) +float calculateGradients (Array2Df* H, Array2Df* G, int k, bool multithread) { const int width = H->getCols(); const int height = H->getRows(); const float divider = pow ( 2.0f, k + 1 ); float avgGrad = 0.0f; - #pragma omp parallel for reduction(+:avgGrad) + #pragma omp parallel for reduction(+:avgGrad) if(multithread) for ( int y = 0 ; y < height ; y++ ) { int n = (y == 0 ? 0 : y - 1); @@ -350,20 +352,17 @@ void upSample (const Array2Df& A, Array2Df& B) void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[], float avgGrad[], int nlevels, int detail_level, - float alfa, float beta, float noise) + float alfa, float beta, float noise, bool multithread) { - const bool newfattal = true; int width = gradients[nlevels - 1]->getCols(); int height = gradients[nlevels - 1]->getRows(); Array2Df** fi = new Array2Df*[nlevels]; fi[nlevels - 1] = new Array2Df (width, height); - if (newfattal) { - //#pragma omp parallel for shared(fi) - for ( int k = 0 ; k < width * height ; k++ ) { - (*fi[nlevels - 1]) (k) = 1.0f; - } + #pragma omp parallel for shared(fi) if(multithread) + for ( int k = 0 ; k < width * height ; k++ ) { + (*fi[nlevels - 1]) (k) = 1.0f; } for ( int k = nlevels - 1; k >= 0 ; k-- ) { @@ -371,23 +370,16 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[], height = gradients[k]->getRows(); // only apply gradients to levels>=detail_level but at least to the coarsest - if ( k >= detail_level - || k == nlevels - 1 - || newfattal == false) { + if ((k >= detail_level || k == nlevels - 1) && beta != 1.f) { //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl; - #pragma omp parallel for shared(fi,avgGrad) + #pragma omp parallel for shared(fi,avgGrad) if(multithread) for ( int y = 0; y < height; y++ ) { for ( int x = 0; x < width; x++ ) { float grad = ((*gradients[k]) (x, y) < 1e-4f) ? 1e-4 : (*gradients[k]) (x, y); float a = alfa * avgGrad[k]; - float value = pow ((grad + noise) / a, beta - 1.0f); - if (newfattal) { - (*fi[k]) (x, y) *= value; - } else { - (*fi[k]) (x, y) = value; - } + (*fi[k]) (x, y) *= value; } } } @@ -401,9 +393,9 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[], fi[0] = FI; // highest level -> result } - if ( k > 0 && newfattal ) { + if (k > 0) { upSample (*fi[k], *fi[k - 1]); // upsample to next level - gaussianBlur (*fi[k - 1], *fi[k - 1]); + gaussianBlur (*fi[k - 1], *fi[k - 1], multithread); } } @@ -414,80 +406,6 @@ void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[], delete[] fi; } -inline -void findMaxMinPercentile (const Array2Df& I, - float minPrct, float& minLum, - float maxPrct, float& maxLum) -{ - assert (minPrct <= maxPrct); - - const int size = I.getRows() * I.getCols(); - const float* data = I.data(); - - // we need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in I - // We use a histogram based search for speed and to reduce memory usage - // memory usage of this method is 65536 * sizeof(float) * (t + 1) byte, where t is the number of threads - - // We need one global histogram - LUTu histo (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE); - histo.clear(); -#ifdef _OPENMP - #pragma omp parallel -#endif - { - // We need one histogram per thread - LUTu histothr (65536, LUT_CLIP_BELOW | LUT_CLIP_ABOVE); - histothr.clear(); - -#ifdef _OPENMP - #pragma omp for nowait -#endif - - for (int i = 0; i < size; ++i) { - // values are in [0;1] range, so we have to multiply with 65535 to get the histogram index - histothr[ (unsigned int) (65535.f * data[i])]++; - } - -#ifdef _OPENMP - #pragma omp critical -#endif - // add per thread histogram to global histogram - histo += histothr; - } - - int k = 0; - int count = 0; - - // find (minPrct*size) smallest value - while (count < minPrct * size) { - count += histo[k++]; - } - - if (k > 0) { // interpolate - int count_ = count - histo[k - 1]; - float c0 = count - minPrct * size; - float c1 = minPrct * size - count_; - minLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f); - } else { - minLum = k / 65535.f; - } - - // find (maxPrct*size) smallest value - while (count < maxPrct * size) { - count += histo[k++]; - } - - if (k > 0) { // interpolate - int count_ = count - histo[k - 1]; - float c0 = count - maxPrct * size; - float c1 = maxPrct * size - count_; - maxLum = (c1 * k + c0 * (k - 1)) / ((c0 + c1) * 65535.f); - } else { - maxLum = k / 65535.f; - } - -} - void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread); void tmo_fattal02 (size_t width, @@ -543,7 +461,7 @@ void tmo_fattal02 (size_t width, float minLum = Y (0, 0); float maxLum = Y (0, 0); - #pragma omp parallel for reduction(max:maxLum) + #pragma omp parallel for reduction(max:maxLum) if(multithread) for ( int i = 0 ; i < size ; i++ ) { maxLum = std::max (maxLum, Y (i)); @@ -552,7 +470,7 @@ void tmo_fattal02 (size_t width, Array2Df* H = new Array2Df (width, height); float temp = 100.f / maxLum; float eps = 1e-4f; - #pragma omp parallel + #pragma omp parallel if(multithread) { #ifdef __SSE2__ vfloat epsv = F2V (eps); @@ -627,7 +545,7 @@ void tmo_fattal02 (size_t width, const int nlevels = 7; // RT -- see above Array2Df** pyramids = new Array2Df*[nlevels]; - createGaussianPyramids (H, pyramids, nlevels); + createGaussianPyramids (H, pyramids, nlevels, multithread); // ph.setValue(8); // calculate gradients and its average values on pyramid levels @@ -636,7 +554,7 @@ void tmo_fattal02 (size_t width, for ( int k = 0 ; k < nlevels ; k++ ) { gradients[k] = new Array2Df (pyramids[k]->getCols(), pyramids[k]->getRows()); - avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k); + avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k, multithread); delete pyramids[k]; } @@ -645,7 +563,7 @@ void tmo_fattal02 (size_t width, // calculate fi matrix Array2Df* FI = new Array2Df (width, height); - calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise); + calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise, multithread); // dumpPFS( "FI.pfs", FI, "Y" ); for ( int i = 0 ; i < nlevels ; i++ ) { @@ -684,7 +602,7 @@ void tmo_fattal02 (size_t width, // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero // Neumann conditions assume U(-1)=U(0)), see also divergence calculation // if (fftsolver) - #pragma omp parallel for + #pragma omp parallel for if(multithread) for ( size_t y = 0 ; y < height ; y++ ) { // sets index+1 based on the boundary assumption H(N+1)=H(N-1) @@ -702,7 +620,7 @@ void tmo_fattal02 (size_t width, delete H; // calculate divergence - #pragma omp parallel for + #pragma omp parallel for if(multithread) for ( size_t y = 0; y < height; ++y ) { for ( size_t x = 0; x < width; ++x ) { @@ -754,7 +672,7 @@ void tmo_fattal02 (size_t width, // { // return; // } - #pragma omp parallel + #pragma omp parallel if(multithread) { #ifdef __SSE2__ vfloat gammav = F2V (gamma); @@ -783,10 +701,10 @@ void tmo_fattal02 (size_t width, float cut_min = 0.01f * black_point; float cut_max = 1.0f - 0.01f * white_point; assert (cut_min >= 0.0f && (cut_max <= 1.0f) && (cut_min < cut_max)); - findMaxMinPercentile (L, cut_min, minLum, cut_max, maxLum); + findMinMaxPercentile (L.data(), L.getRows() * L.getCols(), cut_min, minLum, cut_max, maxLum, multithread); float dividor = (maxLum - minLum); - #pragma omp parallel for + #pragma omp parallel for if(multithread) for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { @@ -869,7 +787,7 @@ void tmo_fattal02 (size_t width, // returns T = EVy A EVx^tr // note, modifies input data -void transform_ev2normal (Array2Df *A, Array2Df *T) +void transform_ev2normal (Array2Df *A, Array2Df *T, bool multithread) { int width = A->getCols(); int height = A->getRows(); @@ -877,7 +795,7 @@ void transform_ev2normal (Array2Df *A, Array2Df *T) // the discrete cosine transform is not exactly the transform needed // need to scale input values to get the right transformation - #pragma omp parallel for + #pragma omp parallel for if(multithread) for (int y = 1 ; y < height - 1 ; y++ ) for (int x = 1 ; x < width - 1 ; x++ ) { @@ -913,7 +831,7 @@ void transform_ev2normal (Array2Df *A, Array2Df *T) // returns T = EVy^-1 * A * (EVx^-1)^tr -void transform_normal2ev (Array2Df *A, Array2Df *T) +void transform_normal2ev (Array2Df *A, Array2Df *T, bool multithread) { int width = A->getCols(); int height = A->getRows(); @@ -928,7 +846,7 @@ void transform_normal2ev (Array2Df *A, Array2Df *T) // need to scale the output matrix to get the right transform float factor = (1.0f / ((height - 1) * (width - 1))); - #pragma omp parallel for + #pragma omp parallel for if(multithread) for (int y = 0 ; y < height ; y++ ) for (int x = 0 ; x < width ; x++ ) { @@ -1038,7 +956,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/* // transforms F into eigenvector space: Ftr = //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl; Array2Df* F_tr = buf; //new Array2Df(width,height); - transform_normal2ev (F, F_tr); + transform_normal2ev (F, F_tr, multithread); // TODO: F no longer needed so could release memory, but as it is an // input parameter we won't do that // ph.setValue(50); @@ -1057,7 +975,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/* std::vector l1 = get_lambda (height); std::vector l2 = get_lambda (width); - #pragma omp parallel for + #pragma omp parallel for if(multithread) for (int y = 0 ; y < height ; y++ ) { for (int x = 0 ; x < width ; x++ ) { @@ -1069,7 +987,7 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/* // transforms U_tr back to the normal space //DEBUG_STR << "solve_pde_fft: transform U_tr to normal space (fft)" << std::endl; - transform_ev2normal (F_tr, U); + transform_ev2normal (F_tr, U, multithread); // delete F_tr; // no longer needed so release memory // the solution U as calculated will satisfy something like int U = 0 @@ -1079,13 +997,13 @@ void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/* // (not really needed but good for numerics as we later take exp(U)) //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl; float max = 0.f; - #pragma omp parallel for reduction(max:max) + #pragma omp parallel for reduction(max:max) if(multithread) for (int i = 0; i < width * height; i++) { max = std::max (max, (*U) (i)); } - #pragma omp parallel for + #pragma omp parallel for if(multithread) for (int i = 0; i < width * height; i++) { (*U) (i) -= max; @@ -1335,8 +1253,6 @@ void ImProcFunctions::ToneMapFattal02 (Imagefloat *rgb) rescale_nearest (Yr, L, multiThread); tmo_fattal02 (w2, h2, L, L, alpha, beta, noise, detail_level, multiThread); -// tmo_fattal02(w, h, Yr, L, alpha, beta, noise, detail_level, multiThread); - #ifdef _OPENMP #pragma omp parallel for if(multiThread) #endif