From 7b0cda5c611de2d25f687a589af2f93b7b758e5e Mon Sep 17 00:00:00 2001 From: Ingo Date: Tue, 22 Jan 2013 19:00:36 +0100 Subject: [PATCH] Performance optimization for impulse_nr on multi-core-systems (Issue 1671) --- rtengine/impulse_denoise.h | 68 ++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/rtengine/impulse_denoise.h b/rtengine/impulse_denoise.h index 0ae7e5b58..4d911201c 100644 --- a/rtengine/impulse_denoise.h +++ b/rtengine/impulse_denoise.h @@ -5,7 +5,7 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * RawTherapee is distributed in the hope that it will be useful, * but widthITheightOUT ANY widthARRANTY; without even the implied warranty of * MERCheightANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -15,7 +15,7 @@ * along with RawTherapee. If not, see . * * 2010 Emil Martinec - * + * */ #include #include "rt_math.h" @@ -29,17 +29,17 @@ using namespace std; namespace rtengine { void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) { - - + + // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // impulse noise removal // local variables - + int width = lab->W; int height = lab->H; - + float hpfabs, hfnbrave; - + // buffer for the lowpass image float ** lpf = new float *[height]; // buffer for the highpass image @@ -51,45 +51,57 @@ void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) { //memset (impish[i], 0, width*sizeof(unsigned short)); } - + //The cleaning algorithm starts here - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // modified bilateral filter for lowpass image, omitting input pixel; or Gaussian blur - + static float eps = 1.0; float wtdsum[3], dirwt, norm; - int i1, j1; - + int i1, j1; + //rangeblur (lab->L, lpf, impish /*used as buffer here*/, width, height, thresh, false); - #ifdef _OPENMP - #pragma omp parallel - #endif +#ifdef _OPENMP +#pragma omp parallel +#endif { AlignedBufferMP buffer(max(width,height)); - + gaussHorizontal (lab->L, lpf, buffer, width, height, max(2.0,thresh-1.0)); gaussVertical (lpf, lpf, buffer, width, height, max(2.0,thresh-1.0)); } - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - + float impthr = max(1.0,5.5-thresh); - + float impthrDiv24 = impthr / 24.0f; //Issue 1671: moved the Division outside the loop, impthr can be optimized out too, but I let in the code at the moment + +#ifdef _OPENMP + #pragma omp parallel for private(hpfabs, hfnbrave,i1,j1) +#endif for (int i=0; i < height; i++) for (int j=0; j < width; j++) { - + hpfabs = fabs(lab->L[i][j]-lpf[i][j]); //block average of high pass data for (i1=max(0,i-2), hfnbrave=0; i1<=min(i+2,height-1); i1++ ) for (j1=max(0,j-2); j1<=min(j+2,width-1); j1++ ) { hfnbrave += fabs(lab->L[i1][j1]-lpf[i1][j1]); } - hfnbrave = (hfnbrave-hpfabs)/24; - hpfabs>(hfnbrave*impthr) ? impish[i][j]=1 : impish[i][j]=0; - + impish[i][j] = (hpfabs>((hfnbrave-hpfabs)*impthrDiv24)); + }//now impulsive values have been identified - + +// Issue 1671: +// often, noise isn't evenly distributed, e.g. only a few noisy pixels in the bright sky, but many in the dark foreground, +// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work +// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better +// choice for the chunk_size than 16 +// race conditions are avoided by the array impish +#ifdef _OPENMP + #pragma omp parallel for private(wtdsum,norm,dirwt,i1,j1) schedule(dynamic,16) +#endif for (int i=0; i < height; i++) for (int j=0; j < width; j++) { if (!impish[i][j]) continue; @@ -110,17 +122,17 @@ void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) { lab->L[i][j]=wtdsum[0]/norm;//low pass filter lab->a[i][j]=wtdsum[1]/norm;//low pass filter lab->b[i][j]=wtdsum[2]/norm;//low pass filter - } - + } + }//now impulsive values have been corrected - + for (int i=0; i