From 86df10be88d26550fa3b9e72d9b62e68bcfeffcc Mon Sep 17 00:00:00 2001 From: Ingo Date: Sun, 7 Jun 2015 12:18:38 +0200 Subject: [PATCH] Disable nested parallelism when compiled with clang, Issue 2731 --- rtengine/FTblockDN.cc | 78 +++++++++++++------------ rtengine/cplx_wavelet_level.h | 26 ++++----- rtengine/ipwavelet.cc | 106 +++++++++++++++++++--------------- rtengine/opthelper.h | 3 + 4 files changed, 117 insertions(+), 96 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index e8c94b10f..2343ff865 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -645,7 +645,8 @@ do { // Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles int numthreads = MIN(numtiles,omp_get_max_threads()); if(options.rgbDenoiseThreadLimit > 0) - numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit); + numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit); +#ifdef _RT_NESTED_OPENMP denoiseNestedLevels = omp_get_max_threads() / numthreads; bool oldNested = omp_get_nested(); if(denoiseNestedLevels < 2) @@ -654,7 +655,8 @@ do { omp_set_nested(true); if(options.rgbDenoiseThreadLimit > 0) while(denoiseNestedLevels*numthreads > options.rgbDenoiseThreadLimit) - denoiseNestedLevels--; + denoiseNestedLevels--; +#endif if(settings->verbose) printf("RGB_denoise uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,denoiseNestedLevels); #endif @@ -736,8 +738,8 @@ do { if(!denoiseMethodRgb){//lab mode //modification Jacques feb 2013 and july 2014 -#ifdef _OPENMP -#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif for (int i=tiletop; i1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif for (int i=tiletop; i1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif for (int i=tiletop; imaxlevel(); -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif for (int lvl=0; lvlL to Lin before it gets modified by reconstruction Lin = new array2D(width,height); -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif for(int i=0;i1) #endif { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP int subThread = masterThread * denoiseNestedLevels + omp_get_thread_num(); #else int subThread = 0; @@ -1151,7 +1153,7 @@ do { float *fLblox = fLbloxArray[subThread]; float pBuf[width + TS + 2*blkrad*offset] ALIGNED16; float nbrwt[TS*TS] ALIGNED64; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for #endif for (int vblk=0; vblk1) #endif for (int i=0; i1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for schedule(dynamic,16) num_threads(denoiseNestedLevels) #endif for (int i=tiletop; i1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for num_threads(denoiseNestedLevels) #endif for (int i=tiletop; i1) +#ifdef _RT_NESTED_OPENMP +#pragma omp parallel for num_threads(denoiseNestedLevels) #endif for (int i=tiletop; i1) #endif { @@ -1868,7 +1870,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposit if(!memoryAllocationFailed) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for schedule(dynamic) collapse(2) #endif for (int lvl=maxlvl-1; lvl>=0; lvl--) {//for levels less than max, use level diff to make edge mask @@ -1968,7 +1970,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi maxHL = WaveletCoeffs_L.level_H(lvl); } bool memoryAllocationFailed = false; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif { @@ -1983,7 +1985,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi if(!memoryAllocationFailed) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for schedule(dynamic) collapse(2) #endif for (int lvl=0; lvl=0; lvl--) {//for levels less than max, use level diff to make edge mask @@ -2086,7 +2088,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi maxHL = WaveletCoeffs_L.level_H(lvl); } bool memoryAllocationFailed = false; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif { @@ -2099,7 +2101,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi } if(!memoryAllocationFailed) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for schedule(dynamic) collapse(2) #endif for (int lvl=0; lvl1) #endif { @@ -2143,7 +2145,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi } if(!memoryAllocationFailed) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for schedule(dynamic) collapse(2) #endif for (int lvl=0; lvldata+datalen, labdn->W, labdn->H, levwav, 1 ); } -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp section #endif { diff --git a/rtengine/cplx_wavelet_level.h b/rtengine/cplx_wavelet_level.h index 3eef39397..1c1efde27 100644 --- a/rtengine/cplx_wavelet_level.h +++ b/rtengine/cplx_wavelet_level.h @@ -215,7 +215,7 @@ namespace rtengine { * Applies a Haar filter * */ -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for num_threads(numThreads) if(numThreads>1) #endif for (int k=0; k1) #endif { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for nowait #endif for(int i = 0; i < skip; i++) { for(int j=0;j1) #endif for (int k=0; k1) #endif for(int i = 0; i < dstheight; i++) { @@ -509,7 +509,7 @@ namespace rtengine { // calculate coefficients int shift=skip*(taps-offset-1);//align filter with data -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for num_threads(numThreads) if(numThreads>1) #endif for(int i = 0; i < dstheight; i++) { @@ -550,14 +550,14 @@ namespace rtengine { } } } -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel num_threads(numThreads) if(numThreads>1) #endif { T tmpLo[m_w] ALIGNED64; T tmpHi[m_w] ALIGNED64; if(subsamp_out) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for #endif for(int row=0;row template void wavelet_level::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel num_threads(numThreads) if(numThreads>1) #endif { @@ -588,7 +588,7 @@ namespace rtengine { T tmpHi[m_w] ALIGNED64; /* filter along rows and columns */ if(subsamp_out) { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for #endif for(int row=0;row 0) - numthreads = MIN(numthreads,maxnumberofthreadsforwavelet); + numthreads = MIN(numthreads,maxnumberofthreadsforwavelet); +#ifdef _RT_NESTED_OPENMP wavNestedLevels = omp_get_max_threads() / numthreads; bool oldNested = omp_get_nested(); if(wavNestedLevels < 2) @@ -447,7 +448,8 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int omp_set_nested(true); if(maxnumberofthreadsforwavelet > 0) while(wavNestedLevels*numthreads > maxnumberofthreadsforwavelet) - wavNestedLevels--; + wavNestedLevels--; +#endif if(settings->verbose) printf("Ip Wavelet uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,wavNestedLevels); @@ -493,16 +495,16 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int for (int i=0; iL; } - -#ifdef _OPENMP + +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for num_threads(wavNestedLevels) if(wavNestedLevels>1) #endif - + for (int i=tiletop; i1) #endif for (int i=1; i1) #endif for (int lvl=0; lvl<3; lvl++) { @@ -777,9 +779,9 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int } bool highlight = params->toneCurve.hrenabled; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel for schedule(dynamic,16) num_threads(wavNestedLevels) if(wavNestedLevels>1) -#endif +#endif for (int i=tiletop; i 1) { @@ -936,10 +938,10 @@ omp_set_nested(oldNested); delete dsttmp; } -// if (settings->verbose) { + if (settings->verbose) { t2e.set(); printf("Wavelet performed in %d usec:\n", t2e.etime(t1e)); -// } + } }//end o @@ -960,12 +962,12 @@ omp_set_nested(oldNested); float thres = 5.f;//different fom zero to take into account only data large enough max=0.f; min=0.f; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1) #endif { float lmax = 0.f, lmin = 0.f; -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for reduction(+:averaP,averaN,countP,countN) nowait #endif for(int i=0;i1) #endif for(int i=0;i1) #endif for (int i=0; i1) +#endif { float lminL = FLT_MAX; float lmaxL = 0.f; + +#ifdef _RT_NESTED_OPENMP #pragma omp for +#endif for(int i = 0; i < W_L*H_L; i++) { if(WavCoeffs_L0[i] < lminL) lminL = WavCoeffs_L0[i]; if(WavCoeffs_L0[i] > lmaxL) lmaxL = WavCoeffs_L0[i]; } +#ifdef _RT_NESTED_OPENMP #pragma omp critical +#endif { if(lminL < min0) min0 = lminL; if(lmaxL > max0) max0 = lmaxL; } @@ -1389,13 +1402,13 @@ if(cp.tonemap && cp.contmet==2) { for (int i=0; i1) #endif { if(contrast != 0.f) { // contrast = 0.f means that all will be multiplied by 1.f, so we can skip this step { -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for #endif for (int i=0; i1) #endif { @@ -1601,7 +1617,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory float huebuffer[W_L] ALIGNED64; float chrbuffer[W_L] ALIGNED64; #endif // __SSE2__ -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for schedule(dynamic,16) #endif for (int i=0; i1) #endif { if(cp.chrores != 0.f) { // cp.chrores == 0.f means all will be multiplied by 1.f, so we can skip the processing of residual -#ifdef _OPENMP +#ifdef _RT_NESTED_OPENMP #pragma omp for nowait #endif for (int i=0; i