Disable nested parallelism when compiled with clang, Issue 2731

This commit is contained in:
Ingo
2015-06-07 12:18:38 +02:00
parent 1ff51dde15
commit 86df10be88
4 changed files with 117 additions and 96 deletions

View File

@@ -645,7 +645,8 @@ do {
// Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
int numthreads = MIN(numtiles,omp_get_max_threads());
if(options.rgbDenoiseThreadLimit > 0)
numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit);
numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit);
#ifdef _RT_NESTED_OPENMP
denoiseNestedLevels = omp_get_max_threads() / numthreads;
bool oldNested = omp_get_nested();
if(denoiseNestedLevels < 2)
@@ -654,7 +655,8 @@ do {
omp_set_nested(true);
if(options.rgbDenoiseThreadLimit > 0)
while(denoiseNestedLevels*numthreads > options.rgbDenoiseThreadLimit)
denoiseNestedLevels--;
denoiseNestedLevels--;
#endif
if(settings->verbose)
printf("RGB_denoise uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,denoiseNestedLevels);
#endif
@@ -736,8 +738,8 @@ do {
if(!denoiseMethodRgb){//lab mode
//modification Jacques feb 2013 and july 2014
#ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for (int i=tiletop; i<tilebottom; i++) {
int i1 = i - tiletop;
@@ -781,8 +783,8 @@ do {
}
}
} else {//RGB mode
#ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for (int i=tiletop; i<tilebottom; i++) {
int i1 = i - tiletop;
@@ -814,8 +816,8 @@ do {
}
}
} else {//image is not raw; use Lab parametrization
#ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for (int i=tiletop; i<tilebottom; i++) {
int i1 = i - tiletop;
@@ -928,7 +930,7 @@ do {
if(!memoryAllocationFailed) {
// precalculate madL, because it's used in adecomp and bdecomp
int maxlvl = Ldecomp->maxlevel();
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for schedule(dynamic) collapse(2) num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for (int lvl=0; lvl<maxlvl; lvl++) {
@@ -1023,7 +1025,7 @@ do {
if(!memoryAllocationFailed) {
// copy labdn->L to Lin before it gets modified by reconstruction
Lin = new array2D<float>(width,height);
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for(int i=0;i<height;i++)
@@ -1134,14 +1136,14 @@ do {
fLbloxArray[i] = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof(float));
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
int masterThread = omp_get_thread_num();
#endif
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
{
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
int subThread = masterThread * denoiseNestedLevels + omp_get_thread_num();
#else
int subThread = 0;
@@ -1151,7 +1153,7 @@ do {
float *fLblox = fLbloxArray[subThread];
float pBuf[width + TS + 2*blkrad*offset] ALIGNED16;
float nbrwt[TS*TS] ALIGNED64;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for (int vblk=0; vblk<numblox_H; vblk++) {
@@ -1239,7 +1241,7 @@ do {
}
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
for (int i=0; i<height; i++) {
@@ -1283,8 +1285,8 @@ do {
realred /= 100.f;
realblue /= 100.f;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16) num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for schedule(dynamic,16) num_threads(denoiseNestedLevels)
#endif
for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop;
@@ -1329,8 +1331,8 @@ do {
}
}
} else {//RGB mode
#ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels)
#endif
for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop;
@@ -1365,8 +1367,8 @@ do {
}
} else {
#ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels)
#endif
for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop;
@@ -1427,7 +1429,7 @@ do {
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
omp_set_nested(oldNested);
#endif
//copy denoised image to output
@@ -1854,7 +1856,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposit
maxHL = WaveletCoeffs_L.level_H(lvl);
}
bool memoryAllocationFailed = false;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
{
@@ -1868,7 +1870,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposit
if(!memoryAllocationFailed) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=maxlvl-1; lvl>=0; lvl--) {//for levels less than max, use level diff to make edge mask
@@ -1968,7 +1970,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
maxHL = WaveletCoeffs_L.level_H(lvl);
}
bool memoryAllocationFailed = false;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
{
@@ -1983,7 +1985,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
if(!memoryAllocationFailed) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=0; lvl<maxlvl; lvl++) {
@@ -2001,7 +2003,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=maxlvl-1; lvl>=0; lvl--) {//for levels less than max, use level diff to make edge mask
@@ -2086,7 +2088,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
maxHL = WaveletCoeffs_L.level_H(lvl);
}
bool memoryAllocationFailed = false;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
{
@@ -2099,7 +2101,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
}
if(!memoryAllocationFailed) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=0; lvl<maxlvl; lvl++) {
@@ -2130,7 +2132,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
maxHL = WaveletCoeffs_L.level_H(lvl);
}
bool memoryAllocationFailed = false;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
#endif
{
@@ -2143,7 +2145,7 @@ SSEFUNCTION bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposi
}
if(!memoryAllocationFailed) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=0; lvl<maxlvl; lvl++) {
@@ -2589,7 +2591,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
for (int i=0; i<hei; i++)
bcalc[i] = new float[wid];
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for if(multiThread)
#endif
for(int ii=0;ii<hei;ii++){
@@ -2707,7 +2709,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
//fill tile from image; convert RGB to "luma/chroma"
if (isRAW) {//image is raw; use channel differences for chroma channels
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for if(multiThread)
#endif
for (int i=tiletop; i<tilebottom; i+=2) {
@@ -2747,7 +2749,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
}
#endif
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for if(multiThread)
#endif
for (int i=tiletop; i<tilebottom; i+=2) {
@@ -2762,7 +2764,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
}
if (!denoiseMethodRgb){//lab mode, modification Jacques feb 2013 and july 2014
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for if(multiThread)
#endif
for (int i=tiletop; i<tilebottom; i++) {
@@ -2883,17 +2885,17 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
schoice=2;
const int levwav=5;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel sections if(multiThread)
#endif
{
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp section
#endif
{
adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, levwav, 1 );
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp section
#endif
{

View File

@@ -215,7 +215,7 @@ namespace rtengine {
* Applies a Haar filter
*
*/
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(numThreads) if(numThreads>1)
#endif
for (int k=0; k<height; k++) {
@@ -234,18 +234,18 @@ namespace rtengine {
* Applies a Haar filter
*
*/
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(numThreads) if(numThreads>1)
#endif
{
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for nowait
#endif
for(int i = 0; i < skip; i++) {
for(int j=0;j<width;j++)
dst[width*i+j] = (srcLo[i*width+j] + srcHi[i*width+j]);
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int i = skip; i < height; i++) {
@@ -394,7 +394,7 @@ namespace rtengine {
// calculate coefficients
int shift = skip*(taps-offset-1);//align filter with data
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(numThreads) if(numThreads>1)
#endif
for (int k=0; k<height; k++) {
@@ -450,7 +450,7 @@ namespace rtengine {
__m128 fourv = _mm_set1_ps(4.f);
__m128 srcFactorv = _mm_set1_ps(srcFactor);
__m128 dstFactorv = _mm_set1_ps(blend);
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(numThreads) if(numThreads>1)
#endif
for(int i = 0; i < dstheight; i++) {
@@ -509,7 +509,7 @@ namespace rtengine {
// calculate coefficients
int shift=skip*(taps-offset-1);//align filter with data
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(numThreads) if(numThreads>1)
#endif
for(int i = 0; i < dstheight; i++) {
@@ -550,14 +550,14 @@ namespace rtengine {
}
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(numThreads) if(numThreads>1)
#endif
{
T tmpLo[m_w] ALIGNED64;
T tmpHi[m_w] ALIGNED64;
if(subsamp_out) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int row=0;row<m_h;row+=2) {
@@ -566,7 +566,7 @@ namespace rtengine {
AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
}
} else {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int row=0;row<m_h;row++) {
@@ -580,7 +580,7 @@ namespace rtengine {
#else
template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(numThreads) if(numThreads>1)
#endif
{
@@ -588,7 +588,7 @@ namespace rtengine {
T tmpHi[m_w] ALIGNED64;
/* filter along rows and columns */
if(subsamp_out) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int row=0;row<m_h;row+=2) {
@@ -597,7 +597,7 @@ namespace rtengine {
AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
}
} else {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int row=0;row<m_h;row++) {

View File

@@ -438,7 +438,8 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
numthreads = MIN(numtiles,omp_get_max_threads());
if(maxnumberofthreadsforwavelet > 0)
numthreads = MIN(numthreads,maxnumberofthreadsforwavelet);
numthreads = MIN(numthreads,maxnumberofthreadsforwavelet);
#ifdef _RT_NESTED_OPENMP
wavNestedLevels = omp_get_max_threads() / numthreads;
bool oldNested = omp_get_nested();
if(wavNestedLevels < 2)
@@ -447,7 +448,8 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
omp_set_nested(true);
if(maxnumberofthreadsforwavelet > 0)
while(wavNestedLevels*numthreads > maxnumberofthreadsforwavelet)
wavNestedLevels--;
wavNestedLevels--;
#endif
if(settings->verbose)
printf("Ip Wavelet uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,wavNestedLevels);
@@ -493,16 +495,16 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
for (int i=0; i<tileheight; i++)
Lold[i] = LoldBuffer + i*tilewidth;
}
} else {
labco = new LabImage(width,height);
Lold = lab->L;
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
for (int i=tiletop; i<tilebottom; i++) {
int i1 = i - tiletop;
int j;
@@ -554,7 +556,7 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
for (int i=1; i<hei-1; i++) {
@@ -618,7 +620,7 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
float madL[8][3];
bool memoryAllocationFailed = false;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for schedule(dynamic) collapse(2) num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
for (int lvl=0; lvl<3; lvl++) {
@@ -777,9 +779,9 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
}
bool highlight = params->toneCurve.hrenabled;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for schedule(dynamic,16) num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
#endif
for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop;
float L,a,b;
@@ -928,7 +930,7 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
delete [] sigmaN;
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
omp_set_nested(oldNested);
#endif
if(numtiles > 1) {
@@ -936,10 +938,10 @@ omp_set_nested(oldNested);
delete dsttmp;
}
// if (settings->verbose) {
if (settings->verbose) {
t2e.set();
printf("Wavelet performed in %d usec:\n", t2e.etime(t1e));
// }
}
}//end o
@@ -960,12 +962,12 @@ omp_set_nested(oldNested);
float thres = 5.f;//different fom zero to take into account only data large enough
max=0.f;
min=0.f;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
{
float lmax = 0.f, lmin = 0.f;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for reduction(+:averaP,averaN,countP,countN) nowait
#endif
for(int i=0;i<datalen;i++) {
@@ -982,7 +984,7 @@ omp_set_nested(oldNested);
countN++;
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp critical
#endif
{
@@ -1002,7 +1004,7 @@ omp_set_nested(oldNested);
float variP = 0.f, variN = 0.f;
float thres = 5.f;//different fom zero to take into account only data large enough
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for reduction(+:variP,variN,countP,countN) num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
for(int i=0;i<datalen;i++) {
@@ -1087,7 +1089,7 @@ float *ImProcFunctions::ContrastDR(float *Source, int skip, struct cont_params c
int n=W_L*H_L;
if(Contrast == NULL) Contrast = new float[n];
memcpy(Contrast, Source, n*sizeof(float));
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<W_L*H_L; i++) {//contrast
@@ -1122,12 +1124,12 @@ SSEFUNCTION float *ImProcFunctions::CompressDR(float *Source, int skip, struct c
int n=W_L*H_L;
#ifdef __SSE2__
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel
#endif
{
__m128 epsv = _mm_set1_ps( eps );
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int ii = 0; ii < n-3; ii+=4)
@@ -1137,7 +1139,7 @@ SSEFUNCTION float *ImProcFunctions::CompressDR(float *Source, int skip, struct c
Source[ii] = xlogf(Source[ii] + eps);
#else
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for
#endif
for(int ii = 0; ii < n; ii++)
@@ -1168,7 +1170,7 @@ SSEFUNCTION float *ImProcFunctions::CompressDR(float *Source, int skip, struct c
#ifdef __SSE2__
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel
#endif
{
@@ -1176,7 +1178,7 @@ SSEFUNCTION float *ImProcFunctions::CompressDR(float *Source, int skip, struct c
__m128 epsv = _mm_set1_ps( eps );
__m128 DetailBoostv = _mm_set1_ps( DetailBoost );
__m128 tempv = _mm_set1_ps( temp );
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int i = 0; i < n-3; i+=4){
@@ -1195,7 +1197,7 @@ SSEFUNCTION float *ImProcFunctions::CompressDR(float *Source, int skip, struct c
}
#else
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < n; i++){
@@ -1218,7 +1220,9 @@ void ImProcFunctions::ContrastResid(float * WavCoeffs_L0, unsigned int Iterates
cp.TMmeth=2;//default after testing
if(cp.TMmeth ==1) {min0 = 0.0f;max0=32768.f;}
else if (cp.TMmeth ==2) {min0 = 0.0f;}
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < W_L*H_L; i++)
{ WavCoeffs_L0[i]= (WavCoeffs_L0[i] - min0)/max0;
WavCoeffs_L0[i]*=gamm;
@@ -1232,9 +1236,9 @@ void ImProcFunctions::ContrastResid(float * WavCoeffs_L0, unsigned int Iterates
CompressDR(WavCoeffs_L0, skip, cp, W_L, H_L, Compression,DetailBoost,max0, min0, ave, ah, bh, al, bl, factorx, WavCoeffs_L0);
#ifdef _OPENMP
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < W_L*H_L; ii++)
WavCoeffs_L0[ii] = WavCoeffs_L0[ii]*max0*(1.f/gamm) + min0;
}
@@ -1255,7 +1259,9 @@ void ImProcFunctions::ContrastResid(float * WavCoeffs_L0, unsigned int Iterates
if(cp.TMmeth ==1) {min0 = 0.0f;max0=32768.f;}
else if (cp.TMmeth ==2) {min0 = 0.0f;}
// max0=32768.f;
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < W_L*H_L; i++)
{ WavCoeffs_L0[i]= (WavCoeffs_L0[i] - min0)/max0;
WavCoeffs_L0[i]*=gamm;
@@ -1272,9 +1278,9 @@ void ImProcFunctions::ContrastResid(float * WavCoeffs_L0, unsigned int Iterates
epd.CompressDynamicRange(WavCoeffs_L0, sca/float(skip), edgest, Compression, DetailBoost, Iterates, rew, WavCoeffs_L0);
//Restore past range, also desaturate a bit per Mantiuk's Color correction for tone mapping.
#ifdef _OPENMP
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for // removed schedule(dynamic,10)
#endif
for(int ii = 0; ii < W_L*H_L; ii++)
WavCoeffs_L0[ii] = WavCoeffs_L0[ii]*max0*(1.f/gamm) + min0;
}
@@ -1288,7 +1294,7 @@ void ImProcFunctions::WaveletcontAllLfinal(LabImage * labco, float ** varhue, fl
float * WavCoeffs_L0 = WaveletCoeffs_L.coeff0;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int dir=1; dir<4; dir++) {
@@ -1326,24 +1332,31 @@ void ImProcFunctions::WaveletcontAllLfinal(LabImage * labco, float ** varhue, fl
float min0 = FLT_MAX;
if(contrast != 0.f || cp.tonemap) { // contrast = 0.f means that all will be multiplied by 1.f, so we can skip this step
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel for reduction(+:avedbl) num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
for (int i=0; i<W_L*H_L; i++) {
avedbl += WavCoeffs_L0[i];
}
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
{
float lminL = FLT_MAX;
float lmaxL = 0.f;
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for(int i = 0; i < W_L*H_L; i++) {
if(WavCoeffs_L0[i] < lminL) lminL = WavCoeffs_L0[i];
if(WavCoeffs_L0[i] > lmaxL) lmaxL = WavCoeffs_L0[i];
}
#ifdef _RT_NESTED_OPENMP
#pragma omp critical
#endif
{ if(lminL < min0) min0 = lminL;
if(lmaxL > max0) max0 = lmaxL;
}
@@ -1389,13 +1402,13 @@ if(cp.tonemap && cp.contmet==2) {
for (int i=0; i<W_L*H_L; i++)
koeLi[j][i]=0.f;
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
{
if(contrast != 0.f) { // contrast = 0.f means that all will be multiplied by 1.f, so we can skip this step
{
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for
#endif
for (int i=0; i<W_L*H_L; i++) {//contrast
@@ -1421,14 +1434,17 @@ if(cp.tonemap && cp.contmet==2) {
if(cp.tonemap && cp.contmet==1) {
float maxp=max0*256.f;
float minp=min0*256.f;
#pragma omp single
#ifdef _RT_NESTED_OPENMP
#pragma omp single
#endif
ContrastResid(WavCoeffs_L0, 5, skip, cp, W_L, H_L, maxp, minp, ave, ah, bh, al, bl, factorx );
}
#ifdef _RT_NESTED_OPENMP
#pragma omp barrier
#endif
if(cp.conres != 0.f || cp.conresH != 0.f) { // cp.conres = 0.f and cp.comresH = 0.f means that all will be multiplied by 1.f, so we can skip this step
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for nowait
#endif
for (int i=0; i<W_L*H_L; i++) {
@@ -1481,7 +1497,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
for (int i=0; i<H_L; i++){
tmC[i] = &tmCBuffer[i*W_L];
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int lvl=0; lvl<3; lvl++) {
@@ -1499,7 +1515,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
float aamp=1.f+cp.eddetthrHi/100.f;
for (int lvl=0; lvl<3; lvl++) {
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic,16)
#endif
for (int i=1; i<H_L-1; i++) {
@@ -1561,7 +1577,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
// end
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int dir=1; dir<4; dir++) {
@@ -1593,7 +1609,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
float * WavCoeffs_a0 = WaveletCoeffs_a.coeff0;
float * WavCoeffs_b0 = WaveletCoeffs_b.coeff0;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
{
@@ -1601,7 +1617,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
float huebuffer[W_L] ALIGNED64;
float chrbuffer[W_L] ALIGNED64;
#endif // __SSE2__
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic,16)
#endif
for (int i=0; i<H_L; i++) {
@@ -1655,13 +1671,13 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
float * WavCoeffs_ab0 = WaveletCoeffs_ab.coeff0;
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp parallel num_threads(wavNestedLevels) if(wavNestedLevels>1)
#endif
{
if(cp.chrores != 0.f) { // cp.chrores == 0.f means all will be multiplied by 1.f, so we can skip the processing of residual
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for nowait
#endif
for (int i=0; i<W_L*H_L; i++) {
@@ -1707,7 +1723,7 @@ if(cp.detectedge && lipschitz==true) { //enabled Lipschitz control...more memory
}
}
#ifdef _OPENMP
#ifdef _RT_NESTED_OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int dir=1; dir<4; dir++) {

View File

@@ -69,4 +69,7 @@
#define ALIGNED64
#define ALIGNED16
#endif
// Nested OpenMP parallelism is disabled when compiling with clang (Issue 2731).
// _RT_NESTED_OPENMP replaces _OPENMP as the guard around nested parallel regions
// and the omp_* runtime calls that set them up, so it must only be defined when
// OpenMP itself is enabled; otherwise a non-OpenMP build with a non-clang
// compiler would compile those omp_* calls without the OpenMP runtime.
#if defined(_OPENMP) && !defined(__clang__)
#define _RT_NESTED_OPENMP _OPENMP
#endif
#endif