From 7918fab5621bbdbd48ef20ee59e2f7079680656c Mon Sep 17 00:00:00 2001
From: Ingo <heckflosse@i-weyrich.de>
Date: Tue, 22 Jan 2013 18:14:53 +0100
Subject: [PATCH] Performance optimization for impulse_nr on multi-core-systems

---
 rtengine/FTblockDN.cc | 688 +++++++++++++++++++++++-------------------
 1 file changed, 373 insertions(+), 315 deletions(-)
diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 140b85135..ebe60d550 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -59,87 +59,87 @@
 #define epsilon 0.001f/(TS*TS) //tolerance
 
 namespace rtengine {
-	
+
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	/*
-	 Structure of the algorithm: 
-	 
-	 1. Compute an initial denoise of the image via undecimated wavelet transform 
+	 Structure of the algorithm:
+
+	 1. Compute an initial denoise of the image via undecimated wavelet transform
 	 and universal thresholding modulated by user input.
-	 2. Decompose the residual image into TSxTS size tiles, shifting by 'offset' each step 
+	 2. Decompose the residual image into TSxTS size tiles, shifting by 'offset' each step
 	 (so roughly each pixel is in (TS/offset)^2 tiles); Discrete Cosine transform the tiles.
 	 3. Filter the DCT data to pick out patterns missed by the wavelet denoise
 	 4. Inverse DCT the denoised tile data and combine the tiles into a denoised output image.
-	 
+
 	 */
-	
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	
-	
-	
-	
+
+
+
+
+
 	void ImProcFunctions::RGB_denoise(Imagefloat * src, Imagefloat * dst, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const procparams::DefringeParams & defringe)
-	{	
-		
+	{
+
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		
+
 		/*if (plistener) {
 		 plistener->setProgressStr ("Denoise...");
 		 plistener->setProgress (0.0);
 		 }*/
-		
+
 		volatile double progress = 0.0;
-		
+
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		
+
 		const short int imheight=src->height, imwidth=src->width;
-		
+
 		if (dnparams.luma==0 && dnparams.chroma==0) {
             //nothing to do; copy src to dst
             memcpy(dst->data,src->data,dst->width*dst->height*3*sizeof(float));
 			return;
 		}
-		
+
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 		// gamma transform for input data
 		float gam = dnparams.gamma;
 		float gamthresh = 0.001;
 		float gamslope = exp(log((double)gamthresh)/gam)/gamthresh;
-		
+
 		LUTf gamcurve(65536,0);
-		
+
 		for (int i=0; i<65536; i++) {
 			gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
 		}
-		
+
 		// inverse gamma transform for output data
 		float igam = 1/gam;
 		float igamthresh = gamthresh*gamslope;
 		float igamslope = 1/gamslope;
-		
+
 		LUTf igamcurve(65536,0);
-		
+
 		for (int i=0; i<65536; i++) {
 			igamcurve[i] = (Color::gamma((float)i/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 		}
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		
-		
+
+
 		//srand((unsigned)time(0));//test with random data
-		
+
 		const float gain = pow (2.0, dnparams.expcomp);
-		
+
 		float noisevar_Ldetail = SQR((SQR(100-dnparams.Ldetail) + 50*(100-dnparams.Ldetail)) * TS * 0.5f);
-		
-		
+
+
 		array2D<float> tilemask_in(TS,TS);
 		array2D<float> tilemask_out(TS,TS);
-		
+
 		const int border = MAX(2,TS/16);
-		
+
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@@ -150,25 +150,25 @@ namespace rtengine {
 			for (int j=0; j<TS; j++) {
 				float j1 = abs((j>TS/2 ? j-TS+1 : j));
 				tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-				
+
 				tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-				
+
 			}
 		}
-		
+
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 		// begin tile processing of image
-		
+
 		//output buffer
 		Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
 		for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
-		
+
 		const int tilesize = 1024;
 		const int overlap = 128;
-		
+
 		int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip;
-		
+
 		if (imwidth<tilesize) {
 			numtiles_W = 1;
 			tileWskip = imwidth;
@@ -191,25 +191,77 @@ namespace rtengine {
 		}
 		//now we have tile dimensions, overlaps
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		
-		//adding omp here slows it down
+
+        // According to FFTW-Doc 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
+        // outside the parallel region and use them inside the parallel region.
+
+        // calculate max size of numblox_W.
+        int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
+        // calculate min size of numblox_W.
+		int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
+
+        // these are needed only for creation of the plans and will be freed before entering the parallel loop
+		float * Lbloxtmp;
+		float * fLbloxtmp;
+		Lbloxtmp  = fftwf_alloc_real(max_numblox_W*TS*TS);
+		fLbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
+
+		int nfwd[2]={TS,TS};
+
+        //for DCT:
+		const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
+		const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
+
+        fftwf_plan plan_forward_blox[2];
+        fftwf_plan plan_backward_blox[2];
+
+        // Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit
+		plan_forward_blox[0]  = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+		plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+		plan_forward_blox[1]  = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+		plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+		fftwf_free ( Lbloxtmp );
+		fftwf_free ( fLbloxtmp );
+
+#ifdef _OPENMP
+        // Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
+        int numtiles = numtiles_W * numtiles_H;
+        int numthreads = MIN(numtiles,omp_get_max_threads());
+        //if(options.RgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit);
+#pragma omp parallel num_threads(numthreads)
+#endif
+        {
+		//DCT block data storage
+		float * Lblox;
+		float * fLblox;
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+        {
+		Lblox  = fftwf_alloc_real(max_numblox_W*TS*TS);
+		fLblox = fftwf_alloc_real(max_numblox_W*TS*TS);
+        }
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic) collapse(2)
+#endif
 		for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
 			for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
-				
+
 				int tileright = MIN(imwidth,tileleft+tilewidth);
 				int tilebottom = MIN(imheight,tiletop+tileheight);
 				int width  = tileright-tileleft;
 				int height = tilebottom-tiletop;
-				
+
 				//input L channel
-				array2D<float> Lin(width,height,ARRAY2D_CLEAR_DATA);
+				array2D<float> Lin(width,height);
 				//wavelet denoised image
 				LabImage * labdn = new LabImage(width,height);
 				//residual between input and denoised L channel
 				array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
 				//pixel weight
 				array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
-				
+//
+
 				//#ifdef _OPENMP
 				//#pragma omp parallel for
 				//#endif
@@ -220,145 +272,142 @@ namespace rtengine {
 						int i1 = i - tiletop;
 						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
 							int j1 = j - tileleft;
-							
+
 							float X = gain*src->r[i][j];
 							float Y = gain*src->g[i][j];
 							float Z = gain*src->b[i][j];
-							
+
 							X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 							Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 							Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							
+
 							labdn->L[i1][j1] = Y;
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);
-							
-							Ldetail[i1][j1] = 0;
+
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
-					} 
+					}
 				} else {//image is not raw; use Lab parametrization
 					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
 						int i1 = i - tiletop;
 						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
 							int j1 = j - tileleft;
-							
+
 							//TODO: use embedded profile if present, instead of assuming sRGB
 							float rtmp = Color::igammatab_srgb[ src->r[i][j] ];
 							float gtmp = Color::igammatab_srgb[ src->g[i][j] ];
 							float btmp = Color::igammatab_srgb[ src->b[i][j] ];
-							
+
 							//perhaps use LCH or YCrCb ???
 							float X = xyz_sRGB[0][0]*rtmp + xyz_sRGB[0][1]*gtmp + xyz_sRGB[0][2]*btmp;
 							float Y = xyz_sRGB[1][0]*rtmp + xyz_sRGB[1][1]*gtmp + xyz_sRGB[1][2]*btmp;
 							float Z = xyz_sRGB[2][0]*rtmp + xyz_sRGB[2][1]*gtmp + xyz_sRGB[2][2]*btmp;
-							
+
 							X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 							Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 							Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							
+
 							labdn->L[i1][j1] = Y;
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);
-							
-							Ldetail[i1][j1] = 0;
+
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
 					}
 				}
-				
-				
+
+
 				//initial impulse denoise
 				if (dnparams.luma>0.01) {
 					impulse_nr (labdn, MIN(50.0f,dnparams.luma)/20.0f);
 				}
-				
+
 				int datalen = labdn->W * labdn->H;
-				
+
 				//now perform basic wavelet denoise
-				//last two arguments of wavelet decomposition are max number of wavelet decomposition levels; 
-				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as 
-				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample 
+				//last two arguments of wavelet decomposition are max number of wavelet decomposition levels;
+				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
+				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
 				//the first level only, 7 means subsample the first three levels, etc.
-				wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
-				wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
-				wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
-				
+
 				float noisevarL	 = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f));
+
 				float noisevarab = SQR(dnparams.chroma/10.0f);
-				
+                { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
+                wavelet_decomposition* Ldecomp;
+                wavelet_decomposition* adecomp;
+                wavelet_decomposition* bdecomp;
+
+                Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
+                adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
+                bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
+
 				//WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
-				WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
-				
-				Ldecomp.reconstruct(labdn->data);
-				adecomp.reconstruct(labdn->data+datalen);
-				bdecomp.reconstruct(labdn->data+2*datalen);
-				
+				WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab);
+
+				Ldecomp->reconstruct(labdn->data);
+				delete Ldecomp;
+				adecomp->reconstruct(labdn->data+datalen);
+				delete adecomp;
+				bdecomp->reconstruct(labdn->data+2*datalen);
+				delete bdecomp;
+                }
+
 				//TODO: at this point wavelet coefficients storage can be freed
-				
+				//Issue 1680: Done now
+
 				//second impulse denoise
 				if (dnparams.luma>0.01) {
 					impulse_nr (labdn, MIN(50.0f,dnparams.luma)/20.0f);
-				}				
+				}
 				//PF_correct_RT(dst, dst, defringe.radius, defringe.threshold);
-				
+
 				//wavelet denoised L channel
 				array2D<float> Lwavdn(width,height);
 				float * Lwavdnptr = Lwavdn;
 				memcpy (Lwavdnptr, labdn->data, width*height*sizeof(float));
-				
+
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				// now do detail recovery using block DCT to detect  
+				// now do detail recovery using block DCT to detect
 				// patterns missed by wavelet denoise
 				// blocks are not the same thing as tiles!
-				
-				
-				// allocate DCT data structures
-				
+
+
+
 				// calculation for detail recovery blocks
 				const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
 				const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
+
 				//const int nrtiles = numblox_W*numblox_H;
 				// end of tiling calc
-				
-				//DCT block data storage
-				float * Lblox  = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
-				float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
-				
-				
-				//make a plan for FFTW
-				fftwf_plan plan_forward_blox, plan_backward_blox;
-				
-				int nfwd[2]={TS,TS};
-				
-				//for DCT:
-				const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
-				const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
-				
-				plan_forward_blox  = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE );
-				plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE );
-				
+                {
+
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				// Main detail recovery algorithm: Block loop
-				//OpenMP here			
-				//adding omp here leads to artifacts	
+				//OpenMP here
+				//adding omp here leads to artifacts
+                AlignedBufferMP<float> buffer(width + TS + 2*blkrad*offset);
 				for (int vblk=0; vblk<numblox_H; vblk++) {
 					//printf("vblock=%d",vblk);
 					int vblkmod = vblk%8;
-					
+
 					int top = (vblk-blkrad)*offset;
-					
-					float * buffer = new float [width + TS + 2*blkrad*offset];
-					float * datarow = buffer+blkrad*offset;
-					
+                    AlignedBuffer<float>* pBuf = buffer.acquire();
+//					float * buffer = new float [width + TS + 2*blkrad*offset];
+					float * datarow = (float*)pBuf->data +blkrad*offset;
+
 					//#ifdef _OPENMP
 					//#pragma omp parallel for
 					//#endif
 					//TODO: implement using AlignedBufferMP
+//					#pragma omp parallel for
 					for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
 						int row = top + i;
 						int rr = row;
@@ -366,25 +415,25 @@ namespace rtengine {
 							rr = MIN(-row,height-1);
 						} else if (row>=height) {
 							rr = MAX(0,2*height-2-row);
-						} 
-						
+						}
+
 						for (int j=0; j<labdn->W; j++) {
 							datarow[j] = (Lin[rr][j]-Lwavdn[rr][j]);
 						}
-						
+
 						for (int j=-blkrad*offset; j<0; j++) {
 							datarow[j] = datarow[MIN(-j,width-1)];
 						}
 						for (int j=width; j<width+TS+blkrad*offset; j++) {
 							datarow[j] = datarow[MAX(0,2*width-2-j)];
 						}//now we have a padded data row
-						
+
 						//now fill this row of the blocks with Lab high pass data
 						//OMP here does not add speed, better handled on the outside loop
 						for (int hblk=0; hblk<numblox_W; hblk++) {
 							int left = (hblk-blkrad)*offset;
 							int indx = (hblk)*TS;//index of block in malloc
-							
+
 							for (int j=0; j<TS; j++) {
 								Lblox[(indx + i)*TS+j]  = tilemask_in[i][j]*datarow[left+j];// luma data
 								if (top+i>=0 && top+i<height && left+j>=0 && left+j<width) {
@@ -393,66 +442,59 @@ namespace rtengine {
 							}
 						}
 					}//end of filling block row
-					delete[] buffer;
-					
+					buffer.release(pBuf);
+
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					
 					//fftwf_print_plan (plan_forward_blox);
-					fftwf_execute_r2r(plan_forward_blox,Lblox,fLblox);		// DCT an entire row of tiles
-					
+					if(numblox_W == max_numblox_W)
+                        fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox);		// DCT an entire row of tiles
+                    else
+                        fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox);		// DCT an entire row of tiles
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					// now process the vblk row of blocks for noise reduction
 					for (int hblk=0; hblk<numblox_W; hblk++) {
-						
+
 						RGBtile_denoise (fLblox, vblk, hblk, numblox_H, numblox_W, noisevar_Ldetail );
-						
+
 					}//end of horizontal block loop
-					
+
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					
+
 					//now perform inverse FT of an entire row of blocks
-					fftwf_execute_r2r(plan_backward_blox,fLblox,Lblox);	//for DCT
-					
+					if(numblox_W == max_numblox_W)
+                        fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox);	//for DCT
+                    else
+                        fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox);	//for DCT
+
 					int topproc = (vblk-blkrad)*offset;
-					
+
 					//add row of blocks to output image tile
 					RGBoutput_tile_row (Lblox, Ldetail, tilemask_out, height, width, topproc );
-					
+
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					
+
 				}//end of vertical block loop
-				
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				
-				// clean up
-				//#pragma omp single nowait
-				fftwf_destroy_plan( plan_forward_blox );
-				//#pragma omp single nowait
-				fftwf_destroy_plan( plan_backward_blox );
-				
-				fftwf_free ( Lblox);
-				fftwf_free ( fLblox);
-				
-				fftwf_cleanup();
-				
+
+                }
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				for (int i=0; i<height; i++) {
 					for (int j=0; j<width; j++) {
 						//may want to include masking threshold for large hipass data to preserve edges/detail
 						float hpdn = Ldetail[i][j]/totwt[i][j];//note that labdn initially stores the denoised hipass data
-						
+
 						labdn->L[i][j] = Lwavdn[i][j] + hpdn;
-						
+
 					}
 				}
-				
+
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				// transform denoised "Lab" to output RGB
-				
+
 				//calculate mask for feathering output tile overlaps
 				float * Vmask = new float [height+1];
 				float * Hmask = new float [width+1];
-				
+
 				for (int i=0; i<height; i++) {
 					Vmask[i] = 1;
 				}
@@ -466,78 +508,86 @@ namespace rtengine {
 					if (tileleft>0) Hmask[i] = mask;
 					if (tileright<imwidth) Hmask[width-i] = mask;
 				}
-				
+
 				//convert back to RGB and write to destination array
 				if (isRAW) {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
 						float X,Y,Z;
 						for (int j=tileleft; j<tileright; j++) {
 							int j1=j-tileleft;
-							
+
 							Y = labdn->L[i1][j1];
 							X = (labdn->a[i1][j1]) + Y;
 							Z = Y - (labdn->b[i1][j1]);
-							
+
 							X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 							Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 							Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-														
+
 							float factor = Vmask[i1]*Hmask[j1]/gain;
-							
+
 							dsttmp->r[i][j] += factor*X;
 							dsttmp->g[i][j] += factor*Y;
 							dsttmp->b[i][j] += factor*Z;
-							
+
 						}
 					}
 				} else {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
 						float X,Y,Z;
 						for (int j=tileleft; j<tileright; j++) {
 							int j1=j-tileleft;
-							
+
 							Y = labdn->L[i1][j1];
 							X = (labdn->a[i1][j1]) + Y;
 							Z = Y - (labdn->b[i1][j1]);
-							
+
 							X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 							Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 							Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-														
+
 							float factor = Vmask[i1]*Hmask[j1];
-							
+
 							float rtmp = sRGB_xyz[0][0]*X + sRGB_xyz[0][1]*Y + sRGB_xyz[0][2]*Z;
 							float gtmp = sRGB_xyz[1][0]*X + sRGB_xyz[1][1]*Y + sRGB_xyz[1][2]*Z;
 							float btmp = sRGB_xyz[2][0]*X + sRGB_xyz[2][1]*Y + sRGB_xyz[2][2]*Z;
-							
+
 							dsttmp->r[i][j] += factor*rtmp;
 							dsttmp->g[i][j] += factor*gtmp;
 							dsttmp->b[i][j] += factor*btmp;
-							
+
 						}
 					}
 				}
-				
+
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				
+
 				delete labdn;
 				delete[] Vmask;
 				delete[] Hmask;
-				
+
 			}//end of tile row
 		}//end of tile loop
-		
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+{
+		fftwf_free ( Lblox);
+		fftwf_free ( fLblox);
+}
+
+        }
 		//copy denoised image to output
 		memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
-		
+
 		if (!isRAW) {//restore original image gamma
 #ifdef _OPENMP
 #pragma omp parallel for
@@ -546,140 +596,146 @@ namespace rtengine {
 				dst->data[i] = Color::gammatab_srgb[ dst->data[i] ];
 			}
 		}
-		
+
 		delete dsttmp;
-		
+
+    // destroy the plans
+	fftwf_destroy_plan( plan_forward_blox[0] );
+	fftwf_destroy_plan( plan_backward_blox[0] );
+	fftwf_destroy_plan( plan_forward_blox[1] );
+	fftwf_destroy_plan( plan_backward_blox[1] );
+	fftwf_cleanup();
+
 	}//end of main RGB_denoise
 
 
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	
-	
-	void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail )	//for DCT
-	{			
-		float * nbrwt  = new float[TS*TS];	//for DCT
-		
-		int blkstart = hblproc*TS*TS;
-		
-		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation	//for DCT
 
+
+
+	void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail )	//for DCT
+	{
+		float * nbrwt  = new float[TS*TS];	//for DCT
+		int blkstart = hblproc*TS*TS;
+
+		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation	//for DCT
+#pragma omp parallel for
 		for (int n=0; n<TS*TS; n++) {		//for DCT
 			fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
 		}//output neighbor averaged result
-		
+
 		delete[] nbrwt;
-		
+
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 		//printf("vblk=%d  hlk=%d  wsqave=%f   ||   ",vblproc,hblproc,wsqave);
-		
+
 	}//end of function tile_denoise
-	
-	
+
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
+
 	void ImProcFunctions::RGBoutput_tile_row (float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top )
 	{
 		const int numblox_W = ceil(((float)(width))/(offset));
 		const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
 
+		int imin = MAX(0,-top);
+		int bottom = MIN( top+TS,height);
+		int imax = bottom - top;
+
 #ifdef _OPENMP
 #pragma omp parallel for
-#endif			
+#endif
 		//add row of tiles to output image
 		for (int hblk=0; hblk < numblox_W; hblk++) {
 			int left = (hblk-blkrad)*offset;
-			int bottom = MIN( top+TS,height);
 			int right  = MIN(left+TS, width);
-			int imin = MAX(0,-top);
 			int jmin = MAX(0,-left);
-			int imax = bottom - top;
 			int jmax = right - left;
-			
 			int indx = hblk*TS;
-			
+
 			for (int i=imin; i<imax; i++)
 				for (int j=jmin; j<jmax; j++) {
-					
+
 					Ldetail[top+i][left+j] += tilemask_out[i][j]*bloxrow_L[(indx + i)*TS+j]*DCTnorm; //for DCT
-					
+
 				}
 		}
-		
+
 	}
-	
+
 #undef TS
 #undef fTS
 #undef offset
 #undef epsilon
-	
-	
+
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
+
 	float ImProcFunctions::MadMax(float * DataList, int & max, int datalen) {
-		
+
 		//computes Median Absolute Deviation and Maximum of DataList
 		//DataList values should mostly have abs val < 65535
-		
+
 		int * histo = new int[65536];
 		//memset(histo, 0, 65536*sizeof(histo));
 		for (int i=0; i<65536; i++) histo[i]=0;
-		
+
 		//calculate histogram of absolute values of HH wavelet coeffs
 		for (int i=0; i<datalen; i++) {
 			histo[MAX(0,MIN(65535,abs((int)DataList[i])))]++;
 		}
-		
+
 		//find median of histogram
 		int median=0, count=0;
 		while (count<datalen/2) {
 			count += histo[median];
 			median++;
 		}
-		
+
 		//find max of histogram
 		max=65535;
 		while (histo[max]==0) {
 			max--;
 		}
-		
+
 		int count_ = count - histo[median-1];
-		
+
 		delete[] histo;
-		
-		// interpolate 
+
+		// interpolate
 		return (( (median-1) + (datalen/2-count_)/((float)(count-count_)) )/0.6745);
-		
+
 	}
-	
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	
-	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab ) 
+
+
+	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
+													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
 		const float eps = 0.01f;
 		int max;
 		float parfrac = 0.05;
-		
+
 		float madL[8][3], mada[8][3], madb[8][3];
 
-//OpenMP here		
+//OpenMP here
 		for (int lvl=0; lvl<maxlvl; lvl++) {
 			// compute median absolute deviation (MAD) of detail coefficients as robust noise estimator
-			
+
 			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
 			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-			
+
 			int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
 			int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
-			
+
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
@@ -690,39 +746,39 @@ namespace rtengine {
 				madb[lvl][dir-1] = SQR(MadMax(WavCoeffs_b[dir], max, Wlvl_ab*Hlvl_ab));
 			}
 		}
-		
+
 		for (int lvl=maxlvl-1; lvl>=0; lvl--) {//for levels less than max, use level diff to make edge mask
 			//for (int lvl=0; lvl<maxlvl; lvl++) {
-			
+
 			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
 			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-			
+
 			int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
 			int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
-			
+
 			float skip_L = WaveletCoeffs_L.level_stride(lvl);
 			float skip_ab = WaveletCoeffs_a.level_stride(lvl);
-			
+
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			
+
 			if (lvl==maxlvl-1) {
 				ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
 						  skip_L, skip_ab, noisevar_L, noisevar_ab);//TODO: this implies redundant evaluation of MAD
 			} else {
-				
+
 				float ** WavPars_L = WaveletCoeffs_L.level_coeffs(lvl+1);
 				//float ** WavPars_a = WaveletCoeffs_a.level_coeffs(lvl+1);
 				//float ** WavPars_b = WaveletCoeffs_b.level_coeffs(lvl+1);
-				
+
 				//simple wavelet shrinkage
-				float * sfave = new float[Wlvl_L*Hlvl_L]; 
+				float * sfave = new float[Wlvl_L*Hlvl_L];
 				array2D<float> edge(Wlvl_L,Hlvl_L);
-				AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(Wlvl_L,Hlvl_L));
-				
+				AlignedBufferMP<double>* buffer = new AlignedBufferMP<double> (MAX(Wlvl_L,Hlvl_L));
+
 				//printf("\n level=%d  \n",lvl);
-				
+
 				for (int dir=1; dir<4; dir++) {
 					float mad_L = madL[lvl][dir-1];
 					float mad_a = noisevar_ab*mada[lvl][dir-1];
@@ -730,136 +786,138 @@ namespace rtengine {
 					//float mad_Lpar = madL[lvl+1][dir-1];
 					//float mad_apar = mada[lvl+1][dir-1];
 					//float mad_bpar = mada[lvl+1][dir-1];
-					
+
 					//float skip_ab_ratio = WaveletCoeffs_a.level_stride(lvl+1)/skip_ab;
 					float skip_L_ratio =  WaveletCoeffs_L.level_stride(lvl+1)/skip_L;
-					
+
 					if (noisevar_ab>0.01) {
-						
+
 						//printf("  dir=%d  mad_L=%f		mad_a=%f		mad_b=%f	\n",dir,sqrt(mad_L),sqrt(mad_a),sqrt(mad_b));
-						
-//OpenMP here						
+
+//OpenMP here
 						for (int i=0; i<Hlvl_ab; i++) {
 							for (int j=0; j<Wlvl_ab; j++) {
-								
+
 								int coeffloc_ab = i*Wlvl_ab+j;
 								//int coeffloc_abpar = (MAX(0,i-skip_ab)*Wlvl_ab+MAX(0,j-skip_ab))/skip_ab_ratio;
-								
+
 								int coeffloc_L	= ((i*skip_L)/skip_ab)*Wlvl_L + ((j*skip_L)/skip_ab);
-								
+
 								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ])+eps;
 								float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
 								float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
-								
+
 								//float edgefactor = 1-exp(-mag_L/(9*mad_L));// * exp(-mag_a/(4*mad_a)) * exp(-mag_b/(4*mad_b));
-								
+
 								//float coeff_a = sqrt(SQR(WavCoeffs_a[dir][coeffloc_ab])/mad_a+SQR(parfrac*WavPars_a[dir][coeffloc_abpar])/mad_apar);
 								//float coeff_b = sqrt(SQR(WavCoeffs_b[dir][coeffloc_ab])/mad_b+SQR(parfrac*WavPars_b[dir][coeffloc_abpar])/mad_bpar);
-								
+
 								// 'firm' threshold of chroma coefficients
 								//WavCoeffs_a[dir][coeffloc_ab] *= edgefactor*(coeff_a>2 ? 1 : (coeff_a<1 ? 0 : (coeff_a - 1)));
 								//WavCoeffs_b[dir][coeffloc_ab] *= edgefactor*(coeff_b>2 ? 1 : (coeff_b<1 ? 0 : (coeff_b - 1)));
-								
+
 								//float satfactor_a = mad_a/(mad_a+0.5*SQR(WavCoeffs_a[0][coeffloc_ab]));
 								//float satfactor_b = mad_b/(mad_b+0.5*SQR(WavCoeffs_b[0][coeffloc_ab]));
-								
+
 								WavCoeffs_a[dir][coeffloc_ab] *= SQR(1-exp(-(mag_a/mad_a)-(mag_L/(9*mad_L)))/*satfactor_a*/);
 								WavCoeffs_b[dir][coeffloc_ab] *= SQR(1-exp(-(mag_b/mad_b)-(mag_L/(9*mad_L)))/*satfactor_b*/);
-								
+
 							}
 						}//now chrominance coefficients are denoised
 					}
-					
+
 					if (noisevar_L>0.01) {
 						mad_L *= noisevar_L*5/(lvl+1);
-//OpenMP here						
-						for (int i=0; i<Hlvl_L; i++) 
+//OpenMP here
+						for (int i=0; i<Hlvl_L; i++)
 							for (int j=0; j<Wlvl_L; j++) {
-								
+
 								int coeffloc_L = i*Wlvl_L+j;
 								int coeffloc_Lpar = (MAX(0,i-skip_L)*Wlvl_L+MAX(0,j-skip_L))/skip_L_ratio;
-								
+
 								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
 								//float mag_Lpar = SQR(parfrac*WavPars_L[dir][coeffloc_Lpar]);
 								//float sf_L = SQR(1-expf(-(mag_L/mad_L)-(mag_Lpar/mad_L)));
 								float sf_L = mag_L/(mag_L+mad_L*exp(-mag_L/(9*mad_L))+eps);
-								
+
 								sfave[coeffloc_L] = sf_L;
-								
+
 								//edge[i][j] = (WavCoeffs_L[dir][coeffloc_L] - WavPars_L[dir][coeffloc_Lpar]);
 							}
-						
+
 						//blur edge measure
 						//gaussHorizontal<float> (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false /*multiThread*/);
 						//gaussVertical<float>   (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false);
-						
+
 						boxblur(sfave, sfave, lvl+2, lvl+2, Wlvl_L, Hlvl_L);//increase smoothness by locally averaging shrinkage
-//OpenMP here						
-						for (int i=0; i<Hlvl_L; i++) 
+//OpenMP here
+						for (int i=0; i<Hlvl_L; i++)
 							for (int j=0; j<Wlvl_L; j++) {
-								
+
 								int coeffloc_L = i*Wlvl_L+j;
-								
+
 								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
 								//float sf_L = SQR(1-expf(-(mag_L/mad_L)-(mag_Lpar/mad_L)));
-								
+
 								float edgefactor = 1;//expf(-SQR(edge[i][j])/mad_L);
-								
+
 								float sf_L = mag_L/(mag_L + edgefactor*mad_L*exp(-mag_L/(9*mad_L))+eps);
-								
+
 								//use smoothed shrinkage unless local shrinkage is much less
 								WavCoeffs_L[dir][coeffloc_L] *= (SQR(edgefactor*sfave[coeffloc_L])+SQR(sf_L))/(edgefactor*sfave[coeffloc_L]+sf_L+eps);
-								
+
 							}//now luminance coeffs are denoised
-						
+
 					}
-					
+
 				}
 				delete[] sfave;
 				delete buffer;
-				
+
 			}
 		}
 	}
-	
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	void ImProcFunctions::WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab ) 
+
+	void ImProcFunctions::WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
+											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
-
+//        printf("maxlevel = %d\n",maxlvl);
+//omp_set_nested(true);
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int lvl=0; lvl<maxlvl; lvl++) {
-			
+
 			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
 			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-			
+
 			int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
 			int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
-			
+
 			float skip_L = WaveletCoeffs_L.level_stride(lvl);
 			float skip_ab = WaveletCoeffs_a.level_stride(lvl);
-			
+
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			
+//            printf("Hab : %d\n", Hlvl_ab);
+//            printf("Wab : %d\n", Wlvl_ab);
 			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
 					  skip_L, skip_ab, noisevar_L, noisevar_ab);
 		}
-		
+//omp_set_nested(false);
 	}
-	
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
-									int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float noisevar_ab) 
-	{	
+
+	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
+									int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float noisevar_ab)
+	{
 		//simple wavelet shrinkage
 		const float eps = 0.01f;
 		float * sfave = new float[W_L*H_L];
@@ -867,76 +925,76 @@ namespace rtengine {
 		float * sfaveb = new float[W_L*H_L];
 
 		int max;
-		
+
 		//printf("\n level=%d  \n",level);
-	
+
 		for (int dir=1; dir<4; dir++) {
-			float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L));	
+			float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L));
 			float mada = SQR(MadMax(WavCoeffs_a[dir], max, W_ab*H_ab));
 			float madb = SQR(MadMax(WavCoeffs_b[dir], max, W_ab*H_ab));
-			
-			
+
+
 			//printf("  dir=%d  mad_L=%f	mad_a=%f	mad_b=%f	\n",dir,sqrt(madL),sqrt(mada),sqrt(madb));
 
 			float mad_L = madL*noisevar_L*5/(level+1);
 			float mad_a = mada*noisevar_ab;
 			float mad_b = madb*noisevar_ab;
-			
+
 			if (noisevar_ab>0.01) {
 //OpenMP here
 
 #ifdef _OPENMP
 #pragma omp parallel for
-#endif				
+#endif
 				for (int i=0; i<H_ab; i++) {
 					for (int j=0; j<W_ab; j++) {
-						
+
 						int coeffloc_ab = i*W_ab+j;
 						int coeffloc_L	= ((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab);
-						
+
 						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ])+eps;
 						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
 						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
-						
+
 						sfavea[coeffloc_ab] = (1-exp(-(mag_a/mad_a)-(mag_L/(9*madL))));
 						sfaveb[coeffloc_ab] = (1-exp(-(mag_b/mad_b)-(mag_L/(9*madL))));
-						
+
 						// 'firm' threshold of chroma coefficients
 						//WavCoeffs_a[dir][coeffloc_ab] *= (1-exp(-(mag_a/mad_a)-(mag_L/(9*madL))));//(coeff_a>2*thresh_a ? 1 : (coeff_a<thresh_a ? 0 : (coeff_a/thresh_a - 1)));
 						//WavCoeffs_b[dir][coeffloc_ab] *= (1-exp(-(mag_b/mad_b)-(mag_L/(9*madL))));//(coeff_b>2*thresh_b ? 1 : (coeff_b<thresh_b ? 0 : (coeff_b/thresh_b - 1)));
-												
-						
+
+
 					}
 				}//now chrominance coefficients are denoised
-				
+
 				boxblur(sfavea, sfavea, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
 				boxblur(sfaveb, sfaveb, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
 
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-				for (int i=0; i<H_ab; i++) 
+				for (int i=0; i<H_ab; i++)
 					for (int j=0; j<W_ab; j++) {
-						
+
 						int coeffloc_ab = i*W_ab+j;
 						int coeffloc_L	= ((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab);
-						
+
 						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ])+eps;
 						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
 						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
-						
+
 						float sfa = (1-exp(-(mag_a/mad_a)-(mag_L/(9*madL))));
 						float sfb = (1-exp(-(mag_b/mad_b)-(mag_L/(9*madL))));
-						
+
 						//use smoothed shrinkage unless local shrinkage is much less
 						WavCoeffs_a[dir][coeffloc_ab] *= (SQR(sfavea[coeffloc_ab])+SQR(sfa))/(sfavea[coeffloc_ab]+sfa+eps);
 						WavCoeffs_b[dir][coeffloc_ab] *= (SQR(sfaveb[coeffloc_ab])+SQR(sfb))/(sfaveb[coeffloc_ab]+sfb+eps);
-						
+
 					}//now chrominance coefficients are denoised
 			}
-			
+
 			if (noisevar_L>0.01) {
-//OpenMP here				
+//OpenMP here
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@@ -944,28 +1002,28 @@ namespace rtengine {
 
 					float mag = SQR(WavCoeffs_L[dir][i]);
 					float shrinkfactor = mag/(mag+mad_L*exp(-mag/(9*mad_L))+eps);
-										
+
 					//WavCoeffs_L[dir][i] *= shrinkfactor;
 					sfave[i] = shrinkfactor;
 				}
-//OpenMP here				
+//OpenMP here
 				boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 				for (int i=0; i<W_L*H_L; i++) {
-					
-					
+
+
 					float mag = SQR(WavCoeffs_L[dir][i]);
 					float sf = mag/(mag+mad_L*exp(-mag/(9*mad_L))+eps);
-					
+
 					//use smoothed shrinkage unless local shrinkage is much less
 					WavCoeffs_L[dir][i] *= (SQR(sfave[i])+SQR(sf))/(sfave[i]+sf+eps);
-					
+
 				}//now luminance coefficients are denoised
 			}
-			
-			
+
+
 		}
 		delete[] sfave;
 		delete[] sfavea;
@@ -973,11 +1031,11 @@ namespace rtengine {
 
 	}
 
-	
-	
+
+
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	
+
+
 }