Performance optimization for impulse_nr on multi-core-systems

2013-01-22 18:14:53 +01:00
parent 0a91af9c12
commit 7918fab562
1 changed files with 373 additions and 315 deletions
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -192,7 +192,58 @@ namespace rtengine {
 		//now we have tile dimensions, overlaps
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-		//adding omp here slows it down
+        // According to FFTW-Doc 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
+        // outside the parallel region and use them inside the parallel region.
+
+        // calculate max size of numblox_W.
+        int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
+        // calculate min size of numblox_W.
+		int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
+
+        // these are needed only for creation of the plans and will be freed before entering the parallel loop
+		float * Lbloxtmp;
+		float * fLbloxtmp;
+		Lbloxtmp  = fftwf_alloc_real(max_numblox_W*TS*TS);
+		fLbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
+
+		int nfwd[2]={TS,TS};
+
+        //for DCT:
+		const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
+		const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
+
+        fftwf_plan plan_forward_blox[2];
+        fftwf_plan plan_backward_blox[2];
+
+        // Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit
+		plan_forward_blox[0]  = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+		plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+		plan_forward_blox[1]  = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+		plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+		fftwf_free ( Lbloxtmp );
+		fftwf_free ( fLbloxtmp );
+
+#ifdef _OPENMP
+        // Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
+        int numtiles = numtiles_W * numtiles_H;
+        int numthreads = MIN(numtiles,omp_get_max_threads());
+        //if(options.RgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit);
+#pragma omp parallel num_threads(numthreads)
+#endif
+        {
+		//DCT block data storage
+		float * Lblox;
+		float * fLblox;
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+        {
+		Lblox  = fftwf_alloc_real(max_numblox_W*TS*TS);
+		fLblox = fftwf_alloc_real(max_numblox_W*TS*TS);
+        }
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic) collapse(2)
+#endif
 		for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
 			for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {

@@ -202,13 +253,14 @@ namespace rtengine {
 				int height = tilebottom-tiletop;

 				//input L channel
-				array2D<float> Lin(width,height,ARRAY2D_CLEAR_DATA);
+				array2D<float> Lin(width,height);
 				//wavelet denoised image
 				LabImage * labdn = new LabImage(width,height);
 				//residual between input and denoised L channel
 				array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
 				//pixel weight
 				array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
+//

 				//#ifdef _OPENMP
 				//#pragma omp parallel for
@@ -233,9 +285,9 @@ namespace rtengine {
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);

-							Ldetail[i1][j1] = 0;
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
 					}
 				} else {//image is not raw; use Lab parametrization
@@ -262,9 +314,9 @@ namespace rtengine {
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);

-							Ldetail[i1][j1] = 0;
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
 					}
 				}
@@ -282,21 +334,32 @@ namespace rtengine {
 				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
 				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
 				//the first level only, 7 means subsample the first three levels, etc.
-				wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
-				wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
-				wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );

 				float noisevarL	 = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f));
+
 				float noisevarab = SQR(dnparams.chroma/10.0f);
+                { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
+                wavelet_decomposition* Ldecomp;
+                wavelet_decomposition* adecomp;
+                wavelet_decomposition* bdecomp;
+
+                Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
+                adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
+                bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );

 				//WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
-				WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
+				WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab);

-				Ldecomp.reconstruct(labdn->data);
-				adecomp.reconstruct(labdn->data+datalen);
-				bdecomp.reconstruct(labdn->data+2*datalen);
+				Ldecomp->reconstruct(labdn->data);
+				delete Ldecomp;
+				adecomp->reconstruct(labdn->data+datalen);
+				delete adecomp;
+				bdecomp->reconstruct(labdn->data+2*datalen);
+				delete bdecomp;
+                }

 				//TODO: at this point wavelet coefficients storage can be freed
+				//Issue 1680: Done now

 				//second impulse denoise
 				if (dnparams.luma>0.01) {
@@ -316,49 +379,35 @@ namespace rtengine {
 				// blocks are not the same thing as tiles!


-				// allocate DCT data structures

 				// calculation for detail recovery blocks
 				const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
 				const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
+
 				//const int nrtiles = numblox_W*numblox_H;
 				// end of tiling calc
-				
-				//DCT block data storage
-				float * Lblox  = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
-				float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
-				
-				
-				//make a plan for FFTW
-				fftwf_plan plan_forward_blox, plan_backward_blox;
-				
-				int nfwd[2]={TS,TS};
-				
-				//for DCT:
-				const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
-				const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
-				
-				plan_forward_blox  = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE );
-				plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE );
+                {

 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				// Main detail recovery algorithm: Block loop
 				//OpenMP here
 				//adding omp here leads to artifacts
+                AlignedBufferMP<float> buffer(width + TS + 2*blkrad*offset);
 				for (int vblk=0; vblk<numblox_H; vblk++) {
 					//printf("vblock=%d",vblk);
 					int vblkmod = vblk%8;

 					int top = (vblk-blkrad)*offset;
-					
-					float * buffer = new float [width + TS + 2*blkrad*offset];
-					float * datarow = buffer+blkrad*offset;
+                    AlignedBuffer<float>* pBuf = buffer.acquire();
+//					float * buffer = new float [width + TS + 2*blkrad*offset];
+					float * datarow = (float*)pBuf->data +blkrad*offset;

 					//#ifdef _OPENMP
 					//#pragma omp parallel for
 					//#endif
 					//TODO: implement using AlignedBufferMP
+//					#pragma omp parallel for
 					for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
 						int row = top + i;
 						int rr = row;
@@ -393,13 +442,14 @@ namespace rtengine {
 							}
 						}
 					}//end of filling block row
-					delete[] buffer;
+					buffer.release(pBuf);

 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					
 					//fftwf_print_plan (plan_forward_blox);
-					fftwf_execute_r2r(plan_forward_blox,Lblox,fLblox);		// DCT an entire row of tiles
-					
+					if(numblox_W == max_numblox_W)
+                        fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox);		// DCT an entire row of tiles
+                    else
+                        fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox);		// DCT an entire row of tiles
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					// now process the vblk row of blocks for noise reduction
 					for (int hblk=0; hblk<numblox_W; hblk++) {
@@ -411,7 +461,10 @@ namespace rtengine {
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 					//now perform inverse FT of an entire row of blocks
-					fftwf_execute_r2r(plan_backward_blox,fLblox,Lblox);	//for DCT
+					if(numblox_W == max_numblox_W)
+                        fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox);	//for DCT
+                    else
+                        fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox);	//for DCT

 					int topproc = (vblk-blkrad)*offset;

@@ -421,20 +474,9 @@ namespace rtengine {
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 				}//end of vertical block loop
-				
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-				// clean up
-				//#pragma omp single nowait
-				fftwf_destroy_plan( plan_forward_blox );
-				//#pragma omp single nowait
-				fftwf_destroy_plan( plan_backward_blox );
-				
-				fftwf_free ( Lblox);
-				fftwf_free ( fLblox);
-				
-				fftwf_cleanup();
-				
+                }
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				for (int i=0; i<height; i++) {
 					for (int j=0; j<width; j++) {
@@ -470,7 +512,7 @@ namespace rtengine {
 				//convert back to RGB and write to destination array
 				if (isRAW) {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
@@ -496,7 +538,7 @@ namespace rtengine {
 					}
 				} else {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
@@ -534,7 +576,15 @@ namespace rtengine {

 			}//end of tile row
 		}//end of tile loop
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+{
+		fftwf_free ( Lblox);
+		fftwf_free ( fLblox);
+}

+        }
 		//copy denoised image to output
 		memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));

@@ -549,6 +599,13 @@ namespace rtengine {

 		delete dsttmp;

+    // destroy the plans
+	fftwf_destroy_plan( plan_forward_blox[0] );
+	fftwf_destroy_plan( plan_backward_blox[0] );
+	fftwf_destroy_plan( plan_forward_blox[1] );
+	fftwf_destroy_plan( plan_backward_blox[1] );
+	fftwf_cleanup();
+
 	}//end of main RGB_denoise


@@ -561,11 +618,10 @@ namespace rtengine {
 	void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail )	//for DCT
 	{
 		float * nbrwt  = new float[TS*TS];	//for DCT
-		
 		int blkstart = hblproc*TS*TS;

 		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation	//for DCT
-
+#pragma omp parallel for
 		for (int n=0; n<TS*TS; n++) {		//for DCT
 			fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
 		}//output neighbor averaged result
@@ -586,19 +642,19 @@ namespace rtengine {
 		const int numblox_W = ceil(((float)(width))/(offset));
 		const float DCTnorm = 1.0f/(4*TS*TS); //for DCT

+		int imin = MAX(0,-top);
+		int bottom = MIN( top+TS,height);
+		int imax = bottom - top;
+
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		//add row of tiles to output image
 		for (int hblk=0; hblk < numblox_W; hblk++) {
 			int left = (hblk-blkrad)*offset;
-			int bottom = MIN( top+TS,height);
 			int right  = MIN(left+TS, width);
-			int imin = MAX(0,-top);
 			int jmin = MAX(0,-left);
-			int imax = bottom - top;
 			int jmax = right - left;
-			
 			int indx = hblk*TS;

 			for (int i=imin; i<imax; i++)
@@ -719,7 +775,7 @@ namespace rtengine {
 				//simple wavelet shrinkage
 				float * sfave = new float[Wlvl_L*Hlvl_L];
 				array2D<float> edge(Wlvl_L,Hlvl_L);
-				AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(Wlvl_L,Hlvl_L));
+				AlignedBufferMP<double>* buffer = new AlignedBufferMP<double> (MAX(Wlvl_L,Hlvl_L));

 				//printf("\n level=%d  \n",lvl);

@@ -829,7 +885,8 @@ namespace rtengine {
 											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
-
+//        printf("maxlevel = %d\n",maxlvl);
+//omp_set_nested(true);
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@@ -847,11 +904,12 @@ namespace rtengine {
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			
+//            printf("Hab : %d\n", Hlvl_ab);
+//            printf("Wab : %d\n", Wlvl_ab);
 			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
 					  skip_L, skip_ab, noisevar_L, noisevar_ab);
 		}
-		
+//omp_set_nested(false);
 	}

 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%