Performance optimization for impulse_nr on multi-core systems

2013-01-22 18:14:53 +01:00 · 2013-01-22 18:14:53 +01:00 · 7918fab562
commit 7918fab562
parent 0a91af9c12
1 changed files with 373 additions and 315 deletions
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@ -192,7 +192,58 @@ namespace rtengine {
 		//now we have tile dimensions, overlaps
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		//adding omp here slows it down
+        // According to the FFTW documentation, 'it is safe to execute the same plan in parallel by multiple threads', so we create the 4 plans
        // once, outside the parallel region, and only execute them inside the parallel region.
        // calculate max size of numblox_W.
        int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
        // calculate min size of numblox_W.
 		int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
        // these are needed only for creation of the plans and will be freed before entering the parallel loop
 		float * Lbloxtmp;
 		float * fLbloxtmp;
 		Lbloxtmp  = fftwf_alloc_real(max_numblox_W*TS*TS);
 		fLbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
 		int nfwd[2]={TS,TS};
        //for DCT:
 		const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
 		const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
        fftwf_plan plan_forward_blox[2];
        fftwf_plan plan_backward_blox[2];
        // Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up plan execution a bit
 		plan_forward_blox[0]  = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
 		plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
 		plan_forward_blox[1]  = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
 		plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
 		fftwf_free ( Lbloxtmp );
 		fftwf_free ( fLbloxtmp );
 #ifdef _OPENMP
        // Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
        int numtiles = numtiles_W * numtiles_H;
        int numthreads = MIN(numtiles,omp_get_max_threads());
        //if(options.RgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit);
 #pragma omp parallel num_threads(numthreads)
 #endif
        {
 		//DCT block data storage
 		float * Lblox;
 		float * fLblox;
 #ifdef _OPENMP
 #pragma omp critical
 #endif
        {
 		Lblox  = fftwf_alloc_real(max_numblox_W*TS*TS);
 		fLblox = fftwf_alloc_real(max_numblox_W*TS*TS);
        }
 #ifdef _OPENMP
 #pragma omp for schedule(dynamic) collapse(2)
 #endif
 		for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
 			for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
@ -202,13 +253,14 @@ namespace rtengine {
 				int height = tilebottom-tiletop;
 				//input L channel
-				array2D<float> Lin(width,height,ARRAY2D_CLEAR_DATA);
+				array2D<float> Lin(width,height);
 				//wavelet denoised image
 				LabImage * labdn = new LabImage(width,height);
 				//residual between input and denoised L channel
 				array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
 				//pixel weight
 				array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
 //
 				//#ifdef _OPENMP
 				//#pragma omp parallel for
@ -233,9 +285,9 @@ namespace rtengine {
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);
-							Ldetail[i1][j1] = 0;
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
 					}
 				} else {//image is not raw; use Lab parametrization
@ -262,9 +314,9 @@ namespace rtengine {
 							labdn->a[i1][j1] = (X-Y);
 							labdn->b[i1][j1] = (Y-Z);
-							Ldetail[i1][j1] = 0;
+//							Ldetail[i1][j1] = 0;
 							Lin[i1][j1] = Y;
-							totwt[i1][j1] = 0;
+//							totwt[i1][j1] = 0;
 						}
 					}
 				}
@ -282,21 +334,32 @@ namespace rtengine {
 				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
 				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
 				//the first level only, 7 means subsample the first three levels, etc.
 				wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
 				wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
 				wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
 				float noisevarL	 = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f));
 				float noisevarab = SQR(dnparams.chroma/10.0f);
                { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
                wavelet_decomposition* Ldecomp;
                wavelet_decomposition* adecomp;
                wavelet_decomposition* bdecomp;
                Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
                adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
                bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
 				//WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
-				WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
+				WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab);
-				Ldecomp.reconstruct(labdn->data);
+				Ldecomp->reconstruct(labdn->data);
-				adecomp.reconstruct(labdn->data+datalen);
+				delete Ldecomp;
-				bdecomp.reconstruct(labdn->data+2*datalen);
+				adecomp->reconstruct(labdn->data+datalen);
 				delete adecomp;
 				bdecomp->reconstruct(labdn->data+2*datalen);
 				delete bdecomp;
                }
 				//TODO: at this point wavelet coefficients storage can be freed
 				//Issue 1680: Done now
 				//second impulse denoise
 				if (dnparams.luma>0.01) {
@ -316,49 +379,35 @@ namespace rtengine {
 				// blocks are not the same thing as tiles!
 				// allocate DCT data structures
 				// calculation for detail recovery blocks
 				const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
 				const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
 				//const int nrtiles = numblox_W*numblox_H;
 				// end of tiling calc
-				
+                {
 				//DCT block data storage
 				float * Lblox  = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
 				float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
 				//make a plan for FFTW
 				fftwf_plan plan_forward_blox, plan_backward_blox;
 				int nfwd[2]={TS,TS};
 				//for DCT:
 				const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
 				const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
 				plan_forward_blox  = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE );
 				plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE );
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				// Main detail recovery algorithm: Block loop
 				//OpenMP here
 				//adding omp here leads to artifacts
                AlignedBufferMP<float> buffer(width + TS + 2*blkrad*offset);
 				for (int vblk=0; vblk<numblox_H; vblk++) {
 					//printf("vblock=%d",vblk);
 					int vblkmod = vblk%8;
 					int top = (vblk-blkrad)*offset;
-					
+                    AlignedBuffer<float>* pBuf = buffer.acquire();
-					float * buffer = new float [width + TS + 2*blkrad*offset];
+//					float * buffer = new float [width + TS + 2*blkrad*offset];
-					float * datarow = buffer+blkrad*offset;
+					float * datarow = (float*)pBuf->data +blkrad*offset;
 					//#ifdef _OPENMP
 					//#pragma omp parallel for
 					//#endif
 					//TODO: implement using AlignedBufferMP
 //					#pragma omp parallel for
 					for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
 						int row = top + i;
 						int rr = row;
@ -393,13 +442,14 @@ namespace rtengine {
 							}
 						}
 					}//end of filling block row
-					delete[] buffer;
+					buffer.release(pBuf);
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					//fftwf_print_plan (plan_forward_blox);
-					fftwf_execute_r2r(plan_forward_blox,Lblox,fLblox);		// DCT an entire row of tiles
+					if(numblox_W == max_numblox_W)
-					
+                        fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox);		// DCT an entire row of tiles
                    else
                        fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox);		// DCT an entire row of tiles
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					// now process the vblk row of blocks for noise reduction
 					for (int hblk=0; hblk<numblox_W; hblk++) {
@ -411,7 +461,10 @@ namespace rtengine {
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					//now perform inverse FT of an entire row of blocks
-					fftwf_execute_r2r(plan_backward_blox,fLblox,Lblox);	//for DCT
+					if(numblox_W == max_numblox_W)
                        fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox);	//for DCT
                    else
                        fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox);	//for DCT
 					int topproc = (vblk-blkrad)*offset;
@ -421,20 +474,9 @@ namespace rtengine {
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				}//end of vertical block loop
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				// clean up
+                }
 				//#pragma omp single nowait
 				fftwf_destroy_plan( plan_forward_blox );
 				//#pragma omp single nowait
 				fftwf_destroy_plan( plan_backward_blox );
 				fftwf_free ( Lblox);
 				fftwf_free ( fLblox);
 				fftwf_cleanup();
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 				for (int i=0; i<height; i++) {
 					for (int j=0; j<width; j++) {
@ -470,7 +512,7 @@ namespace rtengine {
 				//convert back to RGB and write to destination array
 				if (isRAW) {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
@ -496,7 +538,7 @@ namespace rtengine {
 					}
 				} else {
 #ifdef _OPENMP
-#pragma omp parallel for
+//#pragma omp parallel for
 #endif
 					for (int i=tiletop; i<tilebottom; i++){
 						int i1 = i-tiletop;
@ -534,7 +576,15 @@ namespace rtengine {
 			}//end of tile row
 		}//end of tile loop
 #ifdef _OPENMP
 #pragma omp critical
 #endif
 {
 		fftwf_free ( Lblox);
 		fftwf_free ( fLblox);
 }
        }
 		//copy denoised image to output
 		memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
@ -549,6 +599,13 @@ namespace rtengine {
 		delete dsttmp;
    // destroy the plans
 	fftwf_destroy_plan( plan_forward_blox[0] );
 	fftwf_destroy_plan( plan_backward_blox[0] );
 	fftwf_destroy_plan( plan_forward_blox[1] );
 	fftwf_destroy_plan( plan_backward_blox[1] );
 	fftwf_cleanup();
 	}//end of main RGB_denoise
@ -561,11 +618,10 @@ namespace rtengine {
 	void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail )	//for DCT
 	{
 		float * nbrwt  = new float[TS*TS];	//for DCT
 		int blkstart = hblproc*TS*TS;
 		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation	//for DCT
-
+#pragma omp parallel for
 		for (int n=0; n<TS*TS; n++) {		//for DCT
 			fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
 		}//output neighbor averaged result
@ -586,19 +642,19 @@ namespace rtengine {
 		const int numblox_W = ceil(((float)(width))/(offset));
 		const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
 		int imin = MAX(0,-top);
 		int bottom = MIN( top+TS,height);
 		int imax = bottom - top;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		//add row of tiles to output image
 		for (int hblk=0; hblk < numblox_W; hblk++) {
 			int left = (hblk-blkrad)*offset;
 			int bottom = MIN( top+TS,height);
 			int right  = MIN(left+TS, width);
 			int imin = MAX(0,-top);
 			int jmin = MAX(0,-left);
 			int imax = bottom - top;
 			int jmax = right - left;
 			int indx = hblk*TS;
 			for (int i=imin; i<imax; i++)
@ -719,7 +775,7 @@ namespace rtengine {
 				//simple wavelet shrinkage
 				float * sfave = new float[Wlvl_L*Hlvl_L];
 				array2D<float> edge(Wlvl_L,Hlvl_L);
-				AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(Wlvl_L,Hlvl_L));
+				AlignedBufferMP<double>* buffer = new AlignedBufferMP<double> (MAX(Wlvl_L,Hlvl_L));
 				//printf("\n level=%d  \n",lvl);
@ -829,7 +885,8 @@ namespace rtengine {
 											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
-
+//        printf("maxlevel = %d\n",maxlvl);
 //omp_set_nested(true);
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
@ -847,11 +904,12 @@ namespace rtengine {
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			
+//            printf("Hab : %d\n", Hlvl_ab);
 //            printf("Wab : %d\n", Wlvl_ab);
 			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
 					  skip_L, skip_ab, noisevar_L, noisevar_ab);
 		}
-		
+//omp_set_nested(false);
 	}
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%