Performance optimization for impulse_nr on multi-core-systems

This commit is contained in:
Ingo 2013-01-22 18:14:53 +01:00
parent 0a91af9c12
commit 7918fab562

View File

@ -192,7 +192,58 @@ namespace rtengine {
//now we have tile dimensions, overlaps //now we have tile dimensions, overlaps
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//adding omp here slows it down // According to FFTW-Doc 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
// outside the parallel region and use them inside the parallel region.
// calculate max size of numblox_W.
int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
// calculate min size of numblox_W.
int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
// these are needed only for creation of the plans and will be freed before entering the parallel loop
float * Lbloxtmp;
float * fLbloxtmp;
Lbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
fLbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
int nfwd[2]={TS,TS};
//for DCT:
const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
fftwf_plan plan_forward_blox[2];
fftwf_plan plan_backward_blox[2];
// Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit
plan_forward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
plan_forward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
fftwf_free ( Lbloxtmp );
fftwf_free ( fLbloxtmp );
#ifdef _OPENMP
// Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
int numtiles = numtiles_W * numtiles_H;
int numthreads = MIN(numtiles,omp_get_max_threads());
//if(options.RgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit);
#pragma omp parallel num_threads(numthreads)
#endif
{
//DCT block data storage
float * Lblox;
float * fLblox;
#ifdef _OPENMP
#pragma omp critical
#endif
{
Lblox = fftwf_alloc_real(max_numblox_W*TS*TS);
fLblox = fftwf_alloc_real(max_numblox_W*TS*TS);
}
#ifdef _OPENMP
#pragma omp for schedule(dynamic) collapse(2)
#endif
for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) { for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) { for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
@ -202,13 +253,14 @@ namespace rtengine {
int height = tilebottom-tiletop; int height = tilebottom-tiletop;
//input L channel //input L channel
array2D<float> Lin(width,height,ARRAY2D_CLEAR_DATA); array2D<float> Lin(width,height);
//wavelet denoised image //wavelet denoised image
LabImage * labdn = new LabImage(width,height); LabImage * labdn = new LabImage(width,height);
//residual between input and denoised L channel //residual between input and denoised L channel
array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA); array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
//pixel weight //pixel weight
array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
//
//#ifdef _OPENMP //#ifdef _OPENMP
//#pragma omp parallel for //#pragma omp parallel for
@ -233,9 +285,9 @@ namespace rtengine {
labdn->a[i1][j1] = (X-Y); labdn->a[i1][j1] = (X-Y);
labdn->b[i1][j1] = (Y-Z); labdn->b[i1][j1] = (Y-Z);
Ldetail[i1][j1] = 0; // Ldetail[i1][j1] = 0;
Lin[i1][j1] = Y; Lin[i1][j1] = Y;
totwt[i1][j1] = 0; // totwt[i1][j1] = 0;
} }
} }
} else {//image is not raw; use Lab parametrization } else {//image is not raw; use Lab parametrization
@ -262,9 +314,9 @@ namespace rtengine {
labdn->a[i1][j1] = (X-Y); labdn->a[i1][j1] = (X-Y);
labdn->b[i1][j1] = (Y-Z); labdn->b[i1][j1] = (Y-Z);
Ldetail[i1][j1] = 0; // Ldetail[i1][j1] = 0;
Lin[i1][j1] = Y; Lin[i1][j1] = Y;
totwt[i1][j1] = 0; // totwt[i1][j1] = 0;
} }
} }
} }
@ -282,21 +334,32 @@ namespace rtengine {
//and whether to subsample the image after wavelet filtering. Subsampling is coded as //and whether to subsample the image after wavelet filtering. Subsampling is coded as
//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample //binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
//the first level only, 7 means subsample the first three levels, etc. //the first level only, 7 means subsample the first three levels, etc.
wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
float noisevarL = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f)); float noisevarL = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f));
float noisevarab = SQR(dnparams.chroma/10.0f); float noisevarab = SQR(dnparams.chroma/10.0f);
{ // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
wavelet_decomposition* Ldecomp;
wavelet_decomposition* adecomp;
wavelet_decomposition* bdecomp;
Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
//WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab); //WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab); WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab);
Ldecomp.reconstruct(labdn->data); Ldecomp->reconstruct(labdn->data);
adecomp.reconstruct(labdn->data+datalen); delete Ldecomp;
bdecomp.reconstruct(labdn->data+2*datalen); adecomp->reconstruct(labdn->data+datalen);
delete adecomp;
bdecomp->reconstruct(labdn->data+2*datalen);
delete bdecomp;
}
//TODO: at this point wavelet coefficients storage can be freed //TODO: at this point wavelet coefficients storage can be freed
//Issue 1680: Done now
//second impulse denoise //second impulse denoise
if (dnparams.luma>0.01) { if (dnparams.luma>0.01) {
@ -316,49 +379,35 @@ namespace rtengine {
// blocks are not the same thing as tiles! // blocks are not the same thing as tiles!
// allocate DCT data structures
// calculation for detail recovery blocks // calculation for detail recovery blocks
const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad; const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad; const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
//const int nrtiles = numblox_W*numblox_H; //const int nrtiles = numblox_W*numblox_H;
// end of tiling calc // end of tiling calc
{
//DCT block data storage
float * Lblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
//make a plan for FFTW
fftwf_plan plan_forward_blox, plan_backward_blox;
int nfwd[2]={TS,TS};
//for DCT:
const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
plan_forward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE );
plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE );
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Main detail recovery algorithm: Block loop // Main detail recovery algorithm: Block loop
//OpenMP here //OpenMP here
//adding omp here leads to artifacts //adding omp here leads to artifacts
AlignedBufferMP<float> buffer(width + TS + 2*blkrad*offset);
for (int vblk=0; vblk<numblox_H; vblk++) { for (int vblk=0; vblk<numblox_H; vblk++) {
//printf("vblock=%d",vblk); //printf("vblock=%d",vblk);
int vblkmod = vblk%8; int vblkmod = vblk%8;
int top = (vblk-blkrad)*offset; int top = (vblk-blkrad)*offset;
AlignedBuffer<float>* pBuf = buffer.acquire();
float * buffer = new float [width + TS + 2*blkrad*offset]; // float * buffer = new float [width + TS + 2*blkrad*offset];
float * datarow = buffer+blkrad*offset; float * datarow = (float*)pBuf->data +blkrad*offset;
//#ifdef _OPENMP //#ifdef _OPENMP
//#pragma omp parallel for //#pragma omp parallel for
//#endif //#endif
//TODO: implement using AlignedBufferMP //TODO: implement using AlignedBufferMP
// #pragma omp parallel for
for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) { for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
int row = top + i; int row = top + i;
int rr = row; int rr = row;
@ -393,13 +442,14 @@ namespace rtengine {
} }
} }
}//end of filling block row }//end of filling block row
delete[] buffer; buffer.release(pBuf);
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//fftwf_print_plan (plan_forward_blox); //fftwf_print_plan (plan_forward_blox);
fftwf_execute_r2r(plan_forward_blox,Lblox,fLblox); // DCT an entire row of tiles if(numblox_W == max_numblox_W)
fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox); // DCT an entire row of tiles
else
fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox); // DCT an entire row of tiles
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// now process the vblk row of blocks for noise reduction // now process the vblk row of blocks for noise reduction
for (int hblk=0; hblk<numblox_W; hblk++) { for (int hblk=0; hblk<numblox_W; hblk++) {
@ -411,7 +461,10 @@ namespace rtengine {
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//now perform inverse FT of an entire row of blocks //now perform inverse FT of an entire row of blocks
fftwf_execute_r2r(plan_backward_blox,fLblox,Lblox); //for DCT if(numblox_W == max_numblox_W)
fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox); //for DCT
else
fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox); //for DCT
int topproc = (vblk-blkrad)*offset; int topproc = (vblk-blkrad)*offset;
@ -421,20 +474,9 @@ namespace rtengine {
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
}//end of vertical block loop }//end of vertical block loop
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// clean up }
//#pragma omp single nowait
fftwf_destroy_plan( plan_forward_blox );
//#pragma omp single nowait
fftwf_destroy_plan( plan_backward_blox );
fftwf_free ( Lblox);
fftwf_free ( fLblox);
fftwf_cleanup();
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for (int i=0; i<height; i++) { for (int i=0; i<height; i++) {
for (int j=0; j<width; j++) { for (int j=0; j<width; j++) {
@ -470,7 +512,7 @@ namespace rtengine {
//convert back to RGB and write to destination array //convert back to RGB and write to destination array
if (isRAW) { if (isRAW) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for //#pragma omp parallel for
#endif #endif
for (int i=tiletop; i<tilebottom; i++){ for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop; int i1 = i-tiletop;
@ -496,7 +538,7 @@ namespace rtengine {
} }
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for //#pragma omp parallel for
#endif #endif
for (int i=tiletop; i<tilebottom; i++){ for (int i=tiletop; i<tilebottom; i++){
int i1 = i-tiletop; int i1 = i-tiletop;
@ -534,7 +576,15 @@ namespace rtengine {
}//end of tile row }//end of tile row
}//end of tile loop }//end of tile loop
#ifdef _OPENMP
#pragma omp critical
#endif
{
fftwf_free ( Lblox);
fftwf_free ( fLblox);
}
}
//copy denoised image to output //copy denoised image to output
memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float)); memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
@ -549,6 +599,13 @@ namespace rtengine {
delete dsttmp; delete dsttmp;
// destroy the plans
fftwf_destroy_plan( plan_forward_blox[0] );
fftwf_destroy_plan( plan_backward_blox[0] );
fftwf_destroy_plan( plan_forward_blox[1] );
fftwf_destroy_plan( plan_backward_blox[1] );
fftwf_cleanup();
}//end of main RGB_denoise }//end of main RGB_denoise
@ -561,11 +618,10 @@ namespace rtengine {
void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT
{ {
float * nbrwt = new float[TS*TS]; //for DCT float * nbrwt = new float[TS*TS]; //for DCT
int blkstart = hblproc*TS*TS; int blkstart = hblproc*TS*TS;
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
#pragma omp parallel for
for (int n=0; n<TS*TS; n++) { //for DCT for (int n=0; n<TS*TS; n++) { //for DCT
fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail)); fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
}//output neighbor averaged result }//output neighbor averaged result
@ -586,19 +642,19 @@ namespace rtengine {
const int numblox_W = ceil(((float)(width))/(offset)); const int numblox_W = ceil(((float)(width))/(offset));
const float DCTnorm = 1.0f/(4*TS*TS); //for DCT const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
int imin = MAX(0,-top);
int bottom = MIN( top+TS,height);
int imax = bottom - top;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for #pragma omp parallel for
#endif #endif
//add row of tiles to output image //add row of tiles to output image
for (int hblk=0; hblk < numblox_W; hblk++) { for (int hblk=0; hblk < numblox_W; hblk++) {
int left = (hblk-blkrad)*offset; int left = (hblk-blkrad)*offset;
int bottom = MIN( top+TS,height);
int right = MIN(left+TS, width); int right = MIN(left+TS, width);
int imin = MAX(0,-top);
int jmin = MAX(0,-left); int jmin = MAX(0,-left);
int imax = bottom - top;
int jmax = right - left; int jmax = right - left;
int indx = hblk*TS; int indx = hblk*TS;
for (int i=imin; i<imax; i++) for (int i=imin; i<imax; i++)
@ -719,7 +775,7 @@ namespace rtengine {
//simple wavelet shrinkage //simple wavelet shrinkage
float * sfave = new float[Wlvl_L*Hlvl_L]; float * sfave = new float[Wlvl_L*Hlvl_L];
array2D<float> edge(Wlvl_L,Hlvl_L); array2D<float> edge(Wlvl_L,Hlvl_L);
AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(Wlvl_L,Hlvl_L)); AlignedBufferMP<double>* buffer = new AlignedBufferMP<double> (MAX(Wlvl_L,Hlvl_L));
//printf("\n level=%d \n",lvl); //printf("\n level=%d \n",lvl);
@ -829,7 +885,8 @@ namespace rtengine {
wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab ) wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
{ {
int maxlvl = WaveletCoeffs_L.maxlevel(); int maxlvl = WaveletCoeffs_L.maxlevel();
// printf("maxlevel = %d\n",maxlvl);
//omp_set_nested(true);
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for #pragma omp parallel for
#endif #endif
@ -847,11 +904,12 @@ namespace rtengine {
float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl); float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl); float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl); float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
// printf("Hab : %d\n", Hlvl_ab);
// printf("Wab : %d\n", Wlvl_ab);
ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab, ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
skip_L, skip_ab, noisevar_L, noisevar_ab); skip_L, skip_ab, noisevar_L, noisevar_ab);
} }
//omp_set_nested(false);
} }
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%