Performance optimization for impulse_nr on multi-core systems
This commit is contained in:
parent
0a91af9c12
commit
7918fab562
@ -192,7 +192,58 @@ namespace rtengine {
|
||||
//now we have tile dimensions, overlaps
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
//adding omp here slows it down
|
||||
// According to the FFTW documentation, 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
|
||||
// outside the parallel region and use them inside the parallel region.
|
||||
|
||||
// calculate max size of numblox_W.
|
||||
int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
|
||||
// calculate min size of numblox_W.
|
||||
int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
|
||||
|
||||
// these are needed only for creation of the plans and will be freed before entering the parallel loop
|
||||
float * Lbloxtmp;
|
||||
float * fLbloxtmp;
|
||||
Lbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
|
||||
fLbloxtmp = fftwf_alloc_real(max_numblox_W*TS*TS);
|
||||
|
||||
int nfwd[2]={TS,TS};
|
||||
|
||||
//for DCT:
|
||||
const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
|
||||
const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
|
||||
|
||||
fftwf_plan plan_forward_blox[2];
|
||||
fftwf_plan plan_backward_blox[2];
|
||||
|
||||
// Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up execution a bit
|
||||
plan_forward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
|
||||
plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
|
||||
plan_forward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
|
||||
plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
|
||||
fftwf_free ( Lbloxtmp );
|
||||
fftwf_free ( fLbloxtmp );
|
||||
|
||||
#ifdef _OPENMP
|
||||
// Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
|
||||
int numtiles = numtiles_W * numtiles_H;
|
||||
int numthreads = MIN(numtiles,omp_get_max_threads());
|
||||
//if(options.RgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit);
|
||||
#pragma omp parallel num_threads(numthreads)
|
||||
#endif
|
||||
{
|
||||
//DCT block data storage
|
||||
float * Lblox;
|
||||
float * fLblox;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
Lblox = fftwf_alloc_real(max_numblox_W*TS*TS);
|
||||
fLblox = fftwf_alloc_real(max_numblox_W*TS*TS);
|
||||
}
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for schedule(dynamic) collapse(2)
|
||||
#endif
|
||||
for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
|
||||
for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
|
||||
|
||||
@ -202,13 +253,14 @@ namespace rtengine {
|
||||
int height = tilebottom-tiletop;
|
||||
|
||||
//input L channel
|
||||
array2D<float> Lin(width,height,ARRAY2D_CLEAR_DATA);
|
||||
array2D<float> Lin(width,height);
|
||||
//wavelet denoised image
|
||||
LabImage * labdn = new LabImage(width,height);
|
||||
//residual between input and denoised L channel
|
||||
array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
|
||||
//pixel weight
|
||||
array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
|
||||
//
|
||||
|
||||
//#ifdef _OPENMP
|
||||
//#pragma omp parallel for
|
||||
@ -233,9 +285,9 @@ namespace rtengine {
|
||||
labdn->a[i1][j1] = (X-Y);
|
||||
labdn->b[i1][j1] = (Y-Z);
|
||||
|
||||
Ldetail[i1][j1] = 0;
|
||||
// Ldetail[i1][j1] = 0;
|
||||
Lin[i1][j1] = Y;
|
||||
totwt[i1][j1] = 0;
|
||||
// totwt[i1][j1] = 0;
|
||||
}
|
||||
}
|
||||
} else {//image is not raw; use Lab parametrization
|
||||
@ -262,9 +314,9 @@ namespace rtengine {
|
||||
labdn->a[i1][j1] = (X-Y);
|
||||
labdn->b[i1][j1] = (Y-Z);
|
||||
|
||||
Ldetail[i1][j1] = 0;
|
||||
// Ldetail[i1][j1] = 0;
|
||||
Lin[i1][j1] = Y;
|
||||
totwt[i1][j1] = 0;
|
||||
// totwt[i1][j1] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -282,21 +334,32 @@ namespace rtengine {
|
||||
//and whether to subsample the image after wavelet filtering. Subsampling is coded as
|
||||
//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
|
||||
//the first level only, 7 means subsample the first three levels, etc.
|
||||
wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
|
||||
wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
|
||||
wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
|
||||
|
||||
float noisevarL = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f));
|
||||
|
||||
float noisevarab = SQR(dnparams.chroma/10.0f);
|
||||
{ // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
|
||||
wavelet_decomposition* Ldecomp;
|
||||
wavelet_decomposition* adecomp;
|
||||
wavelet_decomposition* bdecomp;
|
||||
|
||||
Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ );
|
||||
adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 );
|
||||
bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 );
|
||||
|
||||
//WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
|
||||
WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab);
|
||||
WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab);
|
||||
|
||||
Ldecomp.reconstruct(labdn->data);
|
||||
adecomp.reconstruct(labdn->data+datalen);
|
||||
bdecomp.reconstruct(labdn->data+2*datalen);
|
||||
Ldecomp->reconstruct(labdn->data);
|
||||
delete Ldecomp;
|
||||
adecomp->reconstruct(labdn->data+datalen);
|
||||
delete adecomp;
|
||||
bdecomp->reconstruct(labdn->data+2*datalen);
|
||||
delete bdecomp;
|
||||
}
|
||||
|
||||
//TODO: at this point wavelet coefficients storage can be freed
|
||||
//Issue 1680: Done now
|
||||
|
||||
//second impulse denoise
|
||||
if (dnparams.luma>0.01) {
|
||||
@ -316,49 +379,35 @@ namespace rtengine {
|
||||
// blocks are not the same thing as tiles!
|
||||
|
||||
|
||||
// allocate DCT data structures
|
||||
|
||||
// calculation for detail recovery blocks
|
||||
const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
|
||||
const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
|
||||
|
||||
//const int nrtiles = numblox_W*numblox_H;
|
||||
// end of tiling calc
|
||||
|
||||
//DCT block data storage
|
||||
float * Lblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
|
||||
float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float));
|
||||
|
||||
|
||||
//make a plan for FFTW
|
||||
fftwf_plan plan_forward_blox, plan_backward_blox;
|
||||
|
||||
int nfwd[2]={TS,TS};
|
||||
|
||||
//for DCT:
|
||||
const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
|
||||
const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
|
||||
|
||||
plan_forward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE );
|
||||
plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE );
|
||||
{
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
// Main detail recovery algorithm: Block loop
|
||||
//OpenMP here
|
||||
//adding omp here leads to artifacts
|
||||
AlignedBufferMP<float> buffer(width + TS + 2*blkrad*offset);
|
||||
for (int vblk=0; vblk<numblox_H; vblk++) {
|
||||
//printf("vblock=%d",vblk);
|
||||
int vblkmod = vblk%8;
|
||||
|
||||
int top = (vblk-blkrad)*offset;
|
||||
|
||||
float * buffer = new float [width + TS + 2*blkrad*offset];
|
||||
float * datarow = buffer+blkrad*offset;
|
||||
AlignedBuffer<float>* pBuf = buffer.acquire();
|
||||
// float * buffer = new float [width + TS + 2*blkrad*offset];
|
||||
float * datarow = (float*)pBuf->data +blkrad*offset;
|
||||
|
||||
//#ifdef _OPENMP
|
||||
//#pragma omp parallel for
|
||||
//#endif
|
||||
//TODO: implement using AlignedBufferMP
|
||||
// #pragma omp parallel for
|
||||
for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
|
||||
int row = top + i;
|
||||
int rr = row;
|
||||
@ -393,13 +442,14 @@ namespace rtengine {
|
||||
}
|
||||
}
|
||||
}//end of filling block row
|
||||
delete[] buffer;
|
||||
buffer.release(pBuf);
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
//fftwf_print_plan (plan_forward_blox);
|
||||
fftwf_execute_r2r(plan_forward_blox,Lblox,fLblox); // DCT an entire row of tiles
|
||||
|
||||
if(numblox_W == max_numblox_W)
|
||||
fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox); // DCT an entire row of tiles
|
||||
else
|
||||
fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox); // DCT an entire row of tiles
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
// now process the vblk row of blocks for noise reduction
|
||||
for (int hblk=0; hblk<numblox_W; hblk++) {
|
||||
@ -411,7 +461,10 @@ namespace rtengine {
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
//now perform inverse FT of an entire row of blocks
|
||||
fftwf_execute_r2r(plan_backward_blox,fLblox,Lblox); //for DCT
|
||||
if(numblox_W == max_numblox_W)
|
||||
fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox); //for DCT
|
||||
else
|
||||
fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox); //for DCT
|
||||
|
||||
int topproc = (vblk-blkrad)*offset;
|
||||
|
||||
@ -421,20 +474,9 @@ namespace rtengine {
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
}//end of vertical block loop
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
// clean up
|
||||
//#pragma omp single nowait
|
||||
fftwf_destroy_plan( plan_forward_blox );
|
||||
//#pragma omp single nowait
|
||||
fftwf_destroy_plan( plan_backward_blox );
|
||||
|
||||
fftwf_free ( Lblox);
|
||||
fftwf_free ( fLblox);
|
||||
|
||||
fftwf_cleanup();
|
||||
|
||||
}
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
for (int i=0; i<height; i++) {
|
||||
for (int j=0; j<width; j++) {
|
||||
@ -470,7 +512,7 @@ namespace rtengine {
|
||||
//convert back to RGB and write to destination array
|
||||
if (isRAW) {
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
//#pragma omp parallel for
|
||||
#endif
|
||||
for (int i=tiletop; i<tilebottom; i++){
|
||||
int i1 = i-tiletop;
|
||||
@ -496,7 +538,7 @@ namespace rtengine {
|
||||
}
|
||||
} else {
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
//#pragma omp parallel for
|
||||
#endif
|
||||
for (int i=tiletop; i<tilebottom; i++){
|
||||
int i1 = i-tiletop;
|
||||
@ -534,7 +576,15 @@ namespace rtengine {
|
||||
|
||||
}//end of tile row
|
||||
}//end of tile loop
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
fftwf_free ( Lblox);
|
||||
fftwf_free ( fLblox);
|
||||
}
|
||||
|
||||
}
|
||||
//copy denoised image to output
|
||||
memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
|
||||
|
||||
@ -549,6 +599,13 @@ namespace rtengine {
|
||||
|
||||
delete dsttmp;
|
||||
|
||||
// destroy the plans
|
||||
fftwf_destroy_plan( plan_forward_blox[0] );
|
||||
fftwf_destroy_plan( plan_backward_blox[0] );
|
||||
fftwf_destroy_plan( plan_forward_blox[1] );
|
||||
fftwf_destroy_plan( plan_backward_blox[1] );
|
||||
fftwf_cleanup();
|
||||
|
||||
}//end of main RGB_denoise
|
||||
|
||||
|
||||
@ -561,11 +618,10 @@ namespace rtengine {
|
||||
void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT
|
||||
{
|
||||
float * nbrwt = new float[TS*TS]; //for DCT
|
||||
|
||||
int blkstart = hblproc*TS*TS;
|
||||
|
||||
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int n=0; n<TS*TS; n++) { //for DCT
|
||||
fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
|
||||
}//output neighbor averaged result
|
||||
@ -586,19 +642,19 @@ namespace rtengine {
|
||||
const int numblox_W = ceil(((float)(width))/(offset));
|
||||
const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
|
||||
|
||||
int imin = MAX(0,-top);
|
||||
int bottom = MIN( top+TS,height);
|
||||
int imax = bottom - top;
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
//add row of tiles to output image
|
||||
for (int hblk=0; hblk < numblox_W; hblk++) {
|
||||
int left = (hblk-blkrad)*offset;
|
||||
int bottom = MIN( top+TS,height);
|
||||
int right = MIN(left+TS, width);
|
||||
int imin = MAX(0,-top);
|
||||
int jmin = MAX(0,-left);
|
||||
int imax = bottom - top;
|
||||
int jmax = right - left;
|
||||
|
||||
int indx = hblk*TS;
|
||||
|
||||
for (int i=imin; i<imax; i++)
|
||||
@ -719,7 +775,7 @@ namespace rtengine {
|
||||
//simple wavelet shrinkage
|
||||
float * sfave = new float[Wlvl_L*Hlvl_L];
|
||||
array2D<float> edge(Wlvl_L,Hlvl_L);
|
||||
AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(Wlvl_L,Hlvl_L));
|
||||
AlignedBufferMP<double>* buffer = new AlignedBufferMP<double> (MAX(Wlvl_L,Hlvl_L));
|
||||
|
||||
//printf("\n level=%d \n",lvl);
|
||||
|
||||
@ -829,7 +885,8 @@ namespace rtengine {
|
||||
wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
|
||||
{
|
||||
int maxlvl = WaveletCoeffs_L.maxlevel();
|
||||
|
||||
// printf("maxlevel = %d\n",maxlvl);
|
||||
//omp_set_nested(true);
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
@ -847,11 +904,12 @@ namespace rtengine {
|
||||
float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
|
||||
float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
|
||||
float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
|
||||
|
||||
// printf("Hab : %d\n", Hlvl_ab);
|
||||
// printf("Wab : %d\n", Wlvl_ab);
|
||||
ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
|
||||
skip_L, skip_ab, noisevar_L, noisevar_ab);
|
||||
}
|
||||
|
||||
//omp_set_nested(false);
|
||||
}
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
Loading…
x
Reference in New Issue
Block a user