From 7918fab5621bbdbd48ef20ee59e2f7079680656c Mon Sep 17 00:00:00 2001 From: Ingo Date: Tue, 22 Jan 2013 18:14:53 +0100 Subject: [PATCH] Performance optimization for impulse_nr on multi-core-systems --- rtengine/FTblockDN.cc | 688 +++++++++++++++++++++++------------------- 1 file changed, 373 insertions(+), 315 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 140b85135..ebe60d550 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -59,87 +59,87 @@ #define epsilon 0.001f/(TS*TS) //tolerance namespace rtengine { - + // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% /* - Structure of the algorithm: - - 1. Compute an initial denoise of the image via undecimated wavelet transform + Structure of the algorithm: + + 1. Compute an initial denoise of the image via undecimated wavelet transform and universal thresholding modulated by user input. - 2. Decompose the residual image into TSxTS size tiles, shifting by 'offset' each step + 2. Decompose the residual image into TSxTS size tiles, shifting by 'offset' each step (so roughly each pixel is in (TS/offset)^2 tiles); Discrete Cosine transform the tiles. 3. Filter the DCT data to pick out patterns missed by the wavelet denoise 4. Inverse DCT the denoised tile data and combine the tiles into a denoised output image. - + */ - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - - - + + + + + void ImProcFunctions::RGB_denoise(Imagefloat * src, Imagefloat * dst, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const procparams::DefringeParams & defringe) - { - + { + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - + /*if (plistener) { plistener->setProgressStr ("Denoise..."); plistener->setProgress (0.0); }*/ - + volatile double progress = 0.0; - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - + const short int imheight=src->height, imwidth=src->width; - + if (dnparams.luma==0 && dnparams.chroma==0) { //nothing to do; copy src to dst memcpy(dst->data,src->data,dst->width*dst->height*3*sizeof(float)); return; } - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // gamma transform for input data float gam = dnparams.gamma; float gamthresh = 0.001; float gamslope = exp(log((double)gamthresh)/gam)/gamthresh; - + LUTf gamcurve(65536,0); - + for (int i=0; i<65536; i++) { gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f; } - + // inverse gamma transform for output data float igam = 1/gam; float igamthresh = gamthresh*gamslope; float igamslope = 1/gamslope; - + LUTf igamcurve(65536,0); - + for (int i=0; i<65536; i++) { igamcurve[i] = (Color::gamma((float)i/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); } //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - + + //srand((unsigned)time(0));//test with random data - + const float gain = pow (2.0, dnparams.expcomp); - + float noisevar_Ldetail = SQR((SQR(100-dnparams.Ldetail) + 50*(100-dnparams.Ldetail)) * TS * 0.5f); - - + + array2D tilemask_in(TS,TS); array2D tilemask_out(TS,TS); - + const int border = MAX(2,TS/16); - + #ifdef _OPENMP #pragma omp parallel for #endif @@ -150,25 +150,25 @@ namespace rtengine { for (int j=0; jTS/2 ? j-TS+1 : j)); tilemask_in[i][j] = (vmask * (j1data[n] = 0; - + const int tilesize = 1024; const int overlap = 128; - + int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip; - + if (imwidth 0) numthreads = MIN(numthreads,options.RgbDenoiseThreadLimit); +#pragma omp parallel num_threads(numthreads) +#endif + { + //DCT block data storage + float * Lblox; + float * fLblox; +#ifdef _OPENMP +#pragma omp critical +#endif + { + Lblox = fftwf_alloc_real(max_numblox_W*TS*TS); + fLblox = fftwf_alloc_real(max_numblox_W*TS*TS); + } +#ifdef _OPENMP +#pragma omp for schedule(dynamic) collapse(2) +#endif for (int tiletop=0; tiletop Lin(width,height,ARRAY2D_CLEAR_DATA); + array2D Lin(width,height); //wavelet denoised image LabImage * labdn = new LabImage(width,height); //residual between input and denoised L channel array2D Ldetail(width,height,ARRAY2D_CLEAR_DATA); //pixel weight array2D totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks - +// + //#ifdef _OPENMP //#pragma omp parallel for //#endif @@ -220,145 +272,142 @@ namespace rtengine { int i1 = i - tiletop; for (int j=tileleft/*, j1=0*/; jr[i][j]; float Y = gain*src->g[i][j]; float Z = gain*src->b[i][j]; - + X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); - + labdn->L[i1][j1] = Y; labdn->a[i1][j1] = (X-Y); labdn->b[i1][j1] = (Y-Z); - - Ldetail[i1][j1] = 0; + +// Ldetail[i1][j1] = 0; Lin[i1][j1] = Y; - totwt[i1][j1] = 0; +// totwt[i1][j1] = 0; } - } + } } else {//image is not raw; use Lab parametrization for (int i=tiletop/*, i1=0*/; ir[i][j] ]; float gtmp = Color::igammatab_srgb[ src->g[i][j] ]; float btmp = Color::igammatab_srgb[ src->b[i][j] ]; - + //perhaps use LCH or YCrCb ??? float X = xyz_sRGB[0][0]*rtmp + xyz_sRGB[0][1]*gtmp + xyz_sRGB[0][2]*btmp; float Y = xyz_sRGB[1][0]*rtmp + xyz_sRGB[1][1]*gtmp + xyz_sRGB[1][2]*btmp; float Z = xyz_sRGB[2][0]*rtmp + xyz_sRGB[2][1]*gtmp + xyz_sRGB[2][2]*btmp; - + X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); - + labdn->L[i1][j1] = Y; labdn->a[i1][j1] = (X-Y); labdn->b[i1][j1] = (Y-Z); - - Ldetail[i1][j1] = 0; + +// Ldetail[i1][j1] = 0; Lin[i1][j1] = Y; - totwt[i1][j1] = 0; +// totwt[i1][j1] = 0; } } } - - + + //initial impulse denoise if (dnparams.luma>0.01) { impulse_nr (labdn, MIN(50.0f,dnparams.luma)/20.0f); } - + int datalen = labdn->W * labdn->H; - + //now perform basic wavelet denoise - //last two arguments of wavelet decomposition are max number of wavelet decomposition levels; - //and whether to subsample the image after wavelet filtering. Subsampling is coded as - //binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample + //last two arguments of wavelet decomposition are max number of wavelet decomposition levels; + //and whether to subsample the image after wavelet filtering. Subsampling is coded as + //binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample //the first level only, 7 means subsample the first three levels, etc. - wavelet_decomposition Ldecomp(labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ ); - wavelet_decomposition adecomp(labdn->data+datalen, labdn->W, labdn->H, 5, 1 ); - wavelet_decomposition bdecomp(labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 ); - + float noisevarL = SQR((dnparams.luma/125.0f)*(1+ dnparams.luma/25.0f)); + float noisevarab = SQR(dnparams.chroma/10.0f); - + { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF) + wavelet_decomposition* Ldecomp; + wavelet_decomposition* adecomp; + wavelet_decomposition* bdecomp; + + Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, 5/*maxlevels*/, 0/*subsampling*/ ); + adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, 5, 1 ); + bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, 5, 1 ); + //WaveletDenoiseAll_BiShrink(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab); - WaveletDenoiseAll(Ldecomp, adecomp, bdecomp, noisevarL, noisevarab); - - Ldecomp.reconstruct(labdn->data); - adecomp.reconstruct(labdn->data+datalen); - bdecomp.reconstruct(labdn->data+2*datalen); - + WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarab); + + Ldecomp->reconstruct(labdn->data); + delete Ldecomp; + adecomp->reconstruct(labdn->data+datalen); + delete adecomp; + bdecomp->reconstruct(labdn->data+2*datalen); + delete bdecomp; + } + //TODO: at this point wavelet coefficients storage can be freed - + //Issue 1680: Done now + //second impulse denoise if (dnparams.luma>0.01) { impulse_nr (labdn, MIN(50.0f,dnparams.luma)/20.0f); - } + } //PF_correct_RT(dst, dst, defringe.radius, defringe.threshold); - + //wavelet denoised L channel array2D Lwavdn(width,height); float * Lwavdnptr = Lwavdn; memcpy (Lwavdnptr, labdn->data, width*height*sizeof(float)); - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // now do detail recovery using block DCT to detect + // now do detail recovery using block DCT to detect // patterns missed by wavelet denoise // blocks are not the same thing as tiles! - - - // allocate DCT data structures - + + + // calculation for detail recovery blocks const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad; const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad; + //const int nrtiles = numblox_W*numblox_H; // end of tiling calc - - //DCT block data storage - float * Lblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float)); - float * fLblox = (float *) fftwf_malloc (numblox_W*TS*TS * sizeof (float)); - - - //make a plan for FFTW - fftwf_plan plan_forward_blox, plan_backward_blox; - - int nfwd[2]={TS,TS}; - - //for DCT: - const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10}; - const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01}; - - plan_forward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, Lblox, NULL, 1, TS*TS, fLblox, NULL, 1, TS*TS, fwdkind, FFTW_ESTIMATE ); - plan_backward_blox = fftwf_plan_many_r2r(2, nfwd, numblox_W, fLblox, NULL, 1, TS*TS, Lblox, NULL, 1, TS*TS, bwdkind, FFTW_ESTIMATE ); - + { + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // Main detail recovery algorithm: Block loop - //OpenMP here - //adding omp here leads to artifacts + //OpenMP here + //adding omp here leads to artifacts + AlignedBufferMP buffer(width + TS + 2*blkrad*offset); for (int vblk=0; vblk* pBuf = buffer.acquire(); +// float * buffer = new float [width + TS + 2*blkrad*offset]; + float * datarow = (float*)pBuf->data +blkrad*offset; + //#ifdef _OPENMP //#pragma omp parallel for //#endif //TODO: implement using AlignedBufferMP +// #pragma omp parallel for for (int i=0/*, row=top*/; i=height) { rr = MAX(0,2*height-2-row); - } - + } + for (int j=0; jW; j++) { datarow[j] = (Lin[rr][j]-Lwavdn[rr][j]); } - + for (int j=-blkrad*offset; j<0; j++) { datarow[j] = datarow[MIN(-j,width-1)]; } for (int j=width; j=0 && top+i=0 && left+jL[i][j] = Lwavdn[i][j] + hpdn; - + } } - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // transform denoised "Lab" to output RGB - + //calculate mask for feathering output tile overlaps float * Vmask = new float [height+1]; float * Hmask = new float [width+1]; - + for (int i=0; i0) Hmask[i] = mask; if (tilerightL[i1][j1]; X = (labdn->a[i1][j1]) + Y; Z = Y - (labdn->b[i1][j1]); - + X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - + float factor = Vmask[i1]*Hmask[j1]/gain; - + dsttmp->r[i][j] += factor*X; dsttmp->g[i][j] += factor*Y; dsttmp->b[i][j] += factor*Z; - + } } } else { #ifdef _OPENMP -#pragma omp parallel for +//#pragma omp parallel for #endif for (int i=tiletop; iL[i1][j1]; X = (labdn->a[i1][j1]) + Y; Z = Y - (labdn->b[i1][j1]); - + X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - + float factor = Vmask[i1]*Hmask[j1]; - + float rtmp = sRGB_xyz[0][0]*X + sRGB_xyz[0][1]*Y + sRGB_xyz[0][2]*Z; float gtmp = sRGB_xyz[1][0]*X + sRGB_xyz[1][1]*Y + sRGB_xyz[1][2]*Z; float btmp = sRGB_xyz[2][0]*X + sRGB_xyz[2][1]*Y + sRGB_xyz[2][2]*Z; - + dsttmp->r[i][j] += factor*rtmp; dsttmp->g[i][j] += factor*gtmp; dsttmp->b[i][j] += factor*btmp; - + } } } - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - + delete labdn; delete[] Vmask; delete[] Hmask; - + }//end of tile row }//end of tile loop - +#ifdef _OPENMP +#pragma omp critical +#endif +{ + fftwf_free ( Lblox); + fftwf_free ( fLblox); +} + + } //copy denoised image to output memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float)); - + if (!isRAW) {//restore original image gamma #ifdef _OPENMP #pragma omp parallel for @@ -546,140 +596,146 @@ namespace rtengine { dst->data[i] = Color::gammatab_srgb[ dst->data[i] ]; } } - + delete dsttmp; - + + // destroy the plans + fftwf_destroy_plan( plan_forward_blox[0] ); + fftwf_destroy_plan( plan_backward_blox[0] ); + fftwf_destroy_plan( plan_forward_blox[1] ); + fftwf_destroy_plan( plan_backward_blox[1] ); + fftwf_cleanup(); + }//end of main RGB_denoise //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - - void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT - { - float * nbrwt = new float[TS*TS]; //for DCT - - int blkstart = hblproc*TS*TS; - - boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT + + + void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT + { + float * nbrwt = new float[TS*TS]; //for DCT + int blkstart = hblproc*TS*TS; + + boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT +#pragma omp parallel for for (int n=0; n=0; lvl--) {//for levels less than max, use level diff to make edge mask //for (int lvl=0; lvl edge(Wlvl_L,Hlvl_L); - AlignedBuffer* buffer = new AlignedBuffer (MAX(Wlvl_L,Hlvl_L)); - + AlignedBufferMP* buffer = new AlignedBufferMP (MAX(Wlvl_L,Hlvl_L)); + //printf("\n level=%d \n",lvl); - + for (int dir=1; dir<4; dir++) { float mad_L = madL[lvl][dir-1]; float mad_a = noisevar_ab*mada[lvl][dir-1]; @@ -730,136 +786,138 @@ namespace rtengine { //float mad_Lpar = madL[lvl+1][dir-1]; //float mad_apar = mada[lvl+1][dir-1]; //float mad_bpar = mada[lvl+1][dir-1]; - + //float skip_ab_ratio = WaveletCoeffs_a.level_stride(lvl+1)/skip_ab; float skip_L_ratio = WaveletCoeffs_L.level_stride(lvl+1)/skip_L; - + if (noisevar_ab>0.01) { - + //printf(" dir=%d mad_L=%f mad_a=%f mad_b=%f \n",dir,sqrt(mad_L),sqrt(mad_a),sqrt(mad_b)); - -//OpenMP here + +//OpenMP here for (int i=0; i2 ? 1 : (coeff_a<1 ? 0 : (coeff_a - 1))); //WavCoeffs_b[dir][coeffloc_ab] *= edgefactor*(coeff_b>2 ? 1 : (coeff_b<1 ? 0 : (coeff_b - 1))); - + //float satfactor_a = mad_a/(mad_a+0.5*SQR(WavCoeffs_a[0][coeffloc_ab])); //float satfactor_b = mad_b/(mad_b+0.5*SQR(WavCoeffs_b[0][coeffloc_ab])); - + WavCoeffs_a[dir][coeffloc_ab] *= SQR(1-exp(-(mag_a/mad_a)-(mag_L/(9*mad_L)))/*satfactor_a*/); WavCoeffs_b[dir][coeffloc_ab] *= SQR(1-exp(-(mag_b/mad_b)-(mag_L/(9*mad_L)))/*satfactor_b*/); - + } }//now chrominance coefficients are denoised } - + if (noisevar_L>0.01) { mad_L *= noisevar_L*5/(lvl+1); -//OpenMP here - for (int i=0; i (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false /*multiThread*/); //gaussVertical (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false); - + boxblur(sfave, sfave, lvl+2, lvl+2, Wlvl_L, Hlvl_L);//increase smoothness by locally averaging shrinkage -//OpenMP here - for (int i=0; i0.01) { //OpenMP here #ifdef _OPENMP #pragma omp parallel for -#endif +#endif for (int i=0; i2*thresh_a ? 1 : (coeff_a2*thresh_b ? 1 : (coeff_b0.01) { -//OpenMP here +//OpenMP here #ifdef _OPENMP #pragma omp parallel for #endif @@ -944,28 +1002,28 @@ namespace rtengine { float mag = SQR(WavCoeffs_L[dir][i]); float shrinkfactor = mag/(mag+mad_L*exp(-mag/(9*mad_L))+eps); - + //WavCoeffs_L[dir][i] *= shrinkfactor; sfave[i] = shrinkfactor; } -//OpenMP here +//OpenMP here boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage #ifdef _OPENMP #pragma omp parallel for #endif for (int i=0; i