diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index f9d809dfb..44077d996 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -35,7 +35,7 @@ #include "rt_math.h" #include "mytime.h" #include "sleef.c" -#include "opthelper.h" +#include "opthelper.h" #ifdef _OPENMP #include @@ -244,7 +244,7 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt int denoiseNestedLevels = 1; -void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi) +SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi) { //#ifdef _DEBUG MyTime t1e,t2e; @@ -278,20 +278,20 @@ void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,I {wprofi[1][0],wprofi[1][1],wprofi[1][2]}, {wprofi[2][0],wprofi[2][1],wprofi[2][2]} }; - lumcalc = new float*[hei]; - for (int i=0; ir(ii,jj); @@ -301,9 +301,9 @@ void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,I float XL,YL,ZL; Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi); Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum); - lumcalc[ii][jj]=LLum; - acalc[ii][jj]=AAum; - bcalc[ii][jj]=BBum; + lumcalc[ii>>1][jj>>1]=LLum; + acalc[ii>>1][jj>>1]=AAum; + bcalc[ii>>1][jj>>1]=BBum; } } delete calclum; @@ -416,7 +416,7 @@ if(settings->leveldnti ==1) { int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip; - Tile_calc (tilesize, overlap, 2, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip); + Tile_calc (tilesize, overlap, 2, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip); //now we have tile dimensions, overlaps //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -472,14 +472,12 @@ if(settings->leveldnti ==1) { printf("RGB_denoise uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,denoiseNestedLevels); #endif - float *nbrwtArray[denoiseNestedLevels*numthreads]; float *LbloxArray[denoiseNestedLevels*numthreads]; float *fLbloxArray[denoiseNestedLevels*numthreads]; for(int i=0;iworkingSpaceInverseMatrix (params->icm.working); @@ -511,6 +509,12 @@ if(settings->leveldnti ==1) { float residblue=0.f; int pos; + float** noisevarlum = new float*[(tileheight+1)/2]; + for (int i=0; i<(tileheight+1)/2; i++) + noisevarlum[i] = new float[(tilewidth+1)/2]; + float** noisevarchrom = new float*[(tileheight+1)/2]; + for (int i=0; i<(tileheight+1)/2; i++) + noisevarchrom[i] = new float[(tilewidth+1)/2]; #ifdef _OPENMP @@ -562,17 +566,9 @@ if(settings->leveldnti ==1) { //wavelet denoised image LabImage * labdn = new LabImage(width,height); float* mad_LL = new float [height*width]; - float** noisevarlum; - noisevarlum = new float*[height]; - for (int i=0; i Ldetail(width,height,ARRAY2D_CLEAR_DATA); @@ -626,6 +622,7 @@ if(settings->leveldnti ==1) { R_ = R_<65535.0f ? gamcurve[R_] : (Color::gammanf(R_/65535.f, gam)*32768.0f); G_ = G_<65535.0f ? gamcurve[G_] : (Color::gammanf(G_/65535.f, gam)*32768.0f); B_ = B_<65535.0f ? gamcurve[B_] : (Color::gammanf(B_/65535.f, gam)*32768.0f); + //true conversion xyz=>Lab float L,a,b; float X,Y,Z; @@ -633,35 +630,38 @@ if(settings->leveldnti ==1) { //convert to Lab Color::XYZ2Lab(X, Y, Z, L, a, b); - - if(noiseLCurve) { - float kN = lumcalc[i][j]; //with no gamma and take into account working profile - float epsi = 0.01f; - if(kN<2.f) - kN = 2.f;//avoid divided by zero - if(kN>32768.f) - kN = 32768.f; // not strictly necessary - float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f]; - float ki = kinterm*100.f; - ki += noiseluma; - noisevarlum[i1][j1] = SQR((ki/125.f)*(1.f+ki/25.f)); - } - if(noiseCCurve) { - float aN = acalc[i][j]; - float bN = bcalc[i][j]; - float cN = sqrtf(SQR(aN)+SQR(bN)); - if(cN < 100.f) - cN = 100.f;//avoid divided by zero - float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];//C=f(C) - noisevarchrom[i1][j1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm); - } - //end chroma labdn->L[i1][j1] = L; labdn->a[i1][j1] = a; labdn->b[i1][j1] = b; - Lin[i1][j1] = L; + Lin[i1][j1] = L; + + if(((i1|j1)&1) == 0) { + if(noiseLCurve) { + float kN = lumcalc[i>>1][j>>1]; //with no gamma and take into account working profile + float epsi = 0.01f; + if(kN<2.f) + kN = 2.f;//avoid divided by zero + if(kN>32768.f) + kN = 32768.f; // not strictly necessary + float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f]; + float ki = kinterm*100.f; + ki += noiseluma; + noisevarlum[i1>>1][j1>>1] = SQR((ki/125.f)*(1.f+ki/25.f)); + } + if(noiseCCurve) { + float aN = acalc[i>>1][j>>1]; + float bN = bcalc[i>>1][j>>1]; + float cN = sqrtf(SQR(aN)+SQR(bN)); + if(cN < 100.f) + cN = 100.f;//avoid divided by zero + float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];//C=f(C) + noisevarchrom[i1>>1][j1>>1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm); + } + } + //end chroma + } } } @@ -680,29 +680,30 @@ if(settings->leveldnti ==1) { //conversion colorspace to determine luminance with no gamma X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); - Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); - if(noiseLCurve) { - // float noiseluma=(float) dnparams.luma; - float kN = lumcalc[i][j]; - float epsi = 0.01f; - if (kN<2.f) - kN = 2.f; - if (kN>32768.f) - kN = 32768.f; - float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f]; - float ki = kinterm*100.f; - ki += noiseluma; - noisevarlum[i1][j1] = SQR((ki/125.f)*(1.f+ki/25.f)); - } - if(noiseCCurve) { - float aN = acalc[i][j]; - float bN = bcalc[i][j]; - - float cN = sqrtf(SQR(aN)+SQR(bN)); - if(cN < 100.f) - cN = 100.f;//avoid divided by zero - float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f]; - noisevarchrom[i1][j1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm); + Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f); + if(((i1|j1)&1) == 0) { + if(noiseLCurve) { + // float noiseluma=(float) dnparams.luma; + float kN = lumcalc[i>>1][j>>1]; + float epsi = 0.01f; + if (kN<2.f) + kN = 2.f; + if (kN>32768.f) + kN = 32768.f; + float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f]; + float ki = kinterm*100.f; + ki += noiseluma; + noisevarlum[i1>>1][j1>>1] = SQR((ki/125.f)*(1.f+ki/25.f)); + } + if(noiseCCurve) { + float aN = acalc[i>>1][j>>1]; + float bN = bcalc[i>>1][j>>1]; + float cN = sqrtf(SQR(aN)+SQR(bN)); + if(cN < 100.f) + cN = 100.f;//avoid divided by zero + float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f]; + noisevarchrom[i1>>1][j1>>1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm); + } } //end chroma labdn->L[i1][j1] = Y; @@ -745,35 +746,35 @@ if(settings->leveldnti ==1) { //convert Lab Color::XYZ2Lab(X, Y, Z, L, a, b); - float Llum,alum,blum; - if(noiseLCurve || noiseCCurve) { - float XL,YL,ZL; - Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp); - Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum); + float Llum,alum,blum; + if(((i1|j1)&1) == 0) { + if(noiseLCurve || noiseCCurve) { + float XL,YL,ZL; + Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp); + Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum); + } + + if(noiseLCurve) { + float kN = Llum; + float epsi=0.01f; + if(kN<2.f) kN=2.f; + if(kN>32768.f) kN=32768.f; + float kinterm=epsi + noiseLCurve[xdivf(kN,15)*500.f]; + float ki=kinterm*100.f; + ki+=noiseluma; + noiseluma += 1.f; + noisevarlum[i1>>1][j1>>1]=SQR((ki/125.f)*(1.f+ki/25.f)); + } + if(noiseCCurve) { + float aN=alum; + float bN=blum; + float cN=sqrtf(SQR(aN)+SQR(bN)); + if(cN < 100.f) + cN=100.f;//avoid divided by zero ??? + float Cinterm=1.f + ponderCC*4.f*noiseCCurve[cN/60.f]; + noisevarchrom[i1>>1][j1>>1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm); + } } - - if(noiseLCurve) { - float kN = Llum; - float epsi=0.01f; - - if(kN<2.f) kN=2.f; - if(kN>32768.f) kN=32768.f; - float kinterm=epsi + noiseLCurve[xdivf(kN,15)*500.f]; - float ki=kinterm*100.f; - ki+=noiseluma; - noiseluma += 1.f; - noisevarlum[i1][j1]=SQR((ki/125.f)*(1.f+ki/25.f)); - } - if(noiseCCurve) { - float aN=alum; - float bN=blum; - float cN=sqrtf(SQR(aN)+SQR(bN)); - if(cN < 100.f) - cN=100.f;//avoid divided by zero ??? - float Cinterm=1.f + ponderCC*4.f*noiseCCurve[cN/60.f]; - noisevarchrom[i1][j1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm); - } - labdn->L[i1][j1] = L; labdn->a[i1][j1] = a; labdn->b[i1][j1] = b; @@ -838,13 +839,18 @@ if(settings->leveldnti ==1) { #pragma omp section #endif { - Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 0/*subsampling*/ ); + Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 1/*subsampling*/ ); } #ifdef _OPENMP #pragma omp section #endif -{ // only one section for both, because they are much faster then Ldecomp +{ adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 ); +} +#ifdef _OPENMP +#pragma omp section +#endif +{ bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 ); } } @@ -1231,10 +1237,6 @@ if(settings->leveldnti ==1) { } //wavelet denoised L channel - array2D Lwavdn(width,height); - float * Lwavdnptr = Lwavdn; - memcpy (Lwavdnptr, labdn->data, width*height*sizeof(float)); - //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // now do detail recovery using block DCT to detect @@ -1269,21 +1271,19 @@ if(settings->leveldnti ==1) { #else int subThread = 0; #endif - AlignedBuffer pBuf(width + TS + 2*blkrad*offset); - float * nbrwt = nbrwtArray[subThread]; + float blurbuffer[TS*TS] ALIGNED64; float * Lblox = LbloxArray[subThread]; - float * fLblox = fLbloxArray[subThread];; - float * blurbuffer = new float[TS*TS]; + float * fLblox = fLbloxArray[subThread]; + float pBuf[width + TS + 2*blkrad*offset] ALIGNED16; + float nbrwt[TS*TS] ALIGNED64; #ifdef _OPENMP #pragma omp for #endif for (int vblk=0; vblkleveldnti ==1) { } for (int j=0; jW; j++) { - datarow[j] = (Lin[rr][j]-Lwavdn[rr][j]); + datarow[j] = (Lin[rr][j]-labdn->L[rr][j]); } for (int j=-blkrad*offset; j<0; j++) { @@ -1305,13 +1305,12 @@ if(settings->leveldnti ==1) { }//now we have a padded data row //now fill this row of the blocks with Lab high pass data - //OMP here does not add speed, better handled on the outside loop for (int hblk=0; hblk=0 && top+i=0 && left+jleveldnti ==1) { }//end of vertical block loop //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - delete [] blurbuffer; } } @@ -1360,7 +1358,7 @@ if(settings->leveldnti ==1) { //may want to include masking threshold for large hipass data to preserve edges/detail float hpdn = Ldetail[i][j]/totwt[i][j];//note that labdn initially stores the denoised hipass data - labdn->L[i][j] = Lwavdn[i][j] + hpdn; + labdn->L[i][j] += hpdn; } } @@ -1369,8 +1367,8 @@ if(settings->leveldnti ==1) { // transform denoised "Lab" to output RGB //calculate mask for feathering output tile overlaps - float * Vmask = new float [height+1]; - float * Hmask = new float [width+1]; + float Vmask[height+1] ALIGNED16; + float Hmask[width+1] ALIGNED16; for (int i=0; ileveldnti ==1) { } for (int i=0; i0) Vmask[i] = mask; if (tilebottom0) Hmask[i] = mask/newGain; @@ -1514,28 +1512,23 @@ if(settings->leveldnti ==1) { //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% delete labdn; - for (int i=0; i maxWL) maxWL = WaveletCoeffs_L.level_W(lvl); if(WaveletCoeffs_L.level_H(lvl) > maxHL) maxHL = WaveletCoeffs_L.level_H(lvl); - if(WaveletCoeffs_a.level_W(lvl) > maxWab) - maxWab = WaveletCoeffs_a.level_W(lvl); - if(WaveletCoeffs_a.level_H(lvl) > maxHab) - maxHab = WaveletCoeffs_a.level_H(lvl); } #ifdef _OPENMP #pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif { - float *buffer[6]; + float *buffer[5]; buffer[0] = new float[maxWL*maxHL+32]; buffer[1] = new float[maxWL*maxHL+64]; buffer[2] = new float[maxWL*maxHL+96]; - buffer[3] = new float[maxWab*maxHab+128]; + buffer[3] = new float[maxWL*maxHL+128]; buffer[4] = new float[maxWL*maxHL+160]; - buffer[5] = new float[maxWL*maxHL+192]; #ifdef _OPENMP @@ -1994,9 +1982,6 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi int Wlvl_L = WaveletCoeffs_L.level_W(lvl); int Hlvl_L = WaveletCoeffs_L.level_H(lvl); - if(Wlvl_L!=width) Wlvl_L=width; - if(Hlvl_L!=height) Hlvl_L=height; - int Wlvl_ab = WaveletCoeffs_a.level_W(lvl); int Hlvl_ab = WaveletCoeffs_a.level_H(lvl); @@ -2024,9 +2009,6 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi int Wlvl_L = WaveletCoeffs_L.level_W(lvl); int Hlvl_L = WaveletCoeffs_L.level_H(lvl); - if(Wlvl_L!=width) Wlvl_L=width; - if(Hlvl_L!=height) Hlvl_L=height; - int Wlvl_ab = WaveletCoeffs_a.level_W(lvl); int Hlvl_ab = WaveletCoeffs_a.level_H(lvl); @@ -2044,8 +2026,7 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi } else { //simple wavelet shrinkage float * sfave = buffer[0]+32; - float * sfaved = buffer[4]+160; - float * WavCoeffsLtemp = buffer[3]+128; + float * sfaved = buffer[3]+128; float * blurBuffer = buffer[1]+64; for (int dir=1; dir<4; dir++) { float mad_Lr = madL[lvl][dir-1]; @@ -2070,48 +2051,42 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi } } if (noisevar_abr>0.001f || noisevar_abb>0.001f) { - for(int i=0;i=0;i--) + for(int i=4;i>=0;i--) delete [] buffer[i]; } @@ -2214,16 +2182,12 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi { int maxlvl = WaveletCoeffs_L.maxlevel(); - int maxWL = 0, maxHL = 0, maxWab = 0, maxHab = 0; + int maxWL = 0, maxHL = 0; for (int lvl=0; lvl maxWL) maxWL = WaveletCoeffs_L.level_W(lvl); if(WaveletCoeffs_L.level_H(lvl) > maxHL) maxHL = WaveletCoeffs_L.level_H(lvl); - if(WaveletCoeffs_a.level_W(lvl) > maxWab) - maxWab = WaveletCoeffs_a.level_W(lvl); - if(WaveletCoeffs_a.level_H(lvl) > maxHab) - maxHab = WaveletCoeffs_a.level_H(lvl); } @@ -2231,13 +2195,12 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi #pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1) #endif { - float *buffer[6]; + float *buffer[5]; buffer[0] = new float[maxWL*maxHL+32]; buffer[1] = new float[maxWL*maxHL+64]; buffer[2] = new float[maxWL*maxHL+96]; - buffer[3] = new float[maxWab*maxHab+128]; + buffer[3] = new float[maxWL*maxHL+128]; buffer[4] = new float[maxWL*maxHL+160]; - buffer[5] = new float[maxWL*maxHL+192]; #ifdef _OPENMP @@ -2268,7 +2231,7 @@ SSEFUNCTION void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi skip_L, skip_ab, noisevar_L, noisevarlum, noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, perf); } - for(int i=5;i>=0;i--) + for(int i=4;i>=0;i--) delete [] buffer[i]; } } @@ -2282,18 +2245,17 @@ SSEFUNCTION void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo { //simple wavelet shrinkage const float eps = 0.01f; - W_L = width; - H_L = height; - if(autoch && noisevar_abr <=0.001f) noisevar_abr=0.02f; - if(autoch && noisevar_abb <=0.001f) noisevar_abb=0.02f; + if(autoch && noisevar_abr <=0.001f) + noisevar_abr=0.02f; + if(autoch && noisevar_abb <=0.001f) + noisevar_abb=0.02f; float * sfavea = buffer[0]+32; float * sfaveb = buffer[1]+64; - float * sfavead = buffer[4]+160; - float * sfavebd = buffer[5]+192; + float * sfavead = buffer[3]+128; + float * sfavebd = buffer[4]+160; float * sfave = sfavea; // we can safely reuse sfavea here, because they are not used together float * sfaved = sfavead; // we can safely reuse sfavead here, because they are not used together - float * WavCoeffsLtemp = buffer[3]+128; float * blurBuffer = buffer[2]+96; for (int dir=1; dir<4; dir++) { @@ -2332,39 +2294,31 @@ SSEFUNCTION void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo } if (noisevar_abr>0.001f || noisevar_abb>0.001f ) { - - for(int i=0;i SSEFUNCTION void boxabsblur (T* src, A* dst, int radx //horizontal blur for (int row = 0; row < H; row++) { int len = radx + 1; - temp[row*W+0] = fabsf((float)src[row*W+0]); + float tempval = fabsf((float)src[row*W+0]); for (int j=1; j<=radx; j++) { - temp[row*W+0] += fabsf((float)src[row*W+j]); + tempval += fabsf((float)src[row*W+j]); } - temp[row*W+0] = temp[row*W+0] / len; + tempval /= len; + temp[row*W+0] = tempval; for (int col=1; col<=radx; col++) { - temp[row*W+col] = (temp[row*W+col-1]*len + fabsf(src[row*W+col+radx]))/(len+1); + tempval = (tempval*len + fabsf(src[row*W+col+radx]))/(len+1); + temp[row*W+col] = tempval; len ++; } + float rlen = 1.f/(float)len; for (int col = radx+1; col < W-radx; col++) { - temp[row*W+col] = temp[row*W+col-1] + ((float)(fabsf(src[row*W+col+radx]) - fabsf(src[row*W+col-radx-1])))/len; + tempval = tempval + ((float)(fabsf(src[row*W+col+radx]) - fabsf(src[row*W+col-radx-1])))*rlen; + temp[row*W+col] = tempval; } for (int col=W-radx; col SSEFUNCTION void boxabsblur (T* src, A* dst, int radx #ifdef __SSE2__ __m128 leninitv = _mm_set1_ps( (float)(rady+1)); __m128 onev = _mm_set1_ps( 1.0f ); - __m128 tempv,lenv,lenp1v,lenm1v; + __m128 tempv,lenv,lenp1v,lenm1v,rlenv; for (int col = 0; col < W-3; col+=4) { lenv = leninitv; - tempv = LVFU(temp[0*W+col]); + tempv = LVF(temp[0*W+col]); for (int i=1; i<=rady; i++) { - tempv = tempv + LVFU(temp[i*W+col]); + tempv = tempv + LVF(temp[i*W+col]); } tempv = tempv / lenv; - _mm_storeu_ps( &dst[0*W+col], tempv ); + _mm_store_ps( &dst[0*W+col], tempv ); for (int row=1; row<=rady; row++) { lenp1v = lenv + onev; - tempv = (tempv*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v; - _mm_storeu_ps( &dst[row*W+col],tempv); + tempv = (tempv*lenv + LVF(temp[(row+rady)*W+col]))/lenp1v; + _mm_store_ps( &dst[row*W+col],tempv); lenv = lenp1v; } + rlenv = onev / lenv; for (int row = rady+1; row < H-rady; row++) { - tempv = tempv + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))/lenv; - _mm_storeu_ps( &dst[row*W+col], tempv); + tempv = tempv + (LVF(temp[(row+rady)*W+col])- LVF(temp[(row-rady-1)*W+col]))*rlenv; + _mm_store_ps( &dst[row*W+col], tempv); } for (int row=H-rady; row #include "rt_math.h" #include "opthelper.h" - namespace rtengine { template @@ -50,15 +49,15 @@ namespace rtengine { // load a row/column of input data, possibly with padding - void AnalysisFilterHaarVertical (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen); - void AnalysisFilterHaarHorizontal (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen); + void AnalysisFilterHaarVertical (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen, int row); + void AnalysisFilterHaarHorizontal (T * srcbuffer, T * dstLo, T * dstHi, int srclen, int row); void SynthesisFilterHaarHorizontal (T * srcLo, T * srcHi, T * dst, int dstlen); void SynthesisFilterHaarVertical (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen); void AnalysisFilterSubsampHorizontal (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, - int taps, int offset, int pitch, int srclen, int m_w2); + int taps, int offset, int pitch, int srclen, int m_w2, int row); void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, - int taps, int offset, int pitch, int srclen); + int taps, int offset, int pitch, int srclen, int row); void SynthesisFilterSubsampHorizontal (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, int taps, int offset, int dstlen); void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen); @@ -66,7 +65,7 @@ namespace rtengine { public: T ** wavcoeffs; - // full size + // full size size_t m_w, m_h; // size of low frequency part @@ -152,38 +151,33 @@ namespace rtengine { } template - void wavelet_level::AnalysisFilterHaarHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen) { - + void wavelet_level::AnalysisFilterHaarHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int srclen, int row) { /* Basic convolution code * Applies a Haar filter - */ - for(int j=0;j void wavelet_level::AnalysisFilterHaarVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen) { - + template void wavelet_level::AnalysisFilterHaarVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen, int row) { /* Basic convolution code * Applies a Haar filter */ - for(int i = 0; i < (srclen - skip); i++) { + if(row < (srclen - skip)) { for(int j=0;j=max(srclen-skip,skip)) { for(int j=0;j void wavelet_level::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *filterHi, - int taps, int offset, int pitch, int srclen, int m_w2) { - + int taps, int offset, int pitch, int srclen, int m_w2, int row) { /* Basic convolution code * Applies an FIR filter 'filter' with filter length 'taps', * aligning the 'offset' element of the filter with * the input pixel, and skipping 'skip' pixels between taps * Output is subsampled by two */ - - // calculate coefficients - for(int k=0;kskip*taps && i void wavelet_level::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, - int taps, int offset, int pitch, int srclen) { + int taps, int offset, int pitch, int srclen, int row) { + /* Basic convolution code * Applies an FIR filter 'filter' with filter length 'taps', * aligning the 'offset' element of the filter with @@ -269,31 +260,28 @@ namespace rtengine { */ // calculate coefficients - - for(int i = 0; i < srclen; i+=2) { - if (LIKELY(i>skip*taps && iskip*taps && row void wavelet_level::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int dstlen) { @@ -325,7 +313,7 @@ namespace rtengine { tot += ((filterLo[j] * srcLo[k*m_w2+arg] + filterHi[j] * srcHi[k*m_w2+arg])); } } - dst[k*m_w+(i-m_pad)] = 2.f * tot; + dst[k*m_w+(i-m_pad)] = tot; } } } @@ -353,7 +341,7 @@ namespace rtengine { for (int j=begin, l=0; j template void wavelet_level::decompose_level(E *src, float *filterV, float *filterH, int taps, int offset) { - T *tmpLo = new T[m_w*m_h2]; - T *tmpHi = new T[m_w*m_h2]; - + T tmpLo[m_w] ALIGNED64; + T tmpHi[m_w] ALIGNED64; /* filter along rows and columns */ if(subsamp_out) { - AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/); - AnalysisFilterSubsampHorizontal (tmpLo, wavcoeffs[0], wavcoeffs[1], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2); - AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2); + for(int row=0;row template void wavelet_level::reconstruct_level(E* tmpLo, E* tmpHi, E *dst, float *filterV, float *filterH, int taps, int offset) {