diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 73c23a4cf..7b6985b99 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -39,7 +39,9 @@ #include "boxblur.h" #include "rt_math.h" #include "sleef.c" - +#ifdef __SSE2__ + #include "sleefsseavx.c" +#endif #ifdef _OPENMP #include @@ -84,7 +86,7 @@ namespace rtengine { void ImProcFunctions::RGB_denoise(Imagefloat * src, Imagefloat * dst, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const procparams::DefringeParams & defringe, const double expcomp) - { + { static Glib::Mutex FftwMutex; Glib::Mutex::Lock lock(FftwMutex); @@ -741,7 +743,7 @@ namespace rtengine { fftwf_destroy_plan( plan_forward_blox[1] ); fftwf_destroy_plan( plan_backward_blox[1] ); fftwf_cleanup(); - + }//end of main RGB_denoise @@ -750,19 +752,32 @@ namespace rtengine { //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT +#if defined( __SSE2__ ) && defined( WIN32 ) +__attribute__((force_align_arg_pointer)) void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT +#else + void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT +#endif { float * nbrwt = new float[TS*TS]; //for DCT int blkstart = hblproc*TS*TS; - boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT - + boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT + +#ifdef __SSE2__ + __m128 tempv; + __m128 noisevar_Ldetailv = _mm_set1_ps( noisevar_Ldetail ); + __m128 onev = _mm_set1_ps( 1.0f ); + for (int n=0; n0.01) { -//OpenMP here +#ifdef __SSE2__ + __m128 magv; + __m128 mad_Lv = _mm_set1_ps( mad_L ); + __m128 ninev = _mm_set1_ps( 9.0f ); + __m128 epsv = _mm_set1_ps( eps ); + for (int i=0; i void boxblur (T** src, A** dst, int radx, int rady, i //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - +#if defined( __SSE2__ ) && defined( WIN32 ) +template __attribute__((force_align_arg_pointer)) void boxblur (T* src, A* dst, int radx, int rady, int W, int H) { +#else template void boxblur (T* src, A* dst, int radx, int rady, int W, int H) { - +#endif +//printf("boxblur\n"); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1) AlignedBuffer* buffer = new AlignedBuffer (W*H); float* temp = buffer->data; - if (radx==0) { for (int row=0; row void boxblur (T* src, A* dst, int radx, int rady, int #endif for (int row = 0; row < H; row++) { int len = radx + 1; - temp[row*W+0] = (float)src[row*W+0]/len; + temp[row*W+0] = (float)src[row*W+0]; for (int j=1; j<=radx; j++) { - temp[row*W+0] += (float)src[row*W+j]/len; + temp[row*W+0] += (float)src[row*W+j]; } + temp[row*W+0] = temp[row*W+0]/len; for (int col=1; col<=radx; col++) { temp[row*W+col] = (temp[row*W+col-1]*len + src[row*W+col+radx])/(len+1); len ++; @@ -165,6 +172,50 @@ template void boxblur (T* src, A* dst, int radx, int rady, int } } else { //vertical blur +#ifdef __SSE2__ + __m128 leninitv = _mm_set1_ps( (float)(rady+1)); + __m128 onev = _mm_set1_ps( 1.0f ); + __m128 tempv,lenv,lenp1v,lenm1v; + for (int col = 0; col < W-3; col+=4) { + lenv = leninitv; + tempv = LVFU(temp[0*W+col]); + for (int i=1; i<=rady; i++) { + tempv = tempv + LVFU(temp[i*W+col]); + } + _mm_storeu_ps( &dst[0*W+col], tempv / lenv ); + for (int row=1; row<=rady; row++) { + lenp1v = lenv + onev; + _mm_storeu_ps( &dst[row*W+col], (LVFU(dst[(row-1)*W+col])*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v); + lenv = lenp1v; + } + for (int row = rady+1; row < H-rady; row++) { + _mm_storeu_ps( &dst[row*W+col], LVFU(dst[(row-1)*W+col]) +(LVFU(temp[(row+rady)*W+col])-LVFU(temp[(row-rady-1)*W+col]))/lenv ); + } + for (int row=H-rady; row void boxblur (T* src, A* dst, int radx, int rady, int len --; } } +#endif } delete buffer; @@ -612,9 +664,12 @@ template void boxcorrelate (T* src, A* dst, int dx, int dy, in //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - +#if defined( __SSE2__ ) && defined( WIN32 ) +template __attribute__((force_align_arg_pointer)) void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H) { +#else template void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H) { - +#endif + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1) @@ -637,19 +692,20 @@ template void boxabsblur (T* src, A* dst, int radx, int rady, #endif for (int row = 0; row < H; row++) { int len = radx + 1; - temp[row*W+0] = fabs((float)src[row*W+0])/len; + temp[row*W+0] = fabsf((float)src[row*W+0]); for (int j=1; j<=radx; j++) { - temp[row*W+0] += fabs((float)src[row*W+j])/len; + temp[row*W+0] += fabsf((float)src[row*W+j]); } + temp[row*W+0] = temp[row*W+0] / len; for (int col=1; col<=radx; col++) { - temp[row*W+col] = (temp[row*W+col-1]*len + fabs(src[row*W+col+radx]))/(len+1); + temp[row*W+col] = (temp[row*W+col-1]*len + fabsf(src[row*W+col+radx]))/(len+1); len ++; } for (int col = radx+1; col < W-radx; col++) { - temp[row*W+col] = temp[row*W+col-1] + ((float)(fabs(src[row*W+col+radx]) - fabs(src[row*W+col-radx-1])))/len; + temp[row*W+col] = temp[row*W+col-1] + ((float)(fabsf(src[row*W+col+radx]) - fabsf(src[row*W+col-radx-1])))/len; } for (int col=W-radx; col void boxabsblur (T* src, A* dst, int radx, int rady, } } else { //vertical blur +#ifdef __SSE2__ + __m128 leninitv = _mm_set1_ps( (float)(rady+1)); + __m128 onev = _mm_set1_ps( 1.0f ); + __m128 tempv,lenv,lenp1v,lenm1v; + for (int col = 0; col < W-3; col+=4) { + lenv = leninitv; + tempv = LVFU(temp[0*W+col]); + for (int i=1; i<=rady; i++) { + tempv = tempv + LVFU(temp[i*W+col]); + } + _mm_storeu_ps( &dst[0*W+col], tempv / lenv ); + for (int row=1; row<=rady; row++) { + lenp1v = lenv + onev; + _mm_storeu_ps( &dst[row*W+col],(LVFU(dst[(row-1)*W+col])*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v ); + lenv = lenp1v; + } + for (int row = rady+1; row < H-rady; row++) { + _mm_storeu_ps( &dst[row*W+col], LVFU(dst[(row-1)*W+col]) + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))/lenv); + } + for (int row=H-rady; row void boxabsblur (T* src, A* dst, int radx, int rady, len --; } } +#endif } delete buffer; diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index 1a4e5d6f8..3f83ba16e 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -26,6 +26,8 @@ typedef __m128i vint2; // +#define LVFU(x) _mm_loadu_ps(&x) + static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index 5308c4695..057064b26 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -1291,5 +1291,10 @@ static INLINE vfloat xcbrtf(vfloat d) { return y; } + +static INLINE vfloat SQRV(vfloat a){ + return _mm_mul_ps( a,a ); +} + #endif // __SSE2__ #endif // SLEEFSSEAVX