SSE2 opt for Denoise, Issue 1890
This commit is contained in:
@@ -39,7 +39,9 @@
|
||||
#include "boxblur.h"
|
||||
#include "rt_math.h"
|
||||
#include "sleef.c"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#include "sleefsseavx.c"
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
@@ -84,7 +86,7 @@ namespace rtengine {
|
||||
|
||||
|
||||
void ImProcFunctions::RGB_denoise(Imagefloat * src, Imagefloat * dst, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const procparams::DefringeParams & defringe, const double expcomp)
|
||||
{
|
||||
{
|
||||
static Glib::Mutex FftwMutex;
|
||||
Glib::Mutex::Lock lock(FftwMutex);
|
||||
|
||||
@@ -741,7 +743,7 @@ namespace rtengine {
|
||||
fftwf_destroy_plan( plan_forward_blox[1] );
|
||||
fftwf_destroy_plan( plan_backward_blox[1] );
|
||||
fftwf_cleanup();
|
||||
|
||||
|
||||
}//end of main RGB_denoise
|
||||
|
||||
|
||||
@@ -750,19 +752,32 @@ namespace rtengine {
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
|
||||
void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT
|
||||
#if defined( __SSE2__ ) && defined( WIN32 )
|
||||
__attribute__((force_align_arg_pointer)) void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT
|
||||
#else
|
||||
void ImProcFunctions::RGBtile_denoise (float * fLblox, int vblproc, int hblproc, int numblox_H, int numblox_W, float noisevar_Ldetail ) //for DCT
|
||||
#endif
|
||||
{
|
||||
float * nbrwt = new float[TS*TS]; //for DCT
|
||||
int blkstart = hblproc*TS*TS;
|
||||
|
||||
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
|
||||
|
||||
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
|
||||
|
||||
#ifdef __SSE2__
|
||||
__m128 tempv;
|
||||
__m128 noisevar_Ldetailv = _mm_set1_ps( noisevar_Ldetail );
|
||||
__m128 onev = _mm_set1_ps( 1.0f );
|
||||
for (int n=0; n<TS*TS; n+=4) { //for DCT
|
||||
tempv = onev - xexpf( -SQRV( LVFU(nbrwt[n]))/noisevar_Ldetailv);
|
||||
_mm_storeu_ps( &fLblox[blkstart+n], LVFU(fLblox[blkstart+n]) * tempv );
|
||||
}//output neighbor averaged result
|
||||
|
||||
#else
|
||||
#pragma omp parallel for
|
||||
for (int n=0; n<TS*TS; n++) { //for DCT
|
||||
fLblox[blkstart+n] *= (1-xexpf(-SQR(nbrwt[n])/noisevar_Ldetail));
|
||||
}//output neighbor averaged result
|
||||
|
||||
#endif
|
||||
delete[] nbrwt;
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
@@ -783,10 +798,10 @@ namespace rtengine {
|
||||
int bottom = MIN( top+TS,height);
|
||||
int imax = bottom - top;
|
||||
|
||||
//add row of tiles to output image
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
//add row of tiles to output image
|
||||
for (int hblk=0; hblk < numblox_W; hblk++) {
|
||||
int left = (hblk-blkrad)*offset;
|
||||
int right = MIN(left+TS, width);
|
||||
@@ -796,12 +811,10 @@ namespace rtengine {
|
||||
|
||||
for (int i=imin; i<imax; i++)
|
||||
for (int j=jmin; j<jmax; j++) {
|
||||
|
||||
Ldetail[top+i][left+j] += tilemask_out[i][j]*bloxrow_L[(indx + i)*TS+j]*DCTnorm; //for DCT
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#undef TS
|
||||
@@ -1056,10 +1069,13 @@ namespace rtengine {
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
#if defined( __SSE2__ ) && defined( WIN32 )
|
||||
__attribute__((force_align_arg_pointer)) void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
|
||||
int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float noisevar_ab, LabImage * noi)
|
||||
#else
|
||||
void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
|
||||
int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float noisevar_ab, LabImage * noi)
|
||||
|
||||
#endif
|
||||
|
||||
{
|
||||
//simple wavelet shrinkage
|
||||
@@ -1172,7 +1188,20 @@ namespace rtengine {
|
||||
}
|
||||
|
||||
if (noisevar_L>0.01) {
|
||||
//OpenMP here
|
||||
#ifdef __SSE2__
|
||||
__m128 magv;
|
||||
__m128 mad_Lv = _mm_set1_ps( mad_L );
|
||||
__m128 ninev = _mm_set1_ps( 9.0f );
|
||||
__m128 epsv = _mm_set1_ps( eps );
|
||||
for (int i=0; i<W_L*H_L-3; i+=4) {
|
||||
magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
|
||||
_mm_storeu_ps( &sfave[i], magv / (magv + mad_Lv*xexpf(-magv/(ninev * mad_Lv)) + epsv));
|
||||
}
|
||||
for (int i=(W_L*H_L)-((W_L*H_L)%4); i<W_L*H_L; i++) {
|
||||
float mag = SQR(WavCoeffs_L[dir][i]);
|
||||
sfave[i] = mag/(mag+mad_L*xexpf(-mag/(9*mad_L))+eps);
|
||||
}
|
||||
#else
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
@@ -1183,9 +1212,26 @@ namespace rtengine {
|
||||
|
||||
//WavCoeffs_L[dir][i] *= shrinkfactor;
|
||||
sfave[i] = shrinkfactor;
|
||||
}
|
||||
//OpenMP here
|
||||
}
|
||||
#endif
|
||||
boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
|
||||
#ifdef __SSE2__
|
||||
__m128 sfv;
|
||||
for (int i=0; i<W_L*H_L-3; i+=4) {
|
||||
magv = SQRV( LVFU(WavCoeffs_L[dir][i]));
|
||||
sfv = magv/(magv + mad_Lv * xexpf( -magv / (ninev * mad_Lv)) + epsv );
|
||||
//use smoothed shrinkage unless local shrinkage is much less
|
||||
_mm_storeu_ps( &WavCoeffs_L[dir][i], _mm_loadu_ps( &WavCoeffs_L[dir][i]) * (SQRV( LVFU(sfave[i] )) + SQRV(sfv)) / (LVFU(sfave[i])+sfv+epsv));
|
||||
}
|
||||
for (int i=(W_L*H_L)-((W_L*H_L)%4); i<W_L*H_L; i++) {
|
||||
float mag = SQR(WavCoeffs_L[dir][i]);
|
||||
float sf = mag/(mag+mad_L*xexpf(-mag/(9*mad_L))+eps);
|
||||
|
||||
//use smoothed shrinkage unless local shrinkage is much less
|
||||
WavCoeffs_L[dir][i] *= (SQR(sfave[i])+SQR(sf))/(sfave[i]+sf+eps);
|
||||
}//now luminance coefficients are denoised
|
||||
|
||||
#else
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
@@ -1198,7 +1244,8 @@ namespace rtengine {
|
||||
//use smoothed shrinkage unless local shrinkage is much less
|
||||
WavCoeffs_L[dir][i] *= (SQR(sfave[i])+SQR(sf))/(sfave[i]+sf+eps);
|
||||
|
||||
}//now luminance coefficients are denoised
|
||||
}//now luminance coefficients are denoised
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@@ -29,6 +29,10 @@
|
||||
#endif
|
||||
|
||||
#include "rt_math.h"
|
||||
#ifdef __SSE2__
|
||||
#include "sleefsseavx.c"
|
||||
#endif
|
||||
|
||||
//using namespace rtengine;
|
||||
|
||||
namespace rtengine {
|
||||
@@ -116,15 +120,17 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
#if defined( __SSE2__ ) && defined( WIN32 )
|
||||
template<class T, class A> __attribute__((force_align_arg_pointer)) void boxblur (T* src, A* dst, int radx, int rady, int W, int H) {
|
||||
#else
|
||||
template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int W, int H) {
|
||||
|
||||
#endif
|
||||
//printf("boxblur\n");
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
|
||||
|
||||
AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
|
||||
float* temp = buffer->data;
|
||||
|
||||
if (radx==0) {
|
||||
for (int row=0; row<H; row++)
|
||||
for (int col=0; col<W; col++) {
|
||||
@@ -137,10 +143,11 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
|
||||
#endif
|
||||
for (int row = 0; row < H; row++) {
|
||||
int len = radx + 1;
|
||||
temp[row*W+0] = (float)src[row*W+0]/len;
|
||||
temp[row*W+0] = (float)src[row*W+0];
|
||||
for (int j=1; j<=radx; j++) {
|
||||
temp[row*W+0] += (float)src[row*W+j]/len;
|
||||
temp[row*W+0] += (float)src[row*W+j];
|
||||
}
|
||||
temp[row*W+0] = temp[row*W+0]/len;
|
||||
for (int col=1; col<=radx; col++) {
|
||||
temp[row*W+col] = (temp[row*W+col-1]*len + src[row*W+col+radx])/(len+1);
|
||||
len ++;
|
||||
@@ -165,6 +172,50 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
|
||||
}
|
||||
} else {
|
||||
//vertical blur
|
||||
#ifdef __SSE2__
|
||||
__m128 leninitv = _mm_set1_ps( (float)(rady+1));
|
||||
__m128 onev = _mm_set1_ps( 1.0f );
|
||||
__m128 tempv,lenv,lenp1v,lenm1v;
|
||||
for (int col = 0; col < W-3; col+=4) {
|
||||
lenv = leninitv;
|
||||
tempv = LVFU(temp[0*W+col]);
|
||||
for (int i=1; i<=rady; i++) {
|
||||
tempv = tempv + LVFU(temp[i*W+col]);
|
||||
}
|
||||
_mm_storeu_ps( &dst[0*W+col], tempv / lenv );
|
||||
for (int row=1; row<=rady; row++) {
|
||||
lenp1v = lenv + onev;
|
||||
_mm_storeu_ps( &dst[row*W+col], (LVFU(dst[(row-1)*W+col])*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v);
|
||||
lenv = lenp1v;
|
||||
}
|
||||
for (int row = rady+1; row < H-rady; row++) {
|
||||
_mm_storeu_ps( &dst[row*W+col], LVFU(dst[(row-1)*W+col]) +(LVFU(temp[(row+rady)*W+col])-LVFU(temp[(row-rady-1)*W+col]))/lenv );
|
||||
}
|
||||
for (int row=H-rady; row<H; row++) {
|
||||
lenm1v = lenv - onev;
|
||||
_mm_storeu_ps( &dst[row*W+col], (LVFU(dst[(row-1)*W+col])*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v);
|
||||
lenv = lenm1v;
|
||||
}
|
||||
}
|
||||
for (int col = W-(W%4); col < W; col++) {
|
||||
int len = rady + 1;
|
||||
dst[0*W+col] = temp[0*W+col]/len;
|
||||
for (int i=1; i<=rady; i++) {
|
||||
dst[0*W+col] += temp[i*W+col]/len;
|
||||
}
|
||||
for (int row=1; row<=rady; row++) {
|
||||
dst[row*W+col] = (dst[(row-1)*W+col]*len + temp[(row+rady)*W+col])/(len+1);
|
||||
len ++;
|
||||
}
|
||||
for (int row = rady+1; row < H-rady; row++) {
|
||||
dst[row*W+col] = dst[(row-1)*W+col] + (temp[(row+rady)*W+col] - temp[(row-rady-1)*W+col])/len;
|
||||
}
|
||||
for (int row=H-rady; row<H; row++) {
|
||||
dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
|
||||
len --;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
@@ -186,6 +237,7 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
|
||||
len --;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
delete buffer;
|
||||
@@ -612,9 +664,12 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
#if defined( __SSE2__ ) && defined( WIN32 )
|
||||
template<class T, class A> __attribute__((force_align_arg_pointer)) void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H) {
|
||||
#else
|
||||
template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H) {
|
||||
|
||||
#endif
|
||||
|
||||
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
|
||||
|
||||
@@ -637,19 +692,20 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
|
||||
#endif
|
||||
for (int row = 0; row < H; row++) {
|
||||
int len = radx + 1;
|
||||
temp[row*W+0] = fabs((float)src[row*W+0])/len;
|
||||
temp[row*W+0] = fabsf((float)src[row*W+0]);
|
||||
for (int j=1; j<=radx; j++) {
|
||||
temp[row*W+0] += fabs((float)src[row*W+j])/len;
|
||||
temp[row*W+0] += fabsf((float)src[row*W+j]);
|
||||
}
|
||||
temp[row*W+0] = temp[row*W+0] / len;
|
||||
for (int col=1; col<=radx; col++) {
|
||||
temp[row*W+col] = (temp[row*W+col-1]*len + fabs(src[row*W+col+radx]))/(len+1);
|
||||
temp[row*W+col] = (temp[row*W+col-1]*len + fabsf(src[row*W+col+radx]))/(len+1);
|
||||
len ++;
|
||||
}
|
||||
for (int col = radx+1; col < W-radx; col++) {
|
||||
temp[row*W+col] = temp[row*W+col-1] + ((float)(fabs(src[row*W+col+radx]) - fabs(src[row*W+col-radx-1])))/len;
|
||||
temp[row*W+col] = temp[row*W+col-1] + ((float)(fabsf(src[row*W+col+radx]) - fabsf(src[row*W+col-radx-1])))/len;
|
||||
}
|
||||
for (int col=W-radx; col<W; col++) {
|
||||
temp[row*W+col] = (temp[row*W+col-1]*len - fabs(src[row*W+col-radx-1]))/(len-1);
|
||||
temp[row*W+col] = (temp[row*W+col-1]*len - fabsf(src[row*W+col-radx-1]))/(len-1);
|
||||
len --;
|
||||
}
|
||||
}
|
||||
@@ -665,6 +721,52 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
|
||||
}
|
||||
} else {
|
||||
//vertical blur
|
||||
#ifdef __SSE2__
|
||||
__m128 leninitv = _mm_set1_ps( (float)(rady+1));
|
||||
__m128 onev = _mm_set1_ps( 1.0f );
|
||||
__m128 tempv,lenv,lenp1v,lenm1v;
|
||||
for (int col = 0; col < W-3; col+=4) {
|
||||
lenv = leninitv;
|
||||
tempv = LVFU(temp[0*W+col]);
|
||||
for (int i=1; i<=rady; i++) {
|
||||
tempv = tempv + LVFU(temp[i*W+col]);
|
||||
}
|
||||
_mm_storeu_ps( &dst[0*W+col], tempv / lenv );
|
||||
for (int row=1; row<=rady; row++) {
|
||||
lenp1v = lenv + onev;
|
||||
_mm_storeu_ps( &dst[row*W+col],(LVFU(dst[(row-1)*W+col])*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v );
|
||||
lenv = lenp1v;
|
||||
}
|
||||
for (int row = rady+1; row < H-rady; row++) {
|
||||
_mm_storeu_ps( &dst[row*W+col], LVFU(dst[(row-1)*W+col]) + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))/lenv);
|
||||
}
|
||||
for (int row=H-rady; row<H; row++) {
|
||||
lenm1v = lenv - onev;
|
||||
_mm_storeu_ps( &dst[row*W+col], (LVFU(dst[(row-1)*W+col])*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v);
|
||||
lenv = lenm1v;
|
||||
}
|
||||
}
|
||||
for (int col = W-(W%4); col < W; col++) {
|
||||
int len = rady + 1;
|
||||
dst[0*W+col] = temp[0*W+col]/len;
|
||||
for (int i=1; i<=rady; i++) {
|
||||
dst[0*W+col] += temp[i*W+col]/len;
|
||||
}
|
||||
for (int row=1; row<=rady; row++) {
|
||||
dst[row*W+col] = (dst[(row-1)*W+col]*len + temp[(row+rady)*W+col])/(len+1);
|
||||
len ++;
|
||||
}
|
||||
for (int row = rady+1; row < H-rady; row++) {
|
||||
dst[row*W+col] = dst[(row-1)*W+col] + (temp[(row+rady)*W+col] - temp[(row-rady-1)*W+col])/len;
|
||||
}
|
||||
for (int row=H-rady; row<H; row++) {
|
||||
dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
|
||||
len --;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
//OpenMP here
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
@@ -687,6 +789,7 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
|
||||
len --;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
delete buffer;
|
||||
|
@@ -26,6 +26,8 @@ typedef __m128i vint2;
|
||||
|
||||
//
|
||||
|
||||
#define LVFU(x) _mm_loadu_ps(&x)
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
|
||||
|
@@ -1291,5 +1291,10 @@ static INLINE vfloat xcbrtf(vfloat d) {
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
static INLINE vfloat SQRV(vfloat a){
|
||||
return _mm_mul_ps( a,a );
|
||||
}
|
||||
|
||||
#endif // __SSE2__
|
||||
#endif // SLEEFSSEAVX
|
||||
|
Reference in New Issue
Block a user