From 0bceb1e6df93418c0a1dadd7fde211bf97f61328 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Mon, 24 Aug 2015 13:13:48 +0200 Subject: [PATCH] Speedup 2 for DeHaze --- rtengine/ipdehaz.cc | 90 +++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/rtengine/ipdehaz.cc b/rtengine/ipdehaz.cc index 802528780..5a5b5c112 100644 --- a/rtengine/ipdehaz.cc +++ b/rtengine/ipdehaz.cc @@ -37,6 +37,7 @@ #include "gauss.h" #include "rawimagesource.h" #include "improcfun.h" +#include "opthelper.h" #include "StopWatch.h" #define MAX_DEHAZE_SCALES 6 #define clipdehaz( val, minv, maxv ) (( val = (val < minv ? minv : val ) ) > maxv ? maxv : val ) @@ -83,11 +84,10 @@ void dehaze_scales( float* scales, int nscales, int mode, int s) void mean_stddv( float **dst, float &mean, float &stddv, int W_L, int H_L ) { float vsquared; - int i, j; vsquared = 0.0f; mean = 0.0f; -// #pragma omp parallel for reduction(+:mean,vsquared) // will enable this later, because naturally it leads to differences +#pragma omp parallel for reduction(+:mean,vsquared) // this leads to differences, but parallel summation is more accurate for (int i = 0; i L[i][j] + eps; } } + dehaze_scales( DehazeScales, scal, modedehaz, nei ); pond = 1.0f / (float) scal; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < H_L ; i++ ) - for (int j=0; j* pBuffer = new AlignedBufferMP (max(W_L, H_L)); - gaussHorizontal (in, out, *pBuffer, W_L, H_L, DehazeScales[scale]); + gaussHorizontal (src, out, *pBuffer, W_L, H_L, DehazeScales[scale]); gaussVertical (out, out, *pBuffer,W_L, H_L, DehazeScales[scale]); delete pBuffer; } +#ifdef __SSE2__ +#ifdef _OPENMP +#pragma omp parallel +{ + vfloat pondv = F2V(pond); +#pragma omp for +#endif + for ( int i=0; i < H_L; i++) { + int j; + for (j=0; j < W_L-3; j+=4) + { + _mm_storeu_ps(&dst[i][j], LVFU(dst[i][j]) + pondv * ( xlogf(LVFU(src[i][j])/LVFU(out[i][j])) )); + } + for (;j < W_L; j++) + { + dst[i][j] += pond * ( xlogf((src[i][j])/out[i][j]) ); + } + } +} +#else #ifdef _OPENMP #pragma omp parallel for #endif for ( int i=0; i < H_L; i++) for (int j=0; j < W_L; j++) { - dst[i][j] += pond * ( xlogf((in[i][j])/out[i][j]) ); + dst[i][j] += pond * ( xlogf((src[i][j])/out[i][j]) ); } +#endif } - for (int i = 0; i < H_L; i++) { - delete [] in[i]; - } - delete [] in; for (int i = 0; i < H_L; i++) { delete [] out[i]; } delete [] out; + for (int i = 0; i < H_L; i++) { + delete [] src[i]; + } + delete [] src; float beta=16384.0f; -float logBeta = xlogf(beta); +float logBetaGain = xlogf(beta) * gain; #ifdef _OPENMP #pragma omp parallel for @@ -208,9 +223,8 @@ float logBeta = xlogf(beta); for (int i=0; i< H_L; i++ ) for (int j=0; jL[i][j]=((100.f - strength)* lab->L[i][j] + strength * clipdehaz( cd, 0.f, 32768.f ))/100.f; + float cd = cdfactor * ( dst[i][j] - mini ) + offse; + lab->L[i][j]=((1.f - strength)* lab->L[i][j] + strength * clipdehaz( cd, 0.f, 32768.f )); } for (int i = 0; i < H_L; i++) { delete [] dst[i]; } delete [] dst; - for (int i = 0; i < H_L; i++) { - delete [] src[i]; - } - delete [] src; } } } - - \ No newline at end of file