diff --git a/rtengine/CMakeLists.txt b/rtengine/CMakeLists.txt index 231875f95..bf9ec442b 100644 --- a/rtengine/CMakeLists.txt +++ b/rtengine/CMakeLists.txt @@ -8,7 +8,7 @@ link_directories ("${PROJECT_SOURCE_DIR}/rtexif" ${EXTRA_LIBDIR} ${GTHREAD_LIBRA set (CAMCONSTSFILE "camconst.json") set (RTENGINESOURCEFILES safegtk.cc colortemp.cc curves.cc flatcurves.cc diagonalcurves.cc dcraw.cc iccstore.cc color.cc - dfmanager.cc ffmanager.cc rawimage.cc image8.cc image16.cc imagefloat.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc + dfmanager.cc ffmanager.cc gauss.cc rawimage.cc image8.cc image16.cc imagefloat.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc loadinitial.cc procparams.cc rawimagesource.cc demosaic_algos.cc shmap.cc simpleprocess.cc refreshmap.cc fast_demo.cc amaze_demosaic_RT.cc CA_correct_RT.cc cfa_linedn_RT.cc green_equil_RT.cc hilite_recon.cc expo_before_b.cc stdimagesource.cc myfile.cc iccjpeg.cc hlmultipliers.cc improccoordinator.cc editbuffer.cc coord.cc diff --git a/rtengine/PF_correct_RT.cc b/rtengine/PF_correct_RT.cc index c56ce168d..fce4a14c0 100644 --- a/rtengine/PF_correct_RT.cc +++ b/rtengine/PF_correct_RT.cc @@ -65,40 +65,12 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, #pragma omp parallel #endif { - gaussianBlur (src->a, tmp1->a, src->W, src->H, radius); - gaussianBlur (src->b, tmp1->b, src->W, src->H, radius); + gaussianBlur (src->a, tmp1->a, src->W, src->H, radius); + gaussianBlur (src->b, tmp1->b, src->W, src->H, radius); } float chromave = 0.0f; -#ifdef __SSE2__ - - if( chCurve ) { -// vectorized precalculation of the atan2 values -#ifdef _OPENMP - #pragma omp parallel -#endif - { - int j; -#ifdef _OPENMP - #pragma omp for -#endif - - for(int i = 0; i < height; i++ ) - { - for(j = 0; j < width - 3; j += 4) { - _mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(src->b[i][j]), LVFU(src->a[i][j]))); - } - - for(; j < width; j++) { - fringe[i * width + j] = xatan2f(src->b[i][j], src->a[i][j]); - } - } - } - } - -#endif - #ifdef _OPENMP #pragma omp parallel #endif @@ -109,6 +81,23 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, #endif for(int i = 0; i < height; i++ ) { +#ifdef __SSE2__ + + // vectorized per row precalculation of the atan2 values + if (chCurve) { + int k = 0; + + for(; k < width - 3; k += 4) { + STVFU(fringe[i * width + k], xatan2f(LVFU(src->b[i][k]), LVFU(src->a[i][k]))); + } + + for(; k < width; k++) { + fringe[i * width + k] = xatan2f(src->b[i][k], src->a[i][k]); + } + } + +#endif // __SSE2__ + for(int j = 0; j < width; j++) { if (chCurve) { #ifdef __SSE2__ @@ -144,19 +133,21 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, #pragma omp parallel #endif { - __m128 sumv = _mm_set1_ps( chromave ); - __m128 onev = _mm_set1_ps( 1.0f ); + __m128 sumv = F2V( chromave ); + __m128 onev = F2V( 1.0f ); #ifdef _OPENMP - #pragma omp for + #pragma omp for nowait #endif for(int j = 0; j < width * height - 3; j += 4) { - _mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv)); + STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv)); } - } - for(int j = width * height - (width * height) % 4; j < width * height; j++) { - fringe[j] = 1.f / (fringe[j] + chromave); + #pragma omp single + + for(int j = width * height - (width * height) % 4; j < width * height; j++) { + fringe[j] = 1.f / (fringe[j] + chromave); + } } #else @@ -191,8 +182,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, tmp1->b[i][j] = src->b[i][j]; //test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average - /*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/ - /*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/ if (fringe[i * width + j] < threshfactor) { float atot = 0.f; float btot = 0.f; @@ -218,8 +207,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, tmp1->b[i][j] = src->b[i][j]; //test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average - /*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/ - /*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/ if (fringe[i * width + j] < threshfactor) { float atot = 0.f; float btot = 0.f; @@ -245,8 +232,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, tmp1->b[i][j] = src->b[i][j]; //test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average - /*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/ - /*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/ if (fringe[i * width + j] < threshfactor) { float atot = 0.f; float btot = 0.f; @@ -355,7 +340,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds #ifdef __SSE2__ int j; vfloat2 sincosvalv; - __m128 piidv = _mm_set1_ps(piid); + __m128 piidv = F2V(piid); #endif // __SSE2__ #ifdef _OPENMP #pragma omp for @@ -366,8 +351,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds for (j = 0; j < width - 3; j += 4) { sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j])); - _mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y); - _mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x); + STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y); + STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x); } for (; j < width; j++) { @@ -392,8 +377,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds #pragma omp parallel #endif { - gaussianBlur (sraa, tmaa, src->W, src->H, radius); - gaussianBlur (srbb, tmbb, src->W, src->H, radius); + gaussianBlur (sraa, tmaa, src->W, src->H, radius); + gaussianBlur (srbb, tmbb, src->W, src->H, radius); } float chromave = 0.0f; @@ -414,7 +399,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds for(int i = 0; i < height; i++ ) { for(j = 0; j < width - 3; j += 4) { - _mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j]))); + STVFU(fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j]))); } for(; j < width; j++) { @@ -470,14 +455,14 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds #pragma omp parallel #endif { - __m128 sumv = _mm_set1_ps( chromave + eps2 ); - __m128 onev = _mm_set1_ps( 1.0f ); + __m128 sumv = F2V( chromave + eps2 ); + __m128 onev = F2V( 1.0f ); #ifdef _OPENMP #pragma omp for #endif for(int j = 0; j < width * height - 3; j += 4) { - _mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv)); + STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv)); } } @@ -599,7 +584,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds #ifdef __SSE2__ int j; __m128 interav, interbv; - __m128 piidv = _mm_set1_ps(piid); + __m128 piidv = F2V(piid); #endif #ifdef _OPENMP #pragma omp for @@ -609,11 +594,11 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds #ifdef __SSE2__ for(j = 0; j < width - 3; j += 4) { - _mm_storeu_ps( &dst->sh_p[i][j], LVFU(src->sh_p[i][j])); + STVFU(dst->sh_p[i][j], LVFU(src->sh_p[i][j])); interav = LVFU(tmaa[i][j]); interbv = LVFU(tmbb[i][j]); - _mm_storeu_ps(&dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv); - _mm_storeu_ps(&dst->C_p[i][j], _mm_sqrt_ps(SQRV(interbv) + SQRV(interav))); + STVFU(dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv); + STVFU(dst->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav))); } for(; j < width; j++) { @@ -730,7 +715,7 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d #ifdef __SSE2__ int j; vfloat2 sincosvalv; - __m128 piidv = _mm_set1_ps(piid); + __m128 piidv = F2V(piid); #endif // __SSE2__ #ifdef _OPENMP #pragma omp for @@ -741,8 +726,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d for (j = 0; j < width - 3; j += 4) { sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j])); - _mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y); - _mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x); + STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y); + STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x); } for (; j < width; j++) { @@ -769,12 +754,12 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d { //chroma a and b if(mode == 2) { //choice of gaussian blur - gaussianBlur (sraa, tmaa, src->W, src->H, radius); - gaussianBlur (srbb, tmbb, src->W, src->H, radius); + gaussianBlur (sraa, tmaa, src->W, src->H, radius); + gaussianBlur (srbb, tmbb, src->W, src->H, radius); } //luma sh_p - gaussianBlur (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts + gaussianBlur (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts } if(mode == 1) { //choice of median @@ -859,8 +844,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d int j; #ifdef __SSE2__ __m128 shfabsv, shmedv; - __m128 shthrv = _mm_set1_ps(shthr); - __m128 onev = _mm_set1_ps(1.0f); + __m128 shthrv = F2V(shthr); + __m128 onev = F2V(1.0f); #endif // __SSE2__ #ifdef _OPENMP #pragma omp for private(shfabs, shmed,i1,j1) @@ -883,14 +868,14 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d for (; j < width - 5; j += 4) { shfabsv = vabsf(LVFU(src->sh_p[i][j]) - LVFU(tmL[i][j])); - shmedv = _mm_setzero_ps(); + shmedv = ZEROV; for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 <= j + 2; j1++ ) { shmedv += vabsf(LVFU(src->sh_p[i1][j1]) - LVFU(tmL[i1][j1])); } - _mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps())); + STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV)); } for (; j < width - 2; j++) { @@ -1082,15 +1067,15 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d #endif { int j; - __m128 sumv = _mm_set1_ps( chrommed + eps2 ); - __m128 onev = _mm_set1_ps( 1.0f ); + __m128 sumv = F2V( chrommed + eps2 ); + __m128 onev = F2V( 1.0f ); #ifdef _OPENMP #pragma omp for #endif for(int i = 0; i < height; i++) { for(j = 0; j < width - 3; j += 4) { - _mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv)); + STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv)); } for(; j < width; j++) { @@ -1341,7 +1326,7 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d #ifdef __SSE2__ int j; // vfloat2 sincosvalv; -// __m128 piidv = _mm_set1_ps(piid); +// __m128 piidv = F2V(piid); #endif // __SSE2__ #ifdef _OPENMP #pragma omp for @@ -1351,8 +1336,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d #ifdef __SSE2__ for (j = 0; j < width - 3; j += 4) { - _mm_storeu_ps(&sraa[i][j], LVFU(src->a[i][j])); - _mm_storeu_ps(&srbb[i][j], LVFU(src->b[i][j])); + STVFU(sraa[i][j], LVFU(src->a[i][j])); + STVFU(srbb[i][j], LVFU(src->b[i][j])); } for (; j < width; j++) { @@ -1377,12 +1362,12 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d { //chroma a and b if(mode >= 2) { //choice of gaussian blur - gaussianBlur (sraa, tmaa, src->W, src->H, radius); - gaussianBlur (srbb, tmbb, src->W, src->H, radius); + gaussianBlur (sraa, tmaa, src->W, src->H, radius); + gaussianBlur (srbb, tmbb, src->W, src->H, radius); } //luma sh_p - gaussianBlur (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts + gaussianBlur (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts } if(mode == 1) { //choice of median @@ -1467,8 +1452,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d int j; #ifdef __SSE2__ __m128 shfabsv, shmedv; - __m128 shthrv = _mm_set1_ps(shthr); - __m128 onev = _mm_set1_ps(1.0f); + __m128 shthrv = F2V(shthr); + __m128 onev = F2V(1.0f); #endif // __SSE2__ #ifdef _OPENMP #pragma omp for private(shfabs, shmed,i1,j1) @@ -1491,14 +1476,14 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d for (; j < width - 5; j += 4) { shfabsv = vabsf(LVFU(src->L[i][j]) - LVFU(tmL[i][j])); - shmedv = _mm_setzero_ps(); + shmedv = ZEROV; for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 <= j + 2; j1++ ) { shmedv += vabsf(LVFU(src->L[i1][j1]) - LVFU(tmL[i1][j1])); } - _mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps())); + STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV)); } for (; j < width - 2; j++) { @@ -1690,15 +1675,15 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d #endif { int j; - __m128 sumv = _mm_set1_ps( chrommed + eps2 ); - __m128 onev = _mm_set1_ps( 1.0f ); + __m128 sumv = F2V( chrommed + eps2 ); + __m128 onev = F2V( 1.0f ); #ifdef _OPENMP #pragma omp for #endif for(int i = 0; i < height; i++) { for(j = 0; j < width - 3; j += 4) { - _mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv)); + STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv)); } for(; j < width; j++) { diff --git a/rtengine/gauss.cc b/rtengine/gauss.cc new file mode 100644 index 000000000..f8b4124d6 --- /dev/null +++ b/rtengine/gauss.cc @@ -0,0 +1,1301 @@ +/* + * This file is part of RawTherapee. + * + * Copyright (c) 2004-2010 Gabor Horvath + * + * RawTherapee is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RawTherapee is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RawTherapee. If not, see . + */ +#include "gauss.h" +#include +#include +#include "opthelper.h" +#include "boxblur.h" + +namespace +{ + +template void calculateYvVFactors( const T sigma, T &b1, T &b2, T &b3, T &B, T M[3][3]) +{ + // coefficient calculation + T q; + + if (sigma < 2.5) { + q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); + } else { + q = 0.98711 * sigma - 0.96330; + } + + T b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q; + b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q; + b2 = -1.4281 * q * q - 1.26661 * q * q * q; + b3 = 0.422205 * q * q * q; + B = 1.0 - (b1 + b2 + b3) / b0; + + b1 /= b0; + b2 /= b0; + b3 /= b0; + + // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering + M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2; + M[0][1] = (b3 + b1) * (b2 + b3 * b1); + M[0][2] = b3 * (b1 + b3 * b2); + M[1][0] = b1 + b3 * b2; + M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1); + M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3; + M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2; + M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3; + M[2][2] = b3 * (b1 + b3 * b2); + +} + +// classical filtering if the support window is small and src != dst +template void gauss3x3 (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1) +{ + + // first row +#ifdef _OPENMP + #pragma omp single nowait +#endif + { + dst[0][0] = src[0][0]; + + for (int j = 1; j < W - 1; j++) + { + dst[0][j] = b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j]; + } + + dst[0][W - 1] = src[0][W - 1]; + } + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 1; i < H - 1; i++) { + dst[i][0] = b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0]; + + for (int j = 1; j < W - 1; j++) { + dst[i][j] = c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j]; + } + + dst[i][W - 1] = b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1]; + } + + // last row +#ifdef _OPENMP + #pragma omp single +#endif + { + dst[H - 1][0] = src[H - 1][0]; + + for (int j = 1; j < W - 1; j++) { + dst[H - 1][j] = b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j]; + } + + dst[H - 1][W - 1] = src[H - 1][W - 1]; + } +} + +// classical filtering if the support window is small and src != dst +template void gauss3x3mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1) +{ + + // first row +#ifdef _OPENMP + #pragma omp single nowait +#endif + { + dst[0][0] *= src[0][0]; + + for (int j = 1; j < W - 1; j++) + { + dst[0][j] *= b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j]; + } + + dst[0][W - 1] *= src[0][W - 1]; + } + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 1; i < H - 1; i++) { + dst[i][0] *= b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0]; + + for (int j = 1; j < W - 1; j++) { + dst[i][j] *= c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j]; + } + + dst[i][W - 1] *= b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1]; + } + + // last row +#ifdef _OPENMP + #pragma omp single +#endif + { + dst[H - 1][0] *= src[H - 1][0]; + + for (int j = 1; j < W - 1; j++) { + dst[H - 1][j] *= b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j]; + } + + dst[H - 1][W - 1] *= src[H - 1][W - 1]; + } +} + +template void gauss3x3div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, const T c0, const T c1, const T c2, const T b0, const T b1) +{ + + // first row +#ifdef _OPENMP + #pragma omp single nowait +#endif + { + dst[0][0] = divBuffer[0][0] / (src[0][0] > 0.f ? src[0][0] : 1.f); + + for (int j = 1; j < W - 1; j++) + { + float tmp = (b1 * (src[0][j - 1] + src[0][j + 1]) + b0 * src[0][j]); + dst[0][j] = divBuffer[0][j] / (tmp > 0.f ? tmp : 1.f); + } + + dst[0][W - 1] = divBuffer[0][W - 1] / (src[0][W - 1] > 0.f ? src[0][W - 1] : 1.f); + } + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 1; i < H - 1; i++) { + float tmp = (b1 * (src[i - 1][0] + src[i + 1][0]) + b0 * src[i][0]); + dst[i][0] = divBuffer[i][0] / (tmp > 0.f ? tmp : 1.f); + + for (int j = 1; j < W - 1; j++) { + tmp = (c2 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c1 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c0 * src[i][j]); + dst[i][j] = divBuffer[i][j] / (tmp > 0.f ? tmp : 1.f); + } + + tmp = (b1 * (src[i - 1][W - 1] + src[i + 1][W - 1]) + b0 * src[i][W - 1]); + dst[i][W - 1] = divBuffer[i][W - 1] / (tmp > 0.f ? tmp : 1.f); + } + + // last row +#ifdef _OPENMP + #pragma omp single +#endif + { + dst[H - 1][0] = divBuffer[H - 1][0] / (src[H - 1][0] > 0.f ? src[H - 1][0] : 1.f); + + for (int j = 1; j < W - 1; j++) { + float tmp = (b1 * (src[H - 1][j - 1] + src[H - 1][j + 1]) + b0 * src[H - 1][j]); + dst[H - 1][j] = divBuffer[H - 1][W] / (tmp > 0.f ? tmp : 1.f); + } + + dst[H - 1][W - 1] = divBuffer[H - 1][W - 1] / (src[H - 1][W - 1] > 0.f ? src[H - 1][W - 1] : 1.f); + } +} + + +// use separated filter if the support window is small and src == dst +template void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1) +{ + T temp[W] ALIGNED16; +#ifdef _OPENMP + #pragma omp for +#endif + + for (int i = 0; i < H; i++) { + for (int j = 1; j < W - 1; j++) { + temp[j] = (T)(c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]); + } + + dst[i][0] = src[i][0]; + memcpy (dst[i] + 1, temp + 1, (W - 2)*sizeof(T)); + + dst[i][W - 1] = src[i][W - 1]; + } +} + +#ifdef __SSE2__ +template SSEFUNCTION void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1) +{ + vfloat Tv, Tm1v, Tp1v; + vfloat Tv1, Tm1v1, Tp1v1; + vfloat c0v, c1v; + c0v = F2V(c0); + c1v = F2V(c1); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + // process 8 columns per iteration for better usage of cpu cache + for (int i = 0; i < W - 7; i += 8) { + Tm1v = LVFU( src[0][i] ); + Tm1v1 = LVFU( src[0][i + 4] ); + STVFU( dst[0][i], Tm1v); + STVFU( dst[0][i + 4], Tm1v1); + + if (H > 1) { + Tv = LVFU( src[1][i]); + Tv1 = LVFU( src[1][i + 4]); + } + + for (int j = 1; j < H - 1; j++) { + Tp1v = LVFU( src[j + 1][i]); + Tp1v1 = LVFU( src[j + 1][i + 4]); + STVFU( dst[j][i], c1v * (Tp1v + Tm1v) + Tv * c0v); + STVFU( dst[j][i + 4], c1v * (Tp1v1 + Tm1v1) + Tv1 * c0v); + Tm1v = Tv; + Tm1v1 = Tv1; + Tv = Tp1v; + Tv1 = Tp1v1; + } + + STVFU( dst[H - 1][i], LVFU( src[H - 1][i])); + STVFU( dst[H - 1][i + 4], LVFU( src[H - 1][i + 4])); + } + +// Borders are done without SSE + float temp[H] ALIGNED16; +#ifdef _OPENMP + #pragma omp single +#endif + + for (int i = W - (W % 8); i < W; i++) { + for (int j = 1; j < H - 1; j++) { + temp[j] = c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]; + } + + dst[0][i] = src[0][i]; + + for (int j = 1; j < H - 1; j++) { + dst[j][i] = temp[j]; + } + + dst[H - 1][i] = src[H - 1][i]; + } +} +#else +template void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1) +{ + T temp[H] ALIGNED16; +#ifdef _OPENMP + #pragma omp for +#endif + + for (int i = 0; i < W; i++) { + for (int j = 1; j < H - 1; j++) { + temp[j] = (T)(c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]); + } + + dst[0][i] = src[0][i]; + + for (int j = 1; j < H - 1; j++) { + dst[j][i] = temp[j]; + } + + dst[H - 1][i] = src[H - 1][i]; + } +} +#endif + +#ifdef __SSE2__ +// fast gaussian approximation if the support window is large +template SSEFUNCTION void gaussHorizontalSse (T** src, T** dst, const int W, const int H, const float sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); + } + + vfloat Rv; + vfloat Tv, Tm2v, Tm3v; + vfloat Bv, b1v, b2v, b3v; + vfloat temp2W, temp2Wp1; + float tmp[W][4] ALIGNED16; + Bv = F2V(B); + b1v = F2V(b1); + b2v = F2V(b2); + b3v = F2V(b3); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 0; i < H - 3; i += 4) { + Tv = _mm_set_ps(src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0]); + Tm3v = Tv * (Bv + b1v + b2v + b3v); + STVF( tmp[0][0], Tm3v ); + + Tm2v = _mm_set_ps(src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1]) * Bv + Tm3v * b1v + Tv * (b2v + b3v); + STVF( tmp[1][0], Tm2v ); + + Rv = _mm_set_ps(src[i][2], src[i + 1][2], src[i + 2][2], src[i + 3][2]) * Bv + Tm2v * b1v + Tm3v * b2v + Tv * b3v; + STVF( tmp[2][0], Rv ); + + for (int j = 3; j < W; j++) { + Tv = Rv; + Rv = _mm_set_ps(src[i][j], src[i + 1][j], src[i + 2][j], src[i + 3][j]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + STVF( tmp[j][0], Rv ); + Tm3v = Tm2v; + Tm2v = Tv; + } + + Tv = _mm_set_ps(src[i][W - 1], src[i + 1][W - 1], src[i + 2][W - 1], src[i + 3][W - 1]); + + temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * ( Tm2v - Tv ) + F2V(M[2][2]) * (Tm3v - Tv); + temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); + + Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); + STVF(tmp[W - 1][0], Rv); + + Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; + STVF(tmp[W - 2][0], Tm2v); + + Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; + STVF(tmp[W - 3][0], Tm3v); + + Tv = Rv; + Rv = Tm3v; + Tm3v = Tv; + + for (int j = W - 4; j >= 0; j--) { + Tv = Rv; + Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + STVF(tmp[j][0], Rv); + Tm3v = Tm2v; + Tm2v = Tv; + } + + for (int j = 0; j < W; j++) { + dst[i + 3][j] = tmp[j][0]; + dst[i + 2][j] = tmp[j][1]; + dst[i + 1][j] = tmp[j][2]; + dst[i + 0][j] = tmp[j][3]; + } + + + } + +// Borders are done without SSE +#ifdef _OPENMP + #pragma omp single +#endif + + for (int i = H - (H % 4); i < H; i++) { + tmp[0][0] = src[i][0] * (B + b1 + b2 + b3); + tmp[1][0] = B * src[i][1] + b1 * tmp[0][0] + src[i][0] * (b2 + b3); + tmp[2][0] = B * src[i][2] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[i][0]; + + for (int j = 3; j < W; j++) { + tmp[j][0] = B * src[i][j] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; + } + + float temp2Wm1 = src[i][W - 1] + M[0][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[0][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[0][2] * (tmp[W - 3][0] - src[i][W - 1]); + float temp2W = src[i][W - 1] + M[1][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[1][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[1][2] * (tmp[W - 3][0] - src[i][W - 1]); + float temp2Wp1 = src[i][W - 1] + M[2][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[2][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[2][2] * (tmp[W - 3][0] - src[i][W - 1]); + + tmp[W - 1][0] = temp2Wm1; + tmp[W - 2][0] = B * tmp[W - 2][0] + b1 * tmp[W - 1][0] + b2 * temp2W + b3 * temp2Wp1; + tmp[W - 3][0] = B * tmp[W - 3][0] + b1 * tmp[W - 2][0] + b2 * tmp[W - 1][0] + b3 * temp2W; + + for (int j = W - 4; j >= 0; j--) { + tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; + } + + for (int j = 0; j < W; j++) { + dst[i][j] = tmp[j][0]; + } + } +} +#endif + +// fast gaussian approximation if the support window is large +template void gaussHorizontal (T** src, T** dst, const int W, const int H, const double sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); + } + + double temp2[W] ALIGNED16; + +#ifdef _OPENMP + #pragma omp for +#endif + + for (int i = 0; i < H; i++) { + + temp2[0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0]; + temp2[1] = B * src[i][1] + b1 * temp2[0] + b2 * src[i][0] + b3 * src[i][0]; + temp2[2] = B * src[i][2] + b1 * temp2[1] + b2 * temp2[0] + b3 * src[i][0]; + + for (int j = 3; j < W; j++) { + temp2[j] = B * src[i][j] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3]; + } + + double temp2Wm1 = src[i][W - 1] + M[0][0] * (temp2[W - 1] - src[i][W - 1]) + M[0][1] * (temp2[W - 2] - src[i][W - 1]) + M[0][2] * (temp2[W - 3] - src[i][W - 1]); + double temp2W = src[i][W - 1] + M[1][0] * (temp2[W - 1] - src[i][W - 1]) + M[1][1] * (temp2[W - 2] - src[i][W - 1]) + M[1][2] * (temp2[W - 3] - src[i][W - 1]); + double temp2Wp1 = src[i][W - 1] + M[2][0] * (temp2[W - 1] - src[i][W - 1]) + M[2][1] * (temp2[W - 2] - src[i][W - 1]) + M[2][2] * (temp2[W - 3] - src[i][W - 1]); + + temp2[W - 1] = temp2Wm1; + temp2[W - 2] = B * temp2[W - 2] + b1 * temp2[W - 1] + b2 * temp2W + b3 * temp2Wp1; + temp2[W - 3] = B * temp2[W - 3] + b1 * temp2[W - 2] + b2 * temp2[W - 1] + b3 * temp2W; + + for (int j = W - 4; j >= 0; j--) { + temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3]; + } + + for (int j = 0; j < W; j++) { + dst[i][j] = (T)temp2[j]; + } + + } +} + +#ifdef __SSE2__ +template SSEFUNCTION void gaussVerticalSse (T** src, T** dst, const int W, const int H, const float sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); + } + + float tmp[H][8] ALIGNED16; + vfloat Rv; + vfloat Tv, Tm2v, Tm3v; + vfloat Rv1; + vfloat Tv1, Tm2v1, Tm3v1; + vfloat Bv, b1v, b2v, b3v; + vfloat temp2W, temp2Wp1; + vfloat temp2W1, temp2Wp11; + Bv = F2V(B); + b1v = F2V(b1); + b2v = F2V(b2); + b3v = F2V(b3); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + // process 8 columns per iteration for better usage of cpu cache + for (int i = 0; i < W - 7; i += 8) { + Tv = LVFU( src[0][i]); + Tv1 = LVFU( src[0][i + 4]); + Rv = Tv * (Bv + b1v + b2v + b3v); + Rv1 = Tv1 * (Bv + b1v + b2v + b3v); + Tm3v = Rv; + Tm3v1 = Rv1; + STVF( tmp[0][0], Rv ); + STVF( tmp[0][4], Rv1 ); + + Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v); + Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v); + Tm2v = Rv; + Tm2v1 = Rv1; + STVF( tmp[1][0], Rv ); + STVF( tmp[1][4], Rv1 ); + + Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v; + Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v; + STVF( tmp[2][0], Rv ); + STVF( tmp[2][4], Rv1 ); + + for (int j = 3; j < H; j++) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVF( tmp[j][0], Rv ); + STVF( tmp[j][4], Rv1 ); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + + Tv = LVFU(src[H - 1][i]); + Tv1 = LVFU(src[H - 1][i + 4]); + + temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv); + temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1); + temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); + temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1); + + Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); + Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1); + STVFU( dst[H - 1][i], Rv ); + STVFU( dst[H - 1][i + 4], Rv1 ); + + Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; + Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11; + STVFU( dst[H - 2][i], Tm2v ); + STVFU( dst[H - 2][i + 4], Tm2v1 ); + + Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; + Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1; + STVFU( dst[H - 3][i], Tm3v ); + STVFU( dst[H - 3][i + 4], Tm3v1 ); + + Tv = Rv; + Tv1 = Rv1; + Rv = Tm3v; + Rv1 = Tm3v1; + Tm3v = Tv; + Tm3v1 = Tv1; + + for (int j = H - 4; j >= 0; j--) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVFU( dst[j][i], Rv ); + STVFU( dst[j][i + 4], Rv1 ); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + } + +// Borders are done without SSE +#ifdef _OPENMP + #pragma omp single +#endif + + for (int i = W - (W % 8); i < W; i++) { + tmp[0][0] = src[0][i] * (B + b1 + b2 + b3); + tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3); + tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; + } + + float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]); + + tmp[H - 1][0] = temp2Hm1; + tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; + tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H; + + for (int j = H - 4; j >= 0; j--) { + tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; + } + + for (int j = 0; j < H; j++) { + dst[j][i] = tmp[j][0]; + } + + } +} +#endif + +#ifdef __SSE2__ +template SSEFUNCTION void gaussVerticalSsemult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, const float sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); + } + + float tmp[H][8] ALIGNED16; + vfloat Rv; + vfloat Tv, Tm2v, Tm3v; + vfloat Rv1; + vfloat Tv1, Tm2v1, Tm3v1; + vfloat Bv, b1v, b2v, b3v; + vfloat temp2W, temp2Wp1; + vfloat temp2W1, temp2Wp11; + Bv = F2V(B); + b1v = F2V(b1); + b2v = F2V(b2); + b3v = F2V(b3); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + // process 8 columns per iteration for better usage of cpu cache + for (int i = 0; i < W - 7; i += 8) { + Tv = LVFU( src[0][i]); + Tv1 = LVFU( src[0][i + 4]); + Rv = Tv * (Bv + b1v + b2v + b3v); + Rv1 = Tv1 * (Bv + b1v + b2v + b3v); + Tm3v = Rv; + Tm3v1 = Rv1; + STVF( tmp[0][0], Rv ); + STVF( tmp[0][4], Rv1 ); + + Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v); + Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v); + Tm2v = Rv; + Tm2v1 = Rv1; + STVF( tmp[1][0], Rv ); + STVF( tmp[1][4], Rv1 ); + + Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v; + Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v; + STVF( tmp[2][0], Rv ); + STVF( tmp[2][4], Rv1 ); + + for (int j = 3; j < H; j++) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVF( tmp[j][0], Rv ); + STVF( tmp[j][4], Rv1 ); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + + Tv = LVFU(src[H - 1][i]); + Tv1 = LVFU(src[H - 1][i + 4]); + + temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv); + temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1); + temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); + temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1); + + Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); + Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1); + STVFU( dst[H - 1][i], LVFU(dst[H - 1][i]) * Rv ); + STVFU( dst[H - 1][i + 4], LVFU(dst[H - 1][i + 4]) * Rv1 ); + + Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; + Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11; + STVFU( dst[H - 2][i], LVFU(dst[H - 2][i]) * Tm2v ); + STVFU( dst[H - 2][i + 4], LVFU(dst[H - 2][i + 4]) * Tm2v1 ); + + Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; + Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1; + STVFU( dst[H - 3][i], LVFU(dst[H - 3][i]) * Tm3v ); + STVFU( dst[H - 3][i + 4], LVFU(dst[H - 3][i + 4]) * Tm3v1 ); + + Tv = Rv; + Tv1 = Rv1; + Rv = Tm3v; + Rv1 = Tm3v1; + Tm3v = Tv; + Tm3v1 = Tv1; + + for (int j = H - 4; j >= 0; j--) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVFU( dst[j][i], LVFU(dst[j][i]) * Rv ); + STVFU( dst[j][i + 4], LVFU(dst[j][i + 4]) * Rv1 ); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + } + +// Borders are done without SSE +#ifdef _OPENMP + #pragma omp single +#endif + + for (int i = W - (W % 8); i < W; i++) { + tmp[0][0] = src[0][i] * (B + b1 + b2 + b3); + tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3); + tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; + } + + float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]); + + tmp[H - 1][0] = temp2Hm1; + tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; + tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H; + + for (int j = H - 4; j >= 0; j--) { + tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; + } + + for (int j = 0; j < H; j++) { + dst[j][i] *= tmp[j][0]; + } + + } +} + +template SSEFUNCTION void gaussVerticalSsediv (T** RESTRICT src, T** RESTRICT dst, T** divBuffer, const int W, const int H, const float sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); + } + + float tmp[H][8] ALIGNED16; + vfloat Rv; + vfloat Tv, Tm2v, Tm3v; + vfloat Rv1; + vfloat Tv1, Tm2v1, Tm3v1; + vfloat Bv, b1v, b2v, b3v; + vfloat temp2W, temp2Wp1; + vfloat temp2W1, temp2Wp11; + vfloat onev = F2V(1.f); + Bv = F2V(B); + b1v = F2V(b1); + b2v = F2V(b2); + b3v = F2V(b3); + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + // process 8 columns per iteration for better usage of cpu cache + for (int i = 0; i < W - 7; i += 8) { + Tv = LVFU( src[0][i]); + Tv1 = LVFU( src[0][i + 4]); + Rv = Tv * (Bv + b1v + b2v + b3v); + Rv1 = Tv1 * (Bv + b1v + b2v + b3v); + Tm3v = Rv; + Tm3v1 = Rv1; + STVF( tmp[0][0], Rv ); + STVF( tmp[0][4], Rv1 ); + + Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v); + Rv1 = LVFU(src[1][i + 4]) * Bv + Rv1 * b1v + Tv1 * (b2v + b3v); + Tm2v = Rv; + Tm2v1 = Rv1; + STVF( tmp[1][0], Rv ); + STVF( tmp[1][4], Rv1 ); + + Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v; + Rv1 = LVFU(src[2][i + 4]) * Bv + Rv1 * b1v + Tm3v1 * b2v + Tv1 * b3v; + STVF( tmp[2][0], Rv ); + STVF( tmp[2][4], Rv1 ); + + for (int j = 3; j < H; j++) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVFU(src[j][i + 4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVF( tmp[j][0], Rv ); + STVF( tmp[j][4], Rv1 ); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + + Tv = LVFU(src[H - 1][i]); + Tv1 = LVFU(src[H - 1][i + 4]); + + temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv); + temp2Wp11 = Tv1 + F2V(M[2][0]) * (Rv1 - Tv1) + F2V(M[2][1]) * (Tm2v1 - Tv1) + F2V(M[2][2]) * (Tm3v1 - Tv1); + temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); + temp2W1 = Tv1 + F2V(M[1][0]) * (Rv1 - Tv1) + F2V(M[1][1]) * (Tm2v1 - Tv1) + F2V(M[1][2]) * (Tm3v1 - Tv1); + + Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); + Rv1 = Tv1 + F2V(M[0][0]) * (Rv1 - Tv1) + F2V(M[0][1]) * (Tm2v1 - Tv1) + F2V(M[0][2]) * (Tm3v1 - Tv1); + + STVFU( dst[H - 1][i], LVFU(divBuffer[H - 1][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev)); + STVFU( dst[H - 1][i + 4], LVFU(divBuffer[H - 1][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev)); + + Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; + Tm2v1 = Bv * Tm2v1 + b1v * Rv1 + b2v * temp2W1 + b3v * temp2Wp11; + STVFU( dst[H - 2][i], LVFU(divBuffer[H - 2][i]) / vself(vmaskf_gt(Tm2v, ZEROV), Tm2v, onev)); + STVFU( dst[H - 2][i + 4], LVFU(divBuffer[H - 2][i + 4]) / vself(vmaskf_gt(Tm2v1, ZEROV), Tm2v1, onev)); + + Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; + Tm3v1 = Bv * Tm3v1 + b1v * Tm2v1 + b2v * Rv1 + b3v * temp2W1; + STVFU( dst[H - 3][i], LVFU(divBuffer[H - 3][i]) / vself(vmaskf_gt(Tm3v, ZEROV), Tm3v, onev)); + STVFU( dst[H - 3][i + 4], LVFU(divBuffer[H - 3][i + 4]) / vself(vmaskf_gt(Tm3v1, ZEROV), Tm3v1, onev)); + + Tv = Rv; + Tv1 = Rv1; + Rv = Tm3v; + Rv1 = Tm3v1; + Tm3v = Tv; + Tm3v1 = Tv1; + + for (int j = H - 4; j >= 0; j--) { + Tv = Rv; + Tv1 = Rv1; + Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; + Rv1 = LVF(tmp[j][4]) * Bv + Tv1 * b1v + Tm2v1 * b2v + Tm3v1 * b3v; + STVFU( dst[j][i], LVFU(divBuffer[j][i]) / vself(vmaskf_gt(Rv, ZEROV), Rv, onev)); + STVFU( dst[j][i + 4], LVFU(divBuffer[j][i + 4]) / vself(vmaskf_gt(Rv1, ZEROV), Rv1, onev)); + Tm3v = Tm2v; + Tm3v1 = Tm2v1; + Tm2v = Tv; + Tm2v1 = Tv1; + } + } + +// Borders are done without SSE +#ifdef _OPENMP + #pragma omp single +#endif + + for (int i = W - (W % 8); i < W; i++) { + tmp[0][0] = src[0][i] * (B + b1 + b2 + b3); + tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + src[0][i] * (b2 + b3); + tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; + } + + float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]); + float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]); + + tmp[H - 1][0] = temp2Hm1; + tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; + tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H; + + for (int j = H - 4; j >= 0; j--) { + tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; + } + + for (int j = 0; j < H; j++) { + dst[j][i] = divBuffer[j][i] / (tmp[j][0] > 0.f ? tmp[j][0] : 1.f); + } + + } +} + +#endif + +template void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); + } + + // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) + static const int numcols = 8; + double temp2[H][numcols] ALIGNED16; + double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 0; i < W - numcols + 1; i += numcols) { + for (int k = 0; k < numcols; k++) { + temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k]; + } + + for (int j = 3; j < H; j++) { + for (int k = 0; k < numcols; k++) { + temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k]; + } + } + + for (int k = 0; k < numcols; k++) { + temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + } + + for (int k = 0; k < numcols; k++) { + dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k]; + dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]; + dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]; + } + + for (int j = H - 4; j >= 0; j--) { + for (int k = 0; k < numcols; k++) { + dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]; + } + } + } + +#ifdef _OPENMP + #pragma omp single +#endif + + // process remaining columns + for (int i = W - (W % numcols); i < W; i++) { + temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; + temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i]; + temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0]; + } + + double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]); + + dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1; + dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; + dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H; + + for (int j = H - 4; j >= 0; j--) { + dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]; + } + } +} + +#ifndef __SSE2__ +template void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); + } + + // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) + static const int numcols = 8; + double temp2[H][numcols] ALIGNED16; + double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 0; i < W - numcols + 1; i += numcols) { + for (int k = 0; k < numcols; k++) { + temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k]; + } + + for (int j = 3; j < H; j++) { + for (int k = 0; k < numcols; k++) { + temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k]; + } + } + + for (int k = 0; k < numcols; k++) { + temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + } + + for (int k = 0; k < numcols; k++) { + dst[H - 1][i + k] = divBuffer[H - 1][i + k] / (temp2[H - 1][k] = temp2Hm1[k]); + dst[H - 2][i + k] = divBuffer[H - 2][i + k] / (temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]); + dst[H - 3][i + k] = divBuffer[H - 3][i + k] / (temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]); + } + + for (int j = H - 4; j >= 0; j--) { + for (int k = 0; k < numcols; k++) { + dst[j][i + k] = divBuffer[j][i + k] / (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]); + } + } + } + +#ifdef _OPENMP + #pragma omp single +#endif + + // process remaining columns + for (int i = W - (W % numcols); i < W; i++) { + temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; + temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i]; + temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0]; + } + + double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]); + + dst[H - 1][i] = divBuffer[H - 1][i] / (temp2[H - 1][0] = temp2Hm1); + dst[H - 2][i] = divBuffer[H - 2][i] / (temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1); + dst[H - 3][i] = divBuffer[H - 3][i] / (temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H); + + for (int j = H - 4; j >= 0; j--) { + dst[j][i] = divBuffer[j][i] / (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]); + } + } +} + +template void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma) +{ + double b1, b2, b3, B, M[3][3]; + calculateYvVFactors(sigma, b1, b2, b3, B, M); + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); + } + + // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) + static const int numcols = 8; + double temp2[H][numcols] ALIGNED16; + double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int i = 0; i < W - numcols + 1; i += numcols) { + for (int k = 0; k < numcols; k++) { + temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k]; + temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k]; + } + + for (int j = 3; j < H; j++) { + for (int k = 0; k < numcols; k++) { + temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k]; + } + } + + for (int k = 0; k < numcols; k++) { + temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]); + } + + for (int k = 0; k < numcols; k++) { + dst[H - 1][i + k] *= temp2[H - 1][k] = temp2Hm1[k]; + dst[H - 2][i + k] *= temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]; + dst[H - 3][i + k] *= temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]; + } + + for (int j = H - 4; j >= 0; j--) { + for (int k = 0; k < numcols; k++) { + dst[j][i + k] *= (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]); + } + } + } + +#ifdef _OPENMP + #pragma omp single +#endif + + // process remaining columns + for (int i = W - (W % numcols); i < W; i++) { + temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; + temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i]; + temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i]; + + for (int j = 3; j < H; j++) { + temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0]; + } + + double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]); + double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]); + + dst[H - 1][i] *= temp2[H - 1][0] = temp2Hm1; + dst[H - 2][i] *= temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; + dst[H - 3][i] *= temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H; + + for (int j = H - 4; j >= 0; j--) { + dst[j][i] *= (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]); + } + } +} +#endif + +template void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr) +{ + static constexpr auto GAUSS_SKIP = 0.25; + static constexpr auto GAUSS_3X3_LIMIT = 0.6; + static constexpr auto GAUSS_DOUBLE = 70.0; + + if(buffer) { + // special variant for very large sigma, currently only used by retinex algorithm + // use iterated boxblur to approximate gaussian blur + // Compute ideal averaging filter width and number of iterations + int n = 1; + double wIdeal = sqrt((12 * sigma * sigma) + 1); + + while(wIdeal > W || wIdeal > H) { + n++; + wIdeal = sqrt((12 * sigma * sigma / n) + 1); + } + + if(n < 3) { + n = 3; + wIdeal = sqrt((12 * sigma * sigma / n) + 1); + } else if(n > 6) { + n = 6; + } + + int wl = wIdeal; + + if(wl % 2 == 0) { + wl--; + } + + int wu = wl + 2; + + double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4); + int m = round(mIdeal); + + int sizes[n]; + + for(int i = 0; i < n; i++) { + sizes[i] = ((i < m ? wl : wu) - 1) / 2; + } + + rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H); + + for(int i = 1; i < n; i++) { + rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H); + } + } else { + if (sigma < GAUSS_SKIP) { + // don't perform filtering + if (src != dst) { + memcpy (dst[0], src[0], W * H * sizeof(T)); + } + } else if (sigma < GAUSS_3X3_LIMIT) { + if(src != dst) { + // If src != dst we can take the fast way + // compute 3x3 kernel values + double c0 = 1.0; + double c1 = exp( -0.5 * (rtengine::SQR(1.0 / sigma)) ); + double c2 = exp( -rtengine::SQR(1.0 / sigma) ); + + // normalize kernel values + double sum = c0 + 4.0 * (c1 + c2); + c0 /= sum; + c1 /= sum; + c2 /= sum; + // compute kernel values for border pixels + double b1 = exp (-1.0 / (2.0 * sigma * sigma)); + double bsum = 2.0 * b1 + 1.0; + b1 /= bsum; + double b0 = 1.0 / bsum; + + switch (gausstype) { + case GAUSS_MULT : + gauss3x3mult (src, dst, W, H, c0, c1, c2, b0, b1); + break; + + case GAUSS_DIV : + gauss3x3div (src, dst, buffer2, W, H, c0, c1, c2, b0, b1); + break; + + case GAUSS_STANDARD : + gauss3x3 (src, dst, W, H, c0, c1, c2, b0, b1); + break; + } + } else { + // compute kernel values for separated 3x3 gaussian blur + double c1 = exp (-1.0 / (2.0 * sigma * sigma)); + double csum = 2.0 * c1 + 1.0; + c1 /= csum; + double c0 = 1.0 / csum; + gaussHorizontal3 (src, dst, W, H, c0, c1); + gaussVertical3 (dst, dst, W, H, c0, c1); + } + } else { +#ifdef __SSE2__ + + if (sigma < GAUSS_DOUBLE) { + switch (gausstype) { + case GAUSS_MULT : { + gaussHorizontalSse (src, src, W, H, sigma); + gaussVerticalSsemult (src, dst, W, H, sigma); + break; + } + + case GAUSS_DIV : { + gaussHorizontalSse (src, dst, W, H, sigma); + gaussVerticalSsediv (dst, dst, buffer2, W, H, sigma); + break; + } + + case GAUSS_STANDARD : { + gaussHorizontalSse (src, dst, W, H, sigma); + gaussVerticalSse (dst, dst, W, H, sigma); + break; + } + } + } else { // large sigma only with double precision + gaussHorizontal (src, dst, W, H, sigma); + gaussVertical (dst, dst, W, H, sigma); + } + +#else + + if (sigma < GAUSS_DOUBLE) { + switch (gausstype) { + case GAUSS_MULT : { + gaussHorizontal (src, src, W, H, sigma); + gaussVerticalmult (src, dst, W, H, sigma); + break; + } + + case GAUSS_DIV : { + gaussHorizontal (src, dst, W, H, sigma); + gaussVerticaldiv (dst, dst, buffer2, W, H, sigma); + break; + } + + case GAUSS_STANDARD : { + gaussHorizontal (src, dst, W, H, sigma); + gaussVertical (dst, dst, W, H, sigma); + break; + } + } + } else { // large sigma only with double precision + gaussHorizontal (src, dst, W, H, sigma); + gaussVertical (dst, dst, W, H, sigma); + } + +#endif + } + } +} +} + +void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer, eGaussType gausstype, float** buffer2) +{ + gaussianBlurImpl(src, dst, W, H, sigma, buffer, gausstype, buffer2); +} + diff --git a/rtengine/gauss.h b/rtengine/gauss.h index 4136e6f6d..72f115cc4 100644 --- a/rtengine/gauss.h +++ b/rtengine/gauss.h @@ -19,797 +19,8 @@ #ifndef _GAUSS_H_ #define _GAUSS_H_ -#include -#include -#include -#include "opthelper.h" -#include "stdio.h" -#include "boxblur.h" -// classical filtering if the support window is small: +enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV}; -template void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1) -{ +void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr); -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) { - T temp[W] ALIGNED16; - - for (int j = 1; j < W - 1; j++) { - temp[j] = (T)(c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]); - } - - dst[i][0] = src[i][0]; - memcpy (dst[i] + 1, temp + 1, (W - 2)*sizeof(T)); - - dst[i][W - 1] = src[i][W - 1]; - } -} - -template void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1) -{ - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < W; i++) { - T temp[H] ALIGNED16; - - for (int j = 1; j < H - 1; j++) { - temp[j] = (T)(c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]); - } - - dst[0][i] = src[0][i]; - - for (int j = 1; j < H - 1; j++) { - dst[j][i] = temp[j]; - } - - dst[H - 1][i] = src[H - 1][i]; - } -} - -#ifdef __SSE2__ -template SSEFUNCTION void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) -{ - vfloat Tv, Tm1v, Tp1v; - vfloat c0v, c1v; - c0v = F2V(c0); - c1v = F2V(c1); -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < W - 3; i += 4) { - Tm1v = LVFU( src[0][i] ); - STVFU( dst[0][i], Tm1v); - - if (H > 1) { - Tv = LVFU( src[1][i]); - } - - for (int j = 1; j < H - 1; j++) { - Tp1v = LVFU( src[j + 1][i]); - STVFU( dst[j][i], c1v * (Tp1v + Tm1v) + Tv * c0v); - Tm1v = Tv; - Tv = Tp1v; - } - - STVFU( dst[H - 1][i], LVFU( src[H - 1][i])); - } - -// Borders are done without SSE -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = W - (W % 4); i < W; i++) { - dst[0][i] = src[0][i]; - - for (int j = 1; j < H - 1; j++) { - dst[j][i] = c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]; - } - - dst[H - 1][i] = src[H - 1][i]; - } -} - - -template SSEFUNCTION void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) -{ - float tmp[W][4] ALIGNED16; - - vfloat Tv, Tm1v, Tp1v; - vfloat c0v, c1v; - c0v = F2V(c0); - c1v = F2V(c1); -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H - 3; i += 4) { - dst[i][0] = src[i][0]; - dst[i + 1][0] = src[i + 1][0]; - dst[i + 2][0] = src[i + 2][0]; - dst[i + 3][0] = src[i + 3][0]; - Tm1v = _mm_set_ps( src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0] ); - - if (W > 1) { - Tv = _mm_set_ps( src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1] ); - } - - for (int j = 1; j < W - 1; j++) { - Tp1v = _mm_set_ps( src[i][j + 1], src[i + 1][j + 1], src[i + 2][j + 1], src[i + 3][j + 1] ); - STVF( tmp[j][0], c1v * (Tp1v + Tm1v) + Tv * c0v); - Tm1v = Tv; - Tv = Tp1v; - } - - for (int j = 1; j < W - 1; j++) { - dst[i + 3][j] = tmp[j][0]; - dst[i + 2][j] = tmp[j][1]; - dst[i + 1][j] = tmp[j][2]; - dst[i][j] = tmp[j][3]; - } - - dst[i][W - 1] = src[i][W - 1]; - dst[i + 1][W - 1] = src[i + 1][W - 1]; - dst[i + 2][W - 1] = src[i + 2][W - 1]; - dst[i + 3][W - 1] = src[i + 3][W - 1]; - } - -// Borders are done without SSE -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = H - (H % 4); i < H; i++) { - dst[i][0] = src[i][0]; - - for (int j = 1; j < W - 1; j++) { - dst[i][j] = c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]; - } - - dst[i][W - 1] = src[i][W - 1]; - } -} - - - -// fast gaussian approximation if the support window is large -template SSEFUNCTION void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma) -{ - - if (sigma < 0.25) { - // dont perform filtering - if (src != dst) -#ifdef _OPENMP - #pragma omp for -#endif - for (int i = 0; i < H; i++) { - memcpy (dst[i], src[i], W * sizeof(T)); - } - - return; - } - - if (sigma < 0.6) { - // compute 3x3 kernel - float c1 = exp (-1.0 / (2.0 * sigma * sigma)); - float csum = 2.0 * c1 + 1.0; - c1 /= csum; - float c0 = 1.0 / csum; - gaussHorizontal3Sse (src, dst, W, H, c0, c1); - return; - } - - // coefficient calculation - float q = 0.98711 * sigma - 0.96330; - - if (sigma < 2.5) { - q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); - } - - float b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q; - float b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q; - float b2 = -1.4281 * q * q - 1.26661 * q * q * q; - float b3 = 0.422205 * q * q * q; - float B = 1.0 - (b1 + b2 + b3) / b0; - - b1 /= b0; - b2 /= b0; - b3 /= b0; - - // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering - float M[3][3]; - M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2; - M[0][1] = (b3 + b1) * (b2 + b3 * b1); - M[0][2] = b3 * (b1 + b3 * b2); - M[1][0] = b1 + b3 * b2; - M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1); - M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3; - M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2; - M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3; - M[2][2] = b3 * (b1 + b3 * b2); - - for (int i = 0; i < 3; i++) - for (int j = 0; j < 3; j++) { - M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); - M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); - } - - vfloat Rv; - vfloat Tv, Tm2v, Tm3v; - vfloat Bv, b1v, b2v, b3v; - vfloat temp2W, temp2Wp1; - float tmp[W][4] ALIGNED16; - float tmpV[4] ALIGNED16; - Bv = F2V(B); - b1v = F2V(b1); - b2v = F2V(b2); - b3v = F2V(b3); - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H - 3; i += 4) { - tmpV[0] = src[i + 3][0]; - tmpV[1] = src[i + 2][0]; - tmpV[2] = src[i + 1][0]; - tmpV[3] = src[i][0]; - Tv = LVF(tmpV[0]); - Rv = Tv * (Bv + b1v + b2v + b3v); - Tm3v = Rv; - STVF( tmp[0][0], Rv ); - - tmpV[0] = src[i + 3][1]; - tmpV[1] = src[i + 2][1]; - tmpV[2] = src[i + 1][1]; - tmpV[3] = src[i][1]; - Rv = LVF(tmpV[0]) * Bv + Rv * b1v + Tv * (b2v + b3v); - Tm2v = Rv; - STVF( tmp[1][0], Rv ); - - tmpV[0] = src[i + 3][2]; - tmpV[1] = src[i + 2][2]; - tmpV[2] = src[i + 1][2]; - tmpV[3] = src[i][2]; - Rv = LVF(tmpV[0]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v; - STVF( tmp[2][0], Rv ); - - for (int j = 3; j < W; j++) { - Tv = Rv; - Rv = _mm_set_ps(src[i][j], src[i + 1][j], src[i + 2][j], src[i + 3][j]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; - STVF( tmp[j][0], Rv ); - Tm3v = Tm2v; - Tm2v = Tv; - } - - Tv = _mm_set_ps(src[i][W - 1], src[i + 1][W - 1], src[i + 2][W - 1], src[i + 3][W - 1]); - - temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * ( Tm2v - Tv ) + F2V(M[2][2]) * (Tm3v - Tv); - temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); - - Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); - STVF( tmp[W - 1][0], Rv ); - - Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; - STVF( tmp[W - 2][0], Tm2v ); - - Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; - STVF( tmp[W - 3][0], Tm3v ); - - Tv = Rv; - Rv = Tm3v; - Tm3v = Tv; - - for (int j = W - 4; j >= 0; j--) { - Tv = Rv; - Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; - STVF( tmp[j][0], Rv ); - Tm3v = Tm2v; - Tm2v = Tv; - } - - for (int j = 0; j < W; j++) { - dst[i + 3][j] = tmp[j][0]; - dst[i + 2][j] = tmp[j][1]; - dst[i + 1][j] = tmp[j][2]; - dst[i][j] = tmp[j][3]; - } - - - } - -// Borders are done without SSE -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = H - (H % 4); i < H; i++) { - tmp[0][0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0]; - tmp[1][0] = B * src[i][1] + b1 * tmp[0][0] + b2 * src[i][0] + b3 * src[i][0]; - tmp[2][0] = B * src[i][2] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[i][0]; - - for (int j = 3; j < W; j++) { - tmp[j][0] = B * src[i][j] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; - } - - float temp2Wm1 = src[i][W - 1] + M[0][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[0][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[0][2] * (tmp[W - 3][0] - src[i][W - 1]); - float temp2W = src[i][W - 1] + M[1][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[1][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[1][2] * (tmp[W - 3][0] - src[i][W - 1]); - float temp2Wp1 = src[i][W - 1] + M[2][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[2][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[2][2] * (tmp[W - 3][0] - src[i][W - 1]); - - tmp[W - 1][0] = temp2Wm1; - tmp[W - 2][0] = B * tmp[W - 2][0] + b1 * tmp[W - 1][0] + b2 * temp2W + b3 * temp2Wp1; - tmp[W - 3][0] = B * tmp[W - 3][0] + b1 * tmp[W - 2][0] + b2 * tmp[W - 1][0] + b3 * temp2W; - - for (int j = W - 4; j >= 0; j--) { - tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; - } - - for (int j = 0; j < W; j++) { - dst[i][j] = tmp[j][0]; - } - - } -} -#endif - -// fast gaussian approximation if the support window is large - -template void gaussHorizontal (T** src, T** dst, int W, int H, double sigma) -{ - -#ifdef __SSE2__ - - if (sigma < 70) { // bigger sigma only with double precision - gaussHorizontalSse (src, dst, W, H, sigma); - return; - } - -#endif - - if (sigma < 0.25) { - // dont perform filtering - if (src != dst) -#ifdef _OPENMP - #pragma omp for -#endif - for (int i = 0; i < H; i++) { - memcpy (dst[i], src[i], W * sizeof(T)); - } - - return; - } - - if (sigma < 0.6) { - // compute 3x3 kernel - double c1 = exp (-1.0 / (2.0 * sigma * sigma)); - double csum = 2.0 * c1 + 1.0; - c1 /= csum; - double c0 = 1.0 / csum; - gaussHorizontal3 (src, dst, W, H, c0, c1); - return; - } - - // coefficient calculation - double q = 0.98711 * sigma - 0.96330; - - if (sigma < 2.5) { - q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); - } - - double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q; - double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q; - double b2 = -1.4281 * q * q - 1.26661 * q * q * q; - double b3 = 0.422205 * q * q * q; - double B = 1.0 - (b1 + b2 + b3) / b0; - - b1 /= b0; - b2 /= b0; - b3 /= b0; - - // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering - double M[3][3]; - M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2; - M[0][1] = (b3 + b1) * (b2 + b3 * b1); - M[0][2] = b3 * (b1 + b3 * b2); - M[1][0] = b1 + b3 * b2; - M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1); - M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3; - M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2; - M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3; - M[2][2] = b3 * (b1 + b3 * b2); - - for (int i = 0; i < 3; i++) - for (int j = 0; j < 3; j++) { - M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); - } - - double temp2[W] ALIGNED16; - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) { - - temp2[0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0]; - temp2[1] = B * src[i][1] + b1 * temp2[0] + b2 * src[i][0] + b3 * src[i][0]; - temp2[2] = B * src[i][2] + b1 * temp2[1] + b2 * temp2[0] + b3 * src[i][0]; - - for (int j = 3; j < W; j++) { - temp2[j] = B * src[i][j] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3]; - } - - double temp2Wm1 = src[i][W - 1] + M[0][0] * (temp2[W - 1] - src[i][W - 1]) + M[0][1] * (temp2[W - 2] - src[i][W - 1]) + M[0][2] * (temp2[W - 3] - src[i][W - 1]); - double temp2W = src[i][W - 1] + M[1][0] * (temp2[W - 1] - src[i][W - 1]) + M[1][1] * (temp2[W - 2] - src[i][W - 1]) + M[1][2] * (temp2[W - 3] - src[i][W - 1]); - double temp2Wp1 = src[i][W - 1] + M[2][0] * (temp2[W - 1] - src[i][W - 1]) + M[2][1] * (temp2[W - 2] - src[i][W - 1]) + M[2][2] * (temp2[W - 3] - src[i][W - 1]); - - temp2[W - 1] = temp2Wm1; - temp2[W - 2] = B * temp2[W - 2] + b1 * temp2[W - 1] + b2 * temp2W + b3 * temp2Wp1; - temp2[W - 3] = B * temp2[W - 3] + b1 * temp2[W - 2] + b2 * temp2[W - 1] + b3 * temp2W; - - for (int j = W - 4; j >= 0; j--) { - temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3]; - } - - for (int j = 0; j < W; j++) { - dst[i][j] = (T)temp2[j]; - } - - } -} - -#ifdef __SSE2__ -template SSEFUNCTION void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma) -{ - - if (sigma < 0.25) { - // dont perform filtering - if (src != dst) -#ifdef _OPENMP - #pragma omp for -#endif - for (int i = 0; i < H; i++) { - memcpy (dst[i], src[i], W * sizeof(T)); - } - - return; - } - - if (sigma < 0.6) { - // compute 3x3 kernel - double c1 = exp (-1.0 / (2.0 * sigma * sigma)); - double csum = 2.0 * c1 + 1.0; - c1 /= csum; - double c0 = 1.0 / csum; - gaussVertical3Sse (src, dst, W, H, c0, c1); - return; - } - - // coefficient calculation - double q = 0.98711 * sigma - 0.96330; - - if (sigma < 2.5) { - q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); - } - - double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q; - double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q; - double b2 = -1.4281 * q * q - 1.26661 * q * q * q; - double b3 = 0.422205 * q * q * q; - double B = 1.0 - (b1 + b2 + b3) / b0; - - b1 /= b0; - b2 /= b0; - b3 /= b0; - - // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering - double M[3][3]; - M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2; - M[0][1] = (b3 + b1) * (b2 + b3 * b1); - M[0][2] = b3 * (b1 + b3 * b2); - M[1][0] = b1 + b3 * b2; - M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1); - M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3; - M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2; - M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3; - M[2][2] = b3 * (b1 + b3 * b2); - - for (int i = 0; i < 3; i++) - for (int j = 0; j < 3; j++) { - M[i][j] *= (1.0 + b2 + (b1 - b3) * b3); - M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3); - } - - float tmp[H][4] ALIGNED16; - vfloat Rv; - vfloat Tv, Tm2v, Tm3v; - vfloat Bv, b1v, b2v, b3v; - vfloat temp2W, temp2Wp1; - Bv = F2V(B); - b1v = F2V(b1); - b2v = F2V(b2); - b3v = F2V(b3); - - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < W - 3; i += 4) { - Tv = LVFU( src[0][i]); - Rv = Tv * (Bv + b1v + b2v + b3v); - Tm3v = Rv; - STVF( tmp[0][0], Rv ); - - Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v); - Tm2v = Rv; - STVF( tmp[1][0], Rv ); - - Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v; - STVF( tmp[2][0], Rv ); - - for (int j = 3; j < H; j++) { - Tv = Rv; - Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; - STVF( tmp[j][0], Rv ); - Tm3v = Tm2v; - Tm2v = Tv; - } - - Tv = LVFU(src[H - 1][i]); - - temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv); - temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv); - - Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv); - STVFU( dst[H - 1][i], Rv ); - - Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1; - STVFU( dst[H - 2][i], Tm2v ); - - Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W; - STVFU( dst[H - 3][i], Tm3v ); - - Tv = Rv; - Rv = Tm3v; - Tm3v = Tv; - - for (int j = H - 4; j >= 0; j--) { - Tv = Rv; - Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; - STVFU( dst[j][i], Rv ); - Tm3v = Tm2v; - Tm2v = Tv; - } - } - -// Borders are done without SSE -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = W - (W % 4); i < W; i++) { - tmp[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; - tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + b2 * src[0][i] + b3 * src[0][i]; - tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i]; - - for (int j = 3; j < H; j++) { - tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0]; - } - - float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]); - float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]); - float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]); - - tmp[H - 1][0] = temp2Hm1; - tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; - tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H; - - for (int j = H - 4; j >= 0; j--) { - tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0]; - } - - for (int j = 0; j < H; j++) { - dst[j][i] = tmp[j][0]; - } - - } -} - -#endif - -template void gaussVertical (T** src, T** dst, int W, int H, double sigma) -{ - -#ifdef __SSE2__ - - if (sigma < 70) { // bigger sigma only with double precision - gaussVerticalSse (src, dst, W, H, sigma); - return; - } - -#endif - - if (sigma < 0.25) { - // don't perform filtering - if (src != dst) -#ifdef _OPENMP - #pragma omp for -#endif - for (int i = 0; i < H; i++) { - memcpy (dst[i], src[i], W * sizeof(T)); - } - - return; - } - - if (sigma < 0.6) { - // compute 3x3 kernel - double c1 = exp (-1.0 / (2.0 * sigma * sigma)); - double csum = 2.0 * c1 + 1.0; - c1 /= csum; - double c0 = 1.0 / csum; - gaussVertical3 (src, dst, W, H, c0, c1); - return; - } - - // coefficient calculation - double q = 0.98711 * sigma - 0.96330; - - if (sigma < 2.5) { - q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); - } - - double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q; - double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q; - double b2 = -1.4281 * q * q - 1.26661 * q * q * q; - double b3 = 0.422205 * q * q * q; - double B = 1.0 - (b1 + b2 + b3) / b0; - - b1 /= b0; - b2 /= b0; - b3 /= b0; - - // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering - double M[3][3]; - M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2; - M[0][1] = (b3 + b1) * (b2 + b3 * b1); - M[0][2] = b3 * (b1 + b3 * b2); - M[1][0] = b1 + b3 * b2; - M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1); - M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3; - M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2; - M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3; - M[2][2] = b3 * (b1 + b3 * b2); - - for (int i = 0; i < 3; i++) - for (int j = 0; j < 3; j++) { - M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3); - } - - // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) - static const int numcols = 8; - double temp2[H][numcols] ALIGNED16; - double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; -#ifdef _OPENMP - #pragma omp for nowait -#endif - - for (int i = 0; i < W - numcols + 1; i += numcols) { - for (int k = 0; k < numcols; k++) { - temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; - temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k]; - temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k]; - } - - for (int j = 3; j < H; j++) { - for (int k = 0; k < numcols; k++) { - temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k]; - } - } - - for (int k = 0; k < numcols; k++) { - temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]); - temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]); - temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]); - } - - for (int k = 0; k < numcols; k++) { - dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k]; - dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]; - dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]; - } - - for (int j = H - 4; j >= 0; j--) { - for (int k = 0; k < numcols; k++) { - dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]; - } - } - } - -#ifdef _OPENMP - #pragma omp single -#endif - - // process remaining column - for (int i = W - (W % numcols); i < W; i++) { - temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; - temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i]; - temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i]; - - for (int j = 3; j < H; j++) { - temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0]; - } - - double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]); - double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]); - double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]); - - dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1; - dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1; - dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H; - - for (int j = H - 4; j >= 0; j--) { - dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]; - } - } -} - -template void gaussianBlur(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = NULL) -{ - - if(buffer) { // use iterated boxblur to approximate gaussian blur - // Compute ideal averaging filter width and number of iterations - int n = 1; - double wIdeal = sqrt((12 * sigma * sigma) + 1); - - while(wIdeal > W || wIdeal > H) { - n++; - wIdeal = sqrt((12 * sigma * sigma / n) + 1); - } - - if(n < 3) { - n = 3; - wIdeal = sqrt((12 * sigma * sigma / n) + 1); - } else if(n > 6) { - n = 6; - } - - int wl = wIdeal; - - if(wl % 2 == 0) { - wl--; - } - - int wu = wl + 2; - - double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4); - int m = round(mIdeal); - - int sizes[n]; - - for(int i = 0; i < n; i++) { - sizes[i] = ((i < m ? wl : wu) - 1) / 2; - } - - rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H); - - for(int i = 1; i < n; i++) { - rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H); - } - - } else { - gaussHorizontal (src, dst, W, H, sigma); - gaussVertical (dst, dst, W, H, sigma); - - } -} - -#endif +#endif \ No newline at end of file diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index 9257ab81e..b60b5c9bc 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -115,21 +115,30 @@ static INLINE vfloat vcast_vf_f(float f) return _mm_set_ps(f, f, f, f); } +// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also before 4.9) generate better code when not using intrinsics +// example: vaddf(vmulf(a,b),c) will generate an FMA instruction when build for chips with that feature only when vaddf and vmulf don't use intrinsics static INLINE vfloat vaddf(vfloat x, vfloat y) { - return _mm_add_ps(x, y); + return x + y; } static INLINE vfloat vsubf(vfloat x, vfloat y) { - return _mm_sub_ps(x, y); + return x - y; } static INLINE vfloat vmulf(vfloat x, vfloat y) { - return _mm_mul_ps(x, y); + return x * y; } static INLINE vfloat vdivf(vfloat x, vfloat y) { - return _mm_div_ps(x, y); + return x / y; +} +// Also don't use intrinsic here: Some chips support FMA instructions with 3 and 4 operands +// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // destination has to be one of a,b,c +// 4 operands: d = a*b+c // destination does not have to be one of a,b,c +// gcc will use the one which fits best when not using intrinsics. With using intrinsics that's not possible +static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { + return x * y + z; } static INLINE vfloat vrecf(vfloat x) { diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h index ec81f26eb..74db0ff3f 100644 --- a/rtengine/improcfun.h +++ b/rtengine/improcfun.h @@ -73,6 +73,7 @@ class ImProcFunctions void transformLuminanceOnly (Imagefloat* original, Imagefloat* transformed, int cx, int cy, int oW, int oH, int fW, int fH); void transformHighQuality (Imagefloat* original, Imagefloat* transformed, int cx, int cy, int sx, int sy, int oW, int oH, int fW, int fH, const LCPMapper *pLCPMap, bool fullImage); + void sharpenHaloCtrl (float** luminance, float** blurmap, float** base, int W, int H, const SharpeningParams &sharpenParam); void sharpenHaloCtrl (LabImage* lab, float** blurmap, float** base, int W, int H, SharpeningParams &sharpenParam); void sharpenHaloCtrlcam (CieImage* ncie, float** blurmap, float** base, int W, int H); void firstAnalysisThread(Imagefloat* original, Glib::ustring wprofile, unsigned int* histogram, int row_from, int row_to); @@ -271,9 +272,9 @@ public: void Lanczos (const LabImage* src, LabImage* dst, float scale); void Lanczos (const Image16* src, Image16* dst, float scale); - void deconvsharpening (LabImage* lab, float** buffer, SharpeningParams &sharpenParam); - void deconvsharpeningcam (CieImage* ncie, float** buffer); + void deconvsharpening (float** luminance, float** buffer, int W, int H, const SharpeningParams &sharpenParam); void MLsharpen (LabImage* lab);// Manuel's clarity / sharpening + void MLmicrocontrast(float** luminance, int W, int H ); //Manuel's microcontrast void MLmicrocontrast(LabImage* lab ); //Manuel's microcontrast void MLmicrocontrastcam(CieImage* ncie ); //Manuel's microcontrast diff --git a/rtengine/impulse_denoise.h b/rtengine/impulse_denoise.h index 0e3fefd97..ba596559c 100644 --- a/rtengine/impulse_denoise.h +++ b/rtengine/impulse_denoise.h @@ -32,7 +32,6 @@ namespace rtengine SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) { - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // impulse noise removal // local variables @@ -41,15 +40,15 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) int height = lab->H; // buffer for the lowpass image - float ** lpf = new float *[height]; + float * lpf[height] ALIGNED16; + lpf[0] = new float [width * height]; // buffer for the highpass image - float ** impish = new float *[height]; + char * impish[height] ALIGNED16; + impish[0] = new char [width * height]; - for (int i = 0; i < height; i++) { - lpf[i] = new float [width]; - //memset (lpf[i], 0, width*sizeof(float)); - impish[i] = new float [width]; - //memset (impish[i], 0, width*sizeof(unsigned short)); + for (int i = 1; i < height; i++) { + lpf[i] = lpf[i - 1] + width; + impish[i] = impish[i - 1] + width; } @@ -60,12 +59,11 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) const float eps = 1.0; - //rangeblur (lab->L, lpf, impish /*used as buffer here*/, width, height, thresh, false); #ifdef _OPENMP #pragma omp parallel #endif { - gaussianBlur (lab->L, lpf, width, height, max(2.0, thresh - 1.0)); + gaussianBlur (lab->L, lpf, width, height, max(2.0, thresh - 1.0)); } //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -81,9 +79,9 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) int i1, j1, j; float hpfabs, hfnbrave; #ifdef __SSE2__ - __m128 hfnbravev, hpfabsv; - __m128 impthrDiv24v = _mm_set1_ps( impthrDiv24 ); - __m128 onev = _mm_set1_ps( 1.0f ); + vfloat hfnbravev, hpfabsv; + vfloat impthrDiv24v = F2V( impthrDiv24 ); + vfloat onev = F2V( 1.0f ); #endif #ifdef _OPENMP #pragma omp for @@ -105,46 +103,37 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) #ifdef __SSE2__ for (; j < width - 5; j += 4) { - hfnbravev = _mm_setzero_ps( ); + hfnbravev = ZEROV; hpfabsv = vabsf(LVFU(lab->L[i][j]) - LVFU(lpf[i][j])); //block average of high pass data - for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) + for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) { for (j1 = j - 2; j1 <= j + 2; j1++) { hfnbravev += vabsf(LVFU(lab->L[i1][j1]) - LVFU(lpf[i1][j1])); } + } - _mm_storeu_ps(&impish[i][j], vself(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev, _mm_setzero_ps())); - } - - for (; j < width - 2; j++) { - hpfabs = fabs(lab->L[i][j] - lpf[i][j]); - - //block average of high pass data - for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) - for (j1 = j - 2; j1 <= j + 2; j1++) { - hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]); - } - - impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); - } - -#else - - for (; j < width - 2; j++) { - hpfabs = fabs(lab->L[i][j] - lpf[i][j]); - - //block average of high pass data - for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) - for (j1 = j - 2; j1 <= j + 2; j1++) { - hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]); - } - - impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); + int mask = _mm_movemask_ps((hfnbravev - hpfabsv) * impthrDiv24v - hpfabsv); + impish[i][j] = (mask & 1); + impish[i][j + 1] = ((mask & 2) >> 1); + impish[i][j + 2] = ((mask & 4) >> 2); + impish[i][j + 3] = ((mask & 8) >> 3); } #endif + for (; j < width - 2; j++) { + hpfabs = fabs(lab->L[i][j] - lpf[i][j]); + + //block average of high pass data + for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) + for (j1 = j - 2; j1 <= j + 2; j1++) { + hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]); + } + + impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); + } + for (; j < width; j++) { hpfabs = fabs(lab->L[i][j] - lpf[i][j]); @@ -188,10 +177,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = 0; j1 <= j + 2; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -220,10 +205,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 <= j + 2; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -252,10 +233,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 < width; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -277,13 +254,8 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) } //now impulsive values have been corrected - for (int i = 0; i < height; i++) { - delete [] lpf[i]; - delete [] impish[i]; - } - - delete [] lpf; - delete [] impish; + delete [] lpf[0]; + delete [] impish[0]; } @@ -317,7 +289,7 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, #pragma omp parallel #endif { - gaussianBlur (ncie->sh_p, lpf, width, height, max(2.0, thresh - 1.0)); + gaussianBlur (ncie->sh_p, lpf, width, height, max(2.0, thresh - 1.0)); } //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -332,9 +304,9 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, int i1, j1, j; float hpfabs, hfnbrave; #ifdef __SSE2__ - __m128 hfnbravev, hpfabsv; - __m128 impthrDiv24v = _mm_set1_ps( impthrDiv24 ); - __m128 onev = _mm_set1_ps( 1.0f ); + vfloat hfnbravev, hpfabsv; + vfloat impthrDiv24v = F2V( impthrDiv24 ); + vfloat onev = F2V( 1.0f ); #endif #ifdef _OPENMP #pragma omp for @@ -357,7 +329,7 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, for (; j < width - 5; j += 4) { hpfabsv = vabsf(LVFU(ncie->sh_p[i][j]) - LVFU(lpf[i][j])); - hfnbravev = _mm_setzero_ps(); + hfnbravev = ZEROV; //block average of high pass data for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) { @@ -365,38 +337,25 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, hfnbravev += vabsf(LVFU(ncie->sh_p[i1][j1]) - LVFU(lpf[i1][j1])); } - _mm_storeu_ps(&impish[i][j], vself(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev, _mm_setzero_ps())); } - } - for (; j < width - 2; j++) { - hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]); - - //block average of high pass data - for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) - for (j1 = j - 2; j1 <= j + 2; j1++ ) { - hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]); - } - - impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); - } - -#else - - for (; j < width - 2; j++) { - hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]); - - //block average of high pass data - for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) - for (j1 = j - 2; j1 <= j + 2; j1++ ) { - hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]); - } - - impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); + STVFU(impish[i][j], vselfzero(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev)); } #endif + for (; j < width - 2; j++) { + hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]); + + //block average of high pass data + for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ ) + for (j1 = j - 2; j1 <= j + 2; j1++ ) { + hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]); + } + + impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24)); + } + for (; j < width; j++) { hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]); @@ -422,42 +381,34 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, #pragma omp parallel #endif { - int j; - float2 sincosval; + #ifdef __SSE2__ vfloat2 sincosvalv; - __m128 piidv = _mm_set1_ps( piid ); - __m128 tempv; + vfloat piidv = F2V( piid ); + vfloat tempv; #endif #ifdef _OPENMP #pragma omp for #endif for (int i = 0; i < height; i++) { + int j = 0; #ifdef __SSE2__ - for (j = 0; j < width - 3; j += 4) { + for (; j < width - 3; j += 4) { sincosvalv = xsincosf(piidv * LVFU(ncie->h_p[i][j])); tempv = LVFU(ncie->C_p[i][j]); - _mm_storeu_ps(&sraa[i][j], tempv * sincosvalv.y); - _mm_storeu_ps(&srbb[i][j], tempv * sincosvalv.x); - } - - for (; j < width; j++) { - sincosval = xsincosf(piid * ncie->h_p[i][j]); - sraa[i][j] = ncie->C_p[i][j] * sincosval.y; - srbb[i][j] = ncie->C_p[i][j] * sincosval.x; - } - -#else - - for (j = 0; j < width; j++) { - sincosval = xsincosf(piid * ncie->h_p[i][j]); - sraa[i][j] = ncie->C_p[i][j] * sincosval.y; - srbb[i][j] = ncie->C_p[i][j] * sincosval.x; + STVFU(sraa[i][j], tempv * sincosvalv.y); + STVFU(srbb[i][j], tempv * sincosvalv.x); } #endif + + for (; j < width; j++) { + float2 sincosval = xsincosf(piid * ncie->h_p[i][j]); + sraa[i][j] = ncie->C_p[i][j] * sincosval.y; + srbb[i][j] = ncie->C_p[i][j] * sincosval.x; + } } } @@ -488,10 +439,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = 0; j1 <= j + 2; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -520,10 +467,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 <= j + 2; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -552,10 +495,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) for (j1 = j - 2; j1 < width; j1++ ) { - if (i1 == i && j1 == j) { - continue; - } - if (impish[i1][j1]) { continue; } @@ -583,41 +522,32 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, #endif { #ifdef __SSE2__ - __m128 interav, interbv; - __m128 piidv = _mm_set1_ps(piid); + vfloat interav, interbv; + vfloat piidv = F2V(piid); #endif // __SSE2__ - int j; #ifdef _OPENMP #pragma omp for #endif for(int i = 0; i < height; i++ ) { + int j = 0; #ifdef __SSE2__ - for(j = 0; j < width - 3; j += 4) { + for(; j < width - 3; j += 4) { interav = LVFU(sraa[i][j]); interbv = LVFU(srbb[i][j]); - _mm_storeu_ps(&ncie->h_p[i][j], (xatan2f(interbv, interav)) / piidv); - _mm_storeu_ps(&ncie->C_p[i][j], _mm_sqrt_ps(SQRV(interbv) + SQRV(interav))); + STVFU(ncie->h_p[i][j], (xatan2f(interbv, interav)) / piidv); + STVFU(ncie->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav))); } +#endif + for(; j < width; j++) { float intera = sraa[i][j]; float interb = srbb[i][j]; ncie->h_p[i][j] = (xatan2f(interb, intera)) / piid; ncie->C_p[i][j] = sqrt(SQR(interb) + SQR(intera)); } - -#else - - for(j = 0; j < width; j++) { - float intera = sraa[i][j]; - float interb = srbb[i][j]; - ncie->h_p[i][j] = (xatan2f(interb, intera)) / piid; - ncie->C_p[i][j] = sqrt(SQR(interb) + SQR(intera)); - } - -#endif } } diff --git a/rtengine/ipretinex.cc b/rtengine/ipretinex.cc index 8426f3f75..30863a4f9 100644 --- a/rtengine/ipretinex.cc +++ b/rtengine/ipretinex.cc @@ -18,7 +18,7 @@ * adaptation to RawTherapee * 2015 Jacques Desmis - * 2015 Ingo Weyrich + * 2015 Ingo Weyrich * D. J. Jobson, Z. Rahman, and G. A. Woodell. A multi-scale * Retinex for bridging the gap between color images and the @@ -235,17 +235,19 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e bool higplus = false ; float elogt; float hl = deh.baselog; + if(hl >= 2.71828f) { elogt = 2.71828f + SQR(SQR(hl - 2.71828f)); } else { elogt = hl; } + int H_L = height; int W_L = width; float *tran[H_L] ALIGNED16; float *tranBuffer; - int viewmet=0; + int viewmet = 0; elogt = 2.71828f;//disabled baselog FlatCurve* shcurve = NULL;//curve L=f(H) @@ -280,116 +282,121 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e if (deh.retinexMethod == "highli" || deh.retinexMethod == "highliplus") { moderetinex = 3; } - for(int it=1; it= 0; scale-- ) { if(scale == scal - 1) { - gaussianBlur (src, out, W_L, H_L, RetinexScales[scale], buffer); - } else { - // reuse result of last iteration - gaussianBlur (out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer); + gaussianBlur (src, out, W_L, H_L, RetinexScales[scale], buffer); + } else { // reuse result of last iteration + gaussianBlur (out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer); } - //printf("scal=%d RetinexScales=%f\n",scale, RetinexScales[scale]); - printf(".."); - - if(mapmet==4) shradius /= 1.; - else shradius = 40.; + if(mapmet == 4) { + shradius /= 1.; + } else { + shradius = 40.; + } // if(shHighlights > 0 || shShadows > 0) { - if(mapmet==3) if(it==1) shmap->updateL (out, shradius, true, 1);//wav Total - if(mapmet==2 && scale >2) if(it==1) shmap->updateL (out, shradius, true, 1);//wav partial - if(mapmet==4) if(it==1) shmap->updateL (out, shradius, false, 1);//gauss + if(mapmet == 3) if(it == 1) { + shmap->updateL (out, shradius, true, 1); //wav Total + } + + if(mapmet == 2 && scale > 2) if(it == 1) { + shmap->updateL (out, shradius, true, 1); //wav partial + } + + if(mapmet == 4) if(it == 1) { + shmap->updateL (out, shradius, false, 1); //gauss + } + // } if (shmap) { h_th = shmap->max_f - deh.htonalwidth * (shmap->max_f - shmap->avg) / 100; s_th = deh.stonalwidth * (shmap->avg - shmap->min_f) / 100; } - #ifdef __SSE2__ vfloat pondv = F2V(pond); vfloat limMinv = F2V(ilimdx); @@ -490,11 +529,15 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e #ifdef _OPENMP #pragma omp for #endif + for (int i = 0; i < H_L; i++) { if(mapcontlutili) { int j = 0; + for (; j < W_L; j++) { - if(it==1) out[i][j] = mapcurve[2.f * out[i][j]] / 2.f; + if(it == 1) { + out[i][j] = mapcurve[2.f * out[i][j]] / 2.f; + } } } } @@ -502,14 +545,16 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e } // if(shHighlights > 0 || shShadows > 0) { - if(((mapmet == 2 && scale >2) || mapmet==3 || mapmet==4) && it==1) { + if(((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) { #ifdef _OPENMP #pragma omp for #endif + for (int i = 0; i < H_L; i++) { int j = 0; + for (; j < W_L; j++) { double mapval = 1.0 + shmap->map[i][j]; double factor = 1.0; @@ -519,11 +564,13 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e } else if (mapval < s_th) { factor = (s_th - (100.0 - shShadows) * (s_th - mapval) / 100.0) / mapval; } + out[i][j] *= factor; } } } + // } #ifdef _OPENMP @@ -559,12 +606,13 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e } } } - printf(".\n"); + if(mapmet > 1) { if(shmap) { delete shmap; } } + shmap = NULL; delete [] buffer; @@ -613,12 +661,15 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e } luminance[i][j] *= (-1.f + 4.f * dehatransmissionCurve[absciss]); //new transmission - if(viewmet==3 || viewmet==2) tran[i][j]=luminance[i][j]; + + if(viewmet == 3 || viewmet == 2) { + tran[i][j] = luminance[i][j]; + } } } // median filter on transmission ==> reduce artifacts - if (deh.medianmap && it==1) {//only one time + if (deh.medianmap && it == 1) { //only one time int wid = W_L; int hei = H_L; float *tmL[hei] ALIGNED16; @@ -686,10 +737,10 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e maxCD = -9999999.f; minCD = 9999999.f; // coeff for auto "transmission" with 2 sigma #95% datas - float aza=16300.f/(2.f*stddv); - float azb=-aza*(mean-2.f*stddv); - float bza=16300.f/(2.f*stddv); - float bzb=16300.f-bza*(mean); + float aza = 16300.f / (2.f * stddv); + float azb = -aza * (mean - 2.f * stddv); + float bza = 16300.f / (2.f * stddv); + float bzb = 16300.f - bza * (mean); @@ -718,7 +769,7 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e float str = strengthx; - if(lhutili && it==1) { // S=f(H) + if(lhutili && it == 1) { // S=f(H) { float HH = exLuminance[i][j]; float valparam; @@ -733,15 +784,33 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e } } - if(exLuminance[i][j] > 65535.f*hig && higplus) str *= hig; - if(viewmet==0) luminance[i][j]=clipretinex( cd, 0.f, 32768.f ) * str + (1.f - str) * originalLuminance[i][j]; - if(viewmet==1) luminance[i][j] = out[i][j]; - if(viewmet==4) luminance[i][j] = (1.f + str) * originalLuminance[i][j] - str* out[i][j];//unsharp - if(viewmet==2) { - if(tran[i][j]<= mean) luminance[i][j] = azb + aza*tran[i][j];//auto values - else luminance[i][j] = bzb + bza*tran[i][j]; + if(exLuminance[i][j] > 65535.f * hig && higplus) { + str *= hig; + } + + if(viewmet == 0) { + luminance[i][j] = clipretinex( cd, 0.f, 32768.f ) * str + (1.f - str) * originalLuminance[i][j]; + } + + if(viewmet == 1) { + luminance[i][j] = out[i][j]; + } + + if(viewmet == 4) { + luminance[i][j] = (1.f + str) * originalLuminance[i][j] - str * out[i][j]; //unsharp + } + + if(viewmet == 2) { + if(tran[i][j] <= mean) { + luminance[i][j] = azb + aza * tran[i][j]; //auto values + } else { + luminance[i][j] = bzb + bza * tran[i][j]; + } + } + + if(viewmet == 3) { + luminance[i][j] = 1000.f + tran[i][j] * 700.f; //arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5 } - if(viewmet==3) luminance[i][j] = 1000.f + tran[i][j]*700.f;//arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5 } @@ -763,11 +832,12 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e Tmax = maxtr; - if (shcurve && it==1) { + if (shcurve && it == 1) { delete shcurve; } } - if(viewmet==3 || viewmet==2) { + + if(viewmet == 3 || viewmet == 2) { delete [] tranBuffer; tranBuffer = NULL; } diff --git a/rtengine/ipsharpen.cc b/rtengine/ipsharpen.cc index 928ffe5bd..a3d624b65 100644 --- a/rtengine/ipsharpen.cc +++ b/rtengine/ipsharpen.cc @@ -23,7 +23,6 @@ #include "rt_math.h" #include "sleef.c" #include "opthelper.h" - using namespace std; namespace rtengine @@ -42,136 +41,96 @@ SSEFUNCTION void ImProcFunctions::dcdamping (float** aI, float** aO, float dampi const float dampingFac = -2.0 / (damping * damping); #ifdef __SSE2__ - __m128 Iv, Ov, Uv, zerov, onev, fourv, fivev, dampingFacv, Tv; + __m128 Iv, Ov, Uv, zerov, onev, fourv, fivev, dampingFacv, Tv, Wv, Lv; zerov = _mm_setzero_ps( ); - onev = _mm_set1_ps( 1.0f ); - fourv = _mm_set1_ps( 4.0f ); - fivev = _mm_set1_ps( 5.0f ); - dampingFacv = _mm_set1_ps( dampingFac ); + onev = F2V( 1.0f ); + fourv = F2V( 4.0f ); + fivev = F2V( 5.0f ); + dampingFacv = F2V( dampingFac ); +#endif #ifdef _OPENMP #pragma omp for #endif - for (int i = 0; i < H; i++) - for (int j = 0; j < W - 3; j += 4) { - Iv = _mm_loadu_ps( &aI[i][j] ); - Ov = _mm_loadu_ps( &aO[i][j] ); + for (int i = 0; i < H; i++) { + int j = 0; +#ifdef __SSE2__ - Uv = (Ov * xlogf(Iv / Ov) - Iv + Ov) * dampingFacv; - Uv = _mm_min_ps(Uv, onev); + for (; j < W - 3; j += 4) { + Iv = LVFU( aI[i][j] ); + Ov = LVFU( aO[i][j] ); + Lv = xlogf(Iv / Ov); + Wv = Ov - Iv; + Uv = (Ov * Lv + Wv) * dampingFacv; + Uv = vminf(Uv, onev); Tv = Uv * Uv; Tv = Tv * Tv; Uv = Tv * (fivev - Uv * fourv); - Uv = (Ov - Iv) / Iv * Uv + onev; - Uv = vself(vmaskf_ge(zerov, Iv), zerov, Uv); - Uv = vself(vmaskf_ge(zerov, Ov), zerov, Uv); - - _mm_storeu_ps( &aI[i][j], Uv ); + Uv = (Wv / Iv) * Uv + onev; + Uv = vselfzero(vmaskf_gt(Iv, zerov), Uv); + Uv = vselfzero(vmaskf_gt(Ov, zerov), Uv); + STVFU( aI[i][j], Uv ); } -// border pixels are done without SSE2 - float I, O, U; -#ifdef _OPENMP - #pragma omp for #endif - for (int i = 0; i < H; i++) - for(int j = W - (W % 4); j < W; j++) { - I = aI[i][j]; - O = aO[i][j]; + for(; j < W; j++) { + float I = aI[i][j]; + float O = aO[i][j]; - if (O <= 0.0 || I <= 0.0) { - aI[i][j] = 0.0; + if (O <= 0.f || I <= 0.f) { + aI[i][j] = 0.f; continue; } - U = (O * xlogf(I / O) - I + O) * dampingFac; + float U = (O * xlogf(I / O) - I + O) * dampingFac; U = min(U, 1.0f); - U = U * U * U * U * (5.0 - U * 4.0); - aI[i][j] = (O - I) / I * U + 1.0; + U = U * U * U * U * (5.f - U * 4.f); + aI[i][j] = (O - I) / I * U + 1.f; } - -#else // without __SSE2__ - float I, O, U; -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) { - I = aI[i][j]; - O = aO[i][j]; - - if (O <= 0.0 || I <= 0.0) { - aI[i][j] = 0.0; - continue; - } - - U = (O * xlogf(I / O) - I + O) * dampingFac; - U = min(U, 1.0f); - U = U * U * U * U * (5.0 - U * 4.0); - aI[i][j] = (O - I) / I * U + 1.0; - } - -#endif + } } -void ImProcFunctions::deconvsharpening (LabImage* lab, float** b2, SharpeningParams &sharpenParam) +void ImProcFunctions::deconvsharpening (float** luminance, float** tmp, int W, int H, const SharpeningParams &sharpenParam) { - if (sharpenParam.enabled == false || sharpenParam.deconvamount < 1) { + if (sharpenParam.deconvamount < 1) { return; } - int W = lab->W, H = lab->H; + float *tmpI[H] ALIGNED16; - float** tmpI = new float*[H]; + tmpI[0] = new float[W * H]; + + for (int i = 1; i < H; i++) { + tmpI[i] = tmpI[i - 1] + W; + } for (int i = 0; i < H; i++) { - tmpI[i] = new float[W]; - - for (int j = 0; j < W; j++) { - tmpI[i][j] = (float)lab->L[i][j]; + for(int j = 0; j < W; j++) { + tmpI[i][j] = luminance[i][j]; } } - float** tmp = (float**)b2; + float damping = sharpenParam.deconvdamping / 5.0; + bool needdamp = sharpenParam.deconvdamping > 0; + double sigma = sharpenParam.deconvradius / scale; #ifdef _OPENMP #pragma omp parallel #endif { - float damping = sharpenParam.deconvdamping / 5.0; - bool needdamp = sharpenParam.deconvdamping > 0; - for (int k = 0; k < sharpenParam.deconviter; k++) { - - // apply blur function (gaussian blur) - gaussianBlur (tmpI, tmp, W, H, sharpenParam.deconvradius / scale); - if (!needdamp) { -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) - if (tmp[i][j] > 0) { - tmp[i][j] = (float)lab->L[i][j] / tmp[i][j]; - } + // apply gaussian blur and divide luminance by result of gaussian blur + gaussianBlur (tmpI, tmp, W, H, sigma, nullptr, GAUSS_DIV, luminance); } else { - dcdamping (tmp, lab->L, damping, W, H); + // apply gaussian blur + damping + gaussianBlur (tmpI, tmp, W, H, sigma); + dcdamping (tmp, luminance, damping, W, H); } - gaussianBlur (tmp, tmp, W, H, sharpenParam.deconvradius / scale); + gaussianBlur (tmp, tmpI, W, H, sigma, nullptr, GAUSS_MULT); -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) { - tmpI[i][j] = tmpI[i][j] * tmp[i][j]; - } } // end for float p2 = sharpenParam.deconvamount / 100.0; @@ -183,34 +142,35 @@ void ImProcFunctions::deconvsharpening (LabImage* lab, float** b2, SharpeningPar for (int i = 0; i < H; i++) for (int j = 0; j < W; j++) { - lab->L[i][j] = lab->L[i][j] * p1 + max(tmpI[i][j], 0.0f) * p2; + luminance[i][j] = luminance[i][j] * p1 + max(tmpI[i][j], 0.0f) * p2; } - } // end parallel - for (int i = 0; i < H; i++) { - delete [] tmpI[i]; - } + delete [] tmpI[0]; - delete [] tmpI; } void ImProcFunctions::sharpening (LabImage* lab, float** b2, SharpeningParams &sharpenParam) { - if (sharpenParam.method == "rld") { - deconvsharpening (lab, b2, sharpenParam); + if (!sharpenParam.enabled) { return; } + if (sharpenParam.method == "rld") { + deconvsharpening (lab->L, b2, lab->W, lab->H, sharpenParam); + return; + } + + if ((!sharpenParam.enabled) || sharpenParam.amount < 1 || lab->W < 8 || lab->H < 8) { + return; + } + + // Rest is UNSHARP MASK - if (sharpenParam.enabled == false || sharpenParam.amount < 1 || lab->W < 8 || lab->H < 8) { - return; - } - int W = lab->W, H = lab->H; - float** b3 = NULL; - float** labCopy = NULL; + float** b3 = nullptr; + float** labCopy = nullptr; if (sharpenParam.edgesonly) { b3 = new float*[H]; @@ -221,7 +181,7 @@ void ImProcFunctions::sharpening (LabImage* lab, float** b2, SharpeningParams &s } if (sharpenParam.halocontrol && !sharpenParam.edgesonly) { - // We only need the lab parameter copy in this special case + // We only need the lab channel copy in this special case labCopy = new float*[H]; for( int i = 0; i < H; i++ ) { @@ -235,10 +195,10 @@ void ImProcFunctions::sharpening (LabImage* lab, float** b2, SharpeningParams &s { if (sharpenParam.edgesonly == false) { - gaussianBlur (lab->L, b2, W, H, sharpenParam.radius / scale); + gaussianBlur (lab->L, b2, W, H, sharpenParam.radius / scale); } else { bilateral (lab->L, (float**)b3, b2, W, H, sharpenParam.edges_radius / scale, sharpenParam.edges_tolerance, multiThread); - gaussianBlur (b3, b2, W, H, sharpenParam.radius / scale); + gaussianBlur (b3, b2, W, H, sharpenParam.radius / scale); } float** base = lab->L; @@ -277,7 +237,7 @@ void ImProcFunctions::sharpening (LabImage* lab, float** b2, SharpeningParams &s base = labCopy; } - sharpenHaloCtrl (lab, b2, base, W, H, sharpenParam); + sharpenHaloCtrl (lab->L, b2, base, W, H, sharpenParam); } } // end parallel @@ -300,7 +260,7 @@ void ImProcFunctions::sharpening (LabImage* lab, float** b2, SharpeningParams &s } } -void ImProcFunctions::sharpenHaloCtrl (LabImage* lab, float** blurmap, float** base, int W, int H, SharpeningParams &sharpenParam) +void ImProcFunctions::sharpenHaloCtrl (float** luminance, float** blurmap, float** base, int W, int H, const SharpeningParams &sharpenParam) { float scale = (100.f - sharpenParam.halocontrol_amount) * 0.01f; @@ -331,7 +291,7 @@ void ImProcFunctions::sharpenHaloCtrl (LabImage* lab, float** blurmap, float** b max2 = maxn; min1 = min2; min2 = minn; - labL = lab->L[i][j]; + labL = luminance[i][j]; if (max_ < labL) { max_ = labL; @@ -358,7 +318,7 @@ void ImProcFunctions::sharpenHaloCtrl (LabImage* lab, float** blurmap, float** b newL = min_ - (min_ - newL) * scale; } - lab->L[i][j] = newL; + luminance[i][j] = newL; } } } @@ -606,851 +566,377 @@ void ImProcFunctions::MLsharpen (LabImage* lab) //! MicroContrast is a sharpening method developed by Manuel Llorens and documented here: http://www.rawness.es/sharpening/?lang=en //!
The purpose is maximize clarity of the image without creating halo's. //!
Addition from JD : pyramid + pondered contrast with matrix 5x5 -//! \param lab LabImage Image in the CIELab colour space +//! \param luminance : Luminance channel of image +void ImProcFunctions::MLmicrocontrast(float** luminance, int W, int H) +{ + if (params->sharpenMicro.enabled == false) { + return; + } + + MyTime t1e, t2e; + t1e.set(); + + int k = params->sharpenMicro.matrix ? 1 : 2; + + // k=2 matrix 5x5 k=1 matrix 3x3 + int offset, offset2, i, j, col, row, n; + float temp, temp2, temp3, temp4, tempL; + float *LM, v, s, contrast; + int signs[25]; + int width = W, height = H; + float uniform = params->sharpenMicro.uniformity;//between 0 to 100 + int unif; + unif = (int)(uniform / 10.0f); //put unif between 0 to 10 + float amount = params->sharpenMicro.amount / 1500.0f; //amount 2000.0 quasi no artefacts ==> 1500 = maximum, after artefacts + + if (amount < 0.000001f) { + return; + } + + if (k == 1) { + amount *= 2.7f; //25/9 if 3x3 + } + + if (settings->verbose) { + printf ("Micro-contrast amount %f\n", amount); + } + + if (settings->verbose) { + printf ("Micro-contrast uniformity %i\n", unif); + } + + //modulation uniformity in function of luminance + float L98[11] = {0.001f, 0.0015f, 0.002f, 0.004f, 0.006f, 0.008f, 0.01f, 0.03f, 0.05f, 0.1f, 0.1f}; + float L95[11] = {0.0012f, 0.002f, 0.005f, 0.01f, 0.02f, 0.05f, 0.1f, 0.12f, 0.15f, 0.2f, 0.25f}; + float L92[11] = {0.01f, 0.015f, 0.02f, 0.06f, 0.10f, 0.13f, 0.17f, 0.25f, 0.3f, 0.32f, 0.35f}; + float L90[11] = {0.015f, 0.02f, 0.04f, 0.08f, 0.12f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; + float L87[11] = {0.025f, 0.03f, 0.05f, 0.1f, 0.15f, 0.25f, 0.3f, 0.4f, 0.5f, 0.63f, 0.75f}; + float L83[11] = {0.055f, 0.08f, 0.1f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.75f, 0.85f}; + float L80[11] = {0.15f, 0.2f, 0.25f, 0.3f, 0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; + float L75[11] = {0.22f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 0.95f}; + float L70[11] = {0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.97f, 1.0f, 1.0f, 1.0f, 1.0f}; + float L63[11] = {0.55f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + float L58[11] = {0.75f, 0.77f, 0.8f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + //default 5 + //modulation contrast + float Cont0[11] = {0.05f, 0.1f, 0.2f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; + float Cont1[11] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 0.95f, 1.0f}; + float Cont2[11] = {0.2f, 0.40f, 0.6f, 0.7f, 0.8f, 0.85f, 0.90f, 0.95f, 1.0f, 1.05f, 1.10f}; + float Cont3[11] = {0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.05f, 1.10f, 1.20f}; + float Cont4[11] = {0.8f, 0.85f, 0.9f, 0.95f, 1.0f, 1.05f, 1.10f, 1.150f, 1.2f, 1.25f, 1.40f}; + float Cont5[11] = {1.0f, 1.1f, 1.2f, 1.25f, 1.3f, 1.4f, 1.45f, 1.50f, 1.6f, 1.65f, 1.80f}; + + float chmax = 8.0f; + LM = new float[width * height]; //allocation for Luminance +#ifdef _OPENMP + #pragma omp parallel for private(offset, i,j) shared(LM) +#endif + + for(j = 0; j < height; j++) + for(i = 0, offset = j * width + i; i < width; i++, offset++) { + LM[offset] = luminance[j][i] / 327.68f; // adjust to 0.100 and to RT variables + } + +#ifdef _OPENMP + #pragma omp parallel for private(j,i,offset,s,signs,v,n,row,col,offset2,contrast,temp,temp2,tempL,temp4) shared(luminance,LM,amount,chmax,unif,k,L98,L95,L92,L90,L87,L83,L80,L75,L70,L63,L58,Cont0,Cont1,Cont2,Cont3,Cont4,Cont5) +#endif + + for(j = k; j < height - k; j++) + for(i = k, offset = j * width + i; i < width - k; i++, offset++) { + s = amount; + v = LM[offset]; + n = 0; + + for(row = j - k; row <= j + k; row++) + for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { + signs[n] = 0; + + if (v < LM[offset2]) { + signs[n] = -1; + } + + if (v > LM[offset2]) { + signs[n] = 1; + } + + n++; + } + + if (k == 1) { + contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width])) / chmax; //for 3x3 + } else /* if (k==2) */ contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width]) + + fabs(LM[offset + 2] - LM[offset - 2]) * fabs(LM[offset + 2] - LM[offset - 2]) + fabs(LM[offset + 2 * width] - LM[offset - 2 * width]) * fabs(LM[offset + 2 * width] - LM[offset - 2 * width])) / (2 * chmax); //for 5x5 + + if (contrast > 1.0f) { + contrast = 1.0f; + } + + //matrix 5x5 + temp = luminance[j][i] / 327.68f; //begin 3x3 + temp += CLIREF(v - LM[offset - width - 1]) * sqrtf(2.0f) * s; + temp += CLIREF(v - LM[offset - width]) * s; + temp += CLIREF(v - LM[offset - width + 1]) * sqrtf(2.0f) * s; + temp += CLIREF(v - LM[offset - 1]) * s; + temp += CLIREF(v - LM[offset + 1]) * s; + temp += CLIREF(v - LM[offset + width - 1]) * sqrtf(2.0f) * s; + temp += CLIREF(v - LM[offset + width]) * s; + temp += CLIREF(v - LM[offset + width + 1]) * sqrtf(2.0f) * s; //end 3x3 + + // add JD continue 5x5 + if (k == 2) { + temp += 2.0f * CLIREF(v - LM[offset + 2 * width]) * s; + temp += 2.0f * CLIREF(v - LM[offset - 2 * width]) * s; + temp += 2.0f * CLIREF(v - LM[offset - 2 ]) * s; + temp += 2.0f * CLIREF(v - LM[offset + 2 ]) * s; + + temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 1]) * s * sqrtf(1.25f); // 1.25 = 1*1 + 0.5*0.5 + temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 2]) * s * sqrtf(2.00f); + temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 1]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 2]) * s * sqrtf(2.00f); + temp += 2.0f * CLIREF(v - LM[offset + width + 2]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset + width - 2]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 1]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 2]) * s * sqrtf(2.00f); + temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 1]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 2]) * s * sqrtf(2.00f); + temp += 2.0f * CLIREF(v - LM[offset - width + 2]) * s * sqrtf(1.25f); + temp += 2.0f * CLIREF(v - LM[offset - width - 2]) * s * sqrtf(1.25f); + } + + if (temp < 0.0f) { + temp = 0.0f; + } + + v = temp; + + n = 0; + + for(row = j - k; row <= j + k; row++) { + for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { + if (((v < LM[offset2]) && (signs[n] > 0)) || ((v > LM[offset2]) && (signs[n] < 0))) { + temp = v * 0.75f + LM[offset2] * 0.25f; // 0.75 0.25 + } + + n++; + } + } + + if (LM[offset] > 95.0f || LM[offset] < 5.0f) { + contrast *= Cont0[unif]; //+ JD : luminance pyramid to adjust contrast by evaluation of LM[offset] + } else if (LM[offset] > 90.0f || LM[offset] < 10.0f) { + contrast *= Cont1[unif]; + } else if (LM[offset] > 80.0f || LM[offset] < 20.0f) { + contrast *= Cont2[unif]; + } else if (LM[offset] > 70.0f || LM[offset] < 30.0f) { + contrast *= Cont3[unif]; + } else if (LM[offset] > 60.0f || LM[offset] < 40.0f) { + contrast *= Cont4[unif]; + } else { + contrast *= Cont5[unif]; //(2.0f/k)*Cont5[unif]; + } + + if (contrast > 1.0f) { + contrast = 1.0f; + } + + tempL = 327.68f * (temp * (1.0f - contrast) + LM[offset] * contrast); + // JD: modulation of microcontrast in function of original Luminance and modulation of luminance + temp2 = tempL / (327.68f * LM[offset]); //for highlights + + if (temp2 > 1.0f) { + if (temp2 > 1.70f) { + temp2 = 1.70f; //limit action + } + + if (LM[offset] > 98.0f) { + luminance[j][i] = LM[offset] * 327.68f; + } else if (LM[offset] > 95.0f) { + temp = (L95[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 92.0f) { + temp = (L92[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 90.0f) { + temp = (L90[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 87.0f) { + temp = (L87[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 83.0f) { + temp = (L83[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 80.0f) { + temp = (L80[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 75.0f) { + temp = (L75[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 70.0f) { + temp = (L70[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 63.0f) { + temp = (L63[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 58.0f) { + temp = (L58[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 42.0f) { + temp = (L58[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 37.0f) { + temp = (L63[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 30.0f) { + temp = (L70[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 25.0f) { + temp = (L75[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 20.0f) { + temp = (L80[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 17.0f) { + temp = (L83[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 13.0f) { + temp = (L87[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 10.0f) { + temp = (L90[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 5.0f) { + temp = (L95[unif] * (temp2 - 1.f)) + 1.0f; + luminance[j][i] = temp * LM[offset] * 327.68f; + } else if (LM[offset] > 0.0f) { + luminance[j][i] = LM[offset] * 327.68f; + } + } + + temp4 = (327.68f * LM[offset]) / tempL; // + + if (temp4 > 1.0f) { + if (temp4 > 1.7f) { + temp4 = 1.7f; //limit action + } + + if (LM[offset] < 2.0f) { + temp3 = temp4 - 1.0f; + temp = (L98[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 5.0f) { + temp3 = temp4 - 1.0f; + temp = (L95[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 8.0f) { + temp3 = temp4 - 1.0f; + temp = (L92[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 10.0f) { + temp3 = temp4 - 1.0f; + temp = (L90[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 13.0f) { + temp3 = temp4 - 1.0f; + temp = (L87[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 17.0f) { + temp3 = temp4 - 1.0f; + temp = (L83[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 20.0f) { + temp3 = temp4 - 1.0f; + temp = (L80[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 25.0f) { + temp3 = temp4 - 1.0f; + temp = (L75[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 30.0f) { + temp3 = temp4 - 1.0f; + temp = (L70[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 37.0f) { + temp3 = temp4 - 1.0f; + temp = (L63[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 42.0f) { + temp3 = temp4 - 1.0f; + temp = (L58[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 58.0f) { + temp3 = temp4 - 1.0f; + temp = (L58[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 63.0f) { + temp3 = temp4 - 1.0f; + temp = (L63[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 70.0f) { + temp3 = temp4 - 1.0f; + temp = (L70[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 75.0f) { + temp3 = temp4 - 1.0f; + temp = (L75[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 80.0f) { + temp3 = temp4 - 1.0f; + temp = (L80[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 83.0f) { + temp3 = temp4 - 1.0f; + temp = (L83[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 87.0f) { + temp3 = temp4 - 1.0f; + temp = (L87[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 90.0f) { + temp3 = temp4 - 1.0f; + temp = (L90[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 95.0f) { + temp3 = temp4 - 1.0f; + temp = (L95[unif] * temp3) + 1.0f; + luminance[j][i] = (LM[offset] * 327.68f) / temp; + } else if (LM[offset] < 100.0f) { + luminance[j][i] = LM[offset] * 327.68f; + } + } + + } + + delete [] LM; + t2e.set(); + + if (settings->verbose) { + printf("Micro-contrast %d usec\n", t2e.etime(t1e)); + } + +} + void ImProcFunctions::MLmicrocontrast(LabImage* lab) { - if (params->sharpenMicro.enabled == false) { - return; - } - - MyTime t1e, t2e; - t1e.set(); - int k; - - if (params->sharpenMicro.matrix == false) { - k = 2; - } else { - k = 1; - } - - // k=2 matrix 5x5 k=1 matrix 3x3 - int offset, offset2, i, j, col, row, n; - float temp, temp2, temp3, temp4, tempL; - float *LM, v, s, contrast; - int signs[25]; - int width = lab->W, height = lab->H; - float uniform = params->sharpenMicro.uniformity;//between 0 to 100 - int unif; - unif = (int)(uniform / 10.0f); //put unif between 0 to 10 - float amount = params->sharpenMicro.amount / 1500.0f; //amount 2000.0 quasi no artefacts ==> 1500 = maximum, after artefacts - - if (amount < 0.000001f) { - return; - } - - if (k == 1) { - amount *= 2.7f; //25/9 if 3x3 - } - - if (settings->verbose) { - printf ("Micro-contrast amount %f\n", amount); - } - - if (settings->verbose) { - printf ("Micro-contrast uniformity %i\n", unif); - } - - //modulation uniformity in function of luminance - float L98[11] = {0.001f, 0.0015f, 0.002f, 0.004f, 0.006f, 0.008f, 0.01f, 0.03f, 0.05f, 0.1f, 0.1f}; - float L95[11] = {0.0012f, 0.002f, 0.005f, 0.01f, 0.02f, 0.05f, 0.1f, 0.12f, 0.15f, 0.2f, 0.25f}; - float L92[11] = {0.01f, 0.015f, 0.02f, 0.06f, 0.10f, 0.13f, 0.17f, 0.25f, 0.3f, 0.32f, 0.35f}; - float L90[11] = {0.015f, 0.02f, 0.04f, 0.08f, 0.12f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; - float L87[11] = {0.025f, 0.03f, 0.05f, 0.1f, 0.15f, 0.25f, 0.3f, 0.4f, 0.5f, 0.63f, 0.75f}; - float L83[11] = {0.055f, 0.08f, 0.1f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.75f, 0.85f}; - float L80[11] = {0.15f, 0.2f, 0.25f, 0.3f, 0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; - float L75[11] = {0.22f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 0.95f}; - float L70[11] = {0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.97f, 1.0f, 1.0f, 1.0f, 1.0f}; - float L63[11] = {0.55f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - float L58[11] = {0.75f, 0.77f, 0.8f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - //default 5 - //modulation contrast - float Cont0[11] = {0.05f, 0.1f, 0.2f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; - float Cont1[11] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 0.95f, 1.0f}; - float Cont2[11] = {0.2f, 0.40f, 0.6f, 0.7f, 0.8f, 0.85f, 0.90f, 0.95f, 1.0f, 1.05f, 1.10f}; - float Cont3[11] = {0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.05f, 1.10f, 1.20f}; - float Cont4[11] = {0.8f, 0.85f, 0.9f, 0.95f, 1.0f, 1.05f, 1.10f, 1.150f, 1.2f, 1.25f, 1.40f}; - float Cont5[11] = {1.0f, 1.1f, 1.2f, 1.25f, 1.3f, 1.4f, 1.45f, 1.50f, 1.6f, 1.65f, 1.80f}; - - float chmax = 8.0f; - LM = new float[width * height]; //allocation for Luminance -#ifdef _OPENMP - #pragma omp parallel for private(offset, i,j) shared(LM) -#endif - - for(j = 0; j < height; j++) - for(i = 0, offset = j * width + i; i < width; i++, offset++) { - LM[offset] = lab->L[j][i] / 327.68f; // adjust to 0.100 and to RT variables - } - -#ifdef _OPENMP - #pragma omp parallel for private(j,i,offset,s,signs,v,n,row,col,offset2,contrast,temp,temp2,temp3,tempL,temp4) shared(lab,LM,amount,chmax,unif,k,L98,L95,L92,L90,L87,L83,L80,L75,L70,L63,L58,Cont0,Cont1,Cont2,Cont3,Cont4,Cont5) -#endif - - for(j = k; j < height - k; j++) - for(i = k, offset = j * width + i; i < width - k; i++, offset++) { - s = amount; - v = LM[offset]; - n = 0; - - for(row = j - k; row <= j + k; row++) - for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { - signs[n] = 0; - - if (v < LM[offset2]) { - signs[n] = -1; - } - - if (v > LM[offset2]) { - signs[n] = 1; - } - - n++; - } - - if (k == 1) { - contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width])) / chmax; //for 3x3 - } else /* if (k==2) */ contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width]) - + fabs(LM[offset + 2] - LM[offset - 2]) * fabs(LM[offset + 2] - LM[offset - 2]) + fabs(LM[offset + 2 * width] - LM[offset - 2 * width]) * fabs(LM[offset + 2 * width] - LM[offset - 2 * width])) / (2 * chmax); //for 5x5 - - if (contrast > 1.0f) { - contrast = 1.0f; - } - - //matrix 5x5 - temp = lab->L[j][i] / 327.68f; //begin 3x3 - temp += CLIREF(v - LM[offset - width - 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset - width]) * s; - temp += CLIREF(v - LM[offset - width + 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset - 1]) * s; - temp += CLIREF(v - LM[offset + 1]) * s; - temp += CLIREF(v - LM[offset + width - 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset + width]) * s; - temp += CLIREF(v - LM[offset + width + 1]) * sqrtf(2.0f) * s; //end 3x3 - - // add JD continue 5x5 - if (k == 2) { - temp += 2.0f * CLIREF(v - LM[offset + 2 * width]) * s; - temp += 2.0f * CLIREF(v - LM[offset - 2 * width]) * s; - temp += 2.0f * CLIREF(v - LM[offset - 2 ]) * s; - temp += 2.0f * CLIREF(v - LM[offset + 2 ]) * s; - - temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 1]) * s * sqrtf(1.25f); // 1.25 = 1*1 + 0.5*0.5 - temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset + width + 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset + width - 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset - width + 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - width - 2]) * s * sqrtf(1.25f); - } - - if (temp < 0.0f) { - temp = 0.0f; - } - - v = temp; - - n = 0; - - for(row = j - k; row <= j + k; row++) { - for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { - if (((v < LM[offset2]) && (signs[n] > 0)) || ((v > LM[offset2]) && (signs[n] < 0))) { - temp = v * 0.75f + LM[offset2] * 0.25f; // 0.75 0.25 - } - - n++; - } - } - - if (LM[offset] > 95.0f || LM[offset] < 5.0f) { - contrast *= Cont0[unif]; //+ JD : luminance pyramid to adjust contrast by evaluation of LM[offset] - } else if (LM[offset] > 90.0f || LM[offset] < 10.0f) { - contrast *= Cont1[unif]; - } else if (LM[offset] > 80.0f || LM[offset] < 20.0f) { - contrast *= Cont2[unif]; - } else if (LM[offset] > 70.0f || LM[offset] < 30.0f) { - contrast *= Cont3[unif]; - } else if (LM[offset] > 60.0f || LM[offset] < 40.0f) { - contrast *= Cont4[unif]; - } else { - contrast *= Cont5[unif]; //(2.0f/k)*Cont5[unif]; - } - - if (contrast > 1.0f) { - contrast = 1.0f; - } - - tempL = 327.68f * (temp * (1.0f - contrast) + LM[offset] * contrast); - // JD: modulation of microcontrast in function of original Luminance and modulation of luminance - temp2 = tempL / (327.68f * LM[offset]); //for highlights - - if (temp2 > 1.0f) { - if (temp2 > 1.70f) { - temp2 = 1.70f; //limit action - } - - if (LM[offset] > 98.0f) { - lab->L[j][i] = LM[offset] * 327.68f; - } else if (LM[offset] > 95.0f) { - temp3 = temp2 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 92.0f) { - temp3 = temp2 - 1.0f; - temp = (L92[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 90.0f) { - temp3 = temp2 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 87.0f) { - temp3 = temp2 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 83.0f) { - temp3 = temp2 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 80.0f) { - temp3 = temp2 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 75.0f) { - temp3 = temp2 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 70.0f) { - temp3 = temp2 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 63.0f) { - temp3 = temp2 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 58.0f) { - temp3 = temp2 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 42.0f) { - temp3 = temp2 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 37.0f) { - temp3 = temp2 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 30.0f) { - temp3 = temp2 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 25.0f) { - temp3 = temp2 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 20.0f) { - temp3 = temp2 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 17.0f) { - temp3 = temp2 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 13.0f) { - temp3 = temp2 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 10.0f) { - temp3 = temp2 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 5.0f) { - temp3 = temp2 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - lab->L[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 0.0f) { - lab->L[j][i] = LM[offset] * 327.68f; - } - } - - temp4 = (327.68f * LM[offset]) / tempL; // - - if (temp4 > 1.0f) { - if (temp4 > 1.7f) { - temp4 = 1.7f; //limit action - } - - if (LM[offset] < 2.0f) { - temp3 = temp4 - 1.0f; - temp = (L98[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 5.0f) { - temp3 = temp4 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 8.0f) { - temp3 = temp4 - 1.0f; - temp = (L92[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 10.0f) { - temp3 = temp4 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 13.0f) { - temp3 = temp4 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 17.0f) { - temp3 = temp4 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 20.0f) { - temp3 = temp4 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 25.0f) { - temp3 = temp4 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 30.0f) { - temp3 = temp4 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 37.0f) { - temp3 = temp4 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 42.0f) { - temp3 = temp4 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 58.0f) { - temp3 = temp4 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 63.0f) { - temp3 = temp4 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 70.0f) { - temp3 = temp4 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 75.0f) { - temp3 = temp4 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 80.0f) { - temp3 = temp4 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 83.0f) { - temp3 = temp4 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 87.0f) { - temp3 = temp4 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 90.0f) { - temp3 = temp4 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 95.0f) { - temp3 = temp4 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - lab->L[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 100.0f) { - lab->L[j][i] = LM[offset] * 327.68f; - } - } - - } - - delete [] LM; - t2e.set(); - - if (settings->verbose) { - printf("Micro-contrast %d usec\n", t2e.etime(t1e)); - } - + MLmicrocontrast(lab->L, lab->W, lab->H); } -//! MicroContrast is a sharpening method developed by Manuel Llorens and documented here: http://www.rawness.es/sharpening/?lang=en -//!
The purpose is maximize clarity of the image without creating halo's. -//!
Addition from JD : pyramid + pondered contrast with matrix 5x5 -//! \param ncie CieImage Image in the CIECAM02 colour space void ImProcFunctions::MLmicrocontrastcam(CieImage* ncie) { - if (params->sharpenMicro.enabled == false) { - return; - } - - MyTime t1e, t2e; - t1e.set(); - int k; - - if (params->sharpenMicro.matrix == false) { - k = 2; - } else { - k = 1; - } - - // k=2 matrix 5x5 k=1 matrix 3x3 - int offset, offset2, i, j, col, row, n; - float temp, temp2, temp3, temp4, tempL; - float *LM, v, s, contrast; - int signs[25]; - int width = ncie->W, height = ncie->H; - float uniform = params->sharpenMicro.uniformity;//between 0 to 100 - int unif; - unif = (int)(uniform / 10.0f); //put unif between 0 to 10 - float amount = params->sharpenMicro.amount / 1500.0f; //amount 2000.0 quasi no artefacts ==> 1500 = maximum, after artefacts - - if (amount < 0.000001f) { - return; - } - - if (k == 1) { - amount *= 2.7f; //25/9 if 3x3 - } - - if (settings->verbose) { - printf ("Micro-contrast amount %f\n", amount); - } - - if (settings->verbose) { - printf ("Micro-contrast uniformity %i\n", unif); - } - - //modulation uniformity in function of luminance - float L98[11] = {0.001f, 0.0015f, 0.002f, 0.004f, 0.006f, 0.008f, 0.01f, 0.03f, 0.05f, 0.1f, 0.1f}; - float L95[11] = {0.0012f, 0.002f, 0.005f, 0.01f, 0.02f, 0.05f, 0.1f, 0.12f, 0.15f, 0.2f, 0.25f}; - float L92[11] = {0.01f, 0.015f, 0.02f, 0.06f, 0.10f, 0.13f, 0.17f, 0.25f, 0.3f, 0.32f, 0.35f}; - float L90[11] = {0.015f, 0.02f, 0.04f, 0.08f, 0.12f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; - float L87[11] = {0.025f, 0.03f, 0.05f, 0.1f, 0.15f, 0.25f, 0.3f, 0.4f, 0.5f, 0.63f, 0.75f}; - float L83[11] = {0.055f, 0.08f, 0.1f, 0.15f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.75f, 0.85f}; - float L80[11] = {0.15f, 0.2f, 0.25f, 0.3f, 0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; - float L75[11] = {0.22f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 0.95f}; - float L70[11] = {0.35f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.97f, 1.0f, 1.0f, 1.0f, 1.0f}; - float L63[11] = {0.55f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - float L58[11] = {0.75f, 0.77f, 0.8f, 0.9f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - //default 5 - //modulation contrast - float Cont0[11] = {0.05f, 0.1f, 0.2f, 0.25f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f}; - float Cont1[11] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 0.95f, 1.0f}; - float Cont2[11] = {0.2f, 0.40f, 0.6f, 0.7f, 0.8f, 0.85f, 0.90f, 0.95f, 1.0f, 1.05f, 1.10f}; - float Cont3[11] = {0.5f, 0.6f, 0.7f, 0.8f, 0.85f, 0.9f, 1.0f, 1.0f, 1.05f, 1.10f, 1.20f}; - float Cont4[11] = {0.8f, 0.85f, 0.9f, 0.95f, 1.0f, 1.05f, 1.10f, 1.150f, 1.2f, 1.25f, 1.40f}; - float Cont5[11] = {1.0f, 1.1f, 1.2f, 1.25f, 1.3f, 1.4f, 1.45f, 1.50f, 1.6f, 1.65f, 1.80f}; - - float chmax = 8.0f; - LM = new float[width * height]; //allocation for Luminance -#ifdef _OPENMP - #pragma omp parallel for private(offset, i,j) shared(LM) -#endif - - for(j = 0; j < height; j++) - for(i = 0, offset = j * width + i; i < width; i++, offset++) { - LM[offset] = ncie->sh_p[j][i] / 327.68f; // adjust to 0.100 and to RT variables - } - -#ifdef _OPENMP - #pragma omp parallel for private(j,i,offset,s,signs,v,n,row,col,offset2,contrast,temp,temp2,temp3,tempL,temp4) shared(ncie,LM,amount,chmax,unif,k,L98,L95,L92,L90,L87,L83,L80,L75,L70,L63,L58,Cont0,Cont1,Cont2,Cont3,Cont4,Cont5) -#endif - - for(j = k; j < height - k; j++) - for(i = k, offset = j * width + i; i < width - k; i++, offset++) { - s = amount; - v = LM[offset]; - n = 0; - - for(row = j - k; row <= j + k; row++) - for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { - signs[n] = 0; - - if (v < LM[offset2]) { - signs[n] = -1; - } - - if (v > LM[offset2]) { - signs[n] = 1; - } - - n++; - } - - if (k == 1) { - contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width])) / chmax; //for 3x3 - } else /* if (k==2) */ contrast = sqrt(fabs(LM[offset + 1] - LM[offset - 1]) * fabs(LM[offset + 1] - LM[offset - 1]) + fabs(LM[offset + width] - LM[offset - width]) * fabs(LM[offset + width] - LM[offset - width]) - + fabs(LM[offset + 2] - LM[offset - 2]) * fabs(LM[offset + 2] - LM[offset - 2]) + fabs(LM[offset + 2 * width] - LM[offset - 2 * width]) * fabs(LM[offset + 2 * width] - LM[offset - 2 * width])) / (2 * chmax); //for 5x5 - - if (contrast > 1.0f) { - contrast = 1.0f; - } - - //matrix 5x5 - temp = ncie->sh_p[j][i] / 327.68f; //begin 3x3 - temp += CLIREF(v - LM[offset - width - 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset - width]) * s; - temp += CLIREF(v - LM[offset - width + 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset - 1]) * s; - temp += CLIREF(v - LM[offset + 1]) * s; - temp += CLIREF(v - LM[offset + width - 1]) * sqrtf(2.0f) * s; - temp += CLIREF(v - LM[offset + width]) * s; - temp += CLIREF(v - LM[offset + width + 1]) * sqrtf(2.0f) * s; //end 3x3 - - // add JD continue 5x5 - if (k == 2) { - temp += 2.0f * CLIREF(v - LM[offset + 2 * width]) * s; - temp += 2.0f * CLIREF(v - LM[offset - 2 * width]) * s; - temp += 2.0f * CLIREF(v - LM[offset - 2 ]) * s; - temp += 2.0f * CLIREF(v - LM[offset + 2 ]) * s; - - temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 1]) * s * sqrtf(1.25f); // 1.25 = 1*1 + 0.5*0.5 - temp += 2.0f * CLIREF(v - LM[offset + 2 * width - 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset + 2 * width + 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset + width + 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset + width - 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width - 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 1]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - 2 * width + 2]) * s * sqrtf(2.00f); - temp += 2.0f * CLIREF(v - LM[offset - width + 2]) * s * sqrtf(1.25f); - temp += 2.0f * CLIREF(v - LM[offset - width - 2]) * s * sqrtf(1.25f); - } - - if (temp < 0.0f) { - temp = 0.0f; - } - - v = temp; - - n = 0; - - for(row = j - k; row <= j + k; row++) { - for(col = i - k, offset2 = row * width + col; col <= i + k; col++, offset2++) { - if (((v < LM[offset2]) && (signs[n] > 0)) || ((v > LM[offset2]) && (signs[n] < 0))) { - temp = v * 0.75f + LM[offset2] * 0.25f; // 0.75 0.25 - } - - n++; - } - } - - if (LM[offset] > 95.0f || LM[offset] < 5.0f) { - contrast *= Cont0[unif]; //+ JD : luminance pyramid to adjust contrast by evaluation of LM[offset] - } else if (LM[offset] > 90.0f || LM[offset] < 10.0f) { - contrast *= Cont1[unif]; - } else if (LM[offset] > 80.0f || LM[offset] < 20.0f) { - contrast *= Cont2[unif]; - } else if (LM[offset] > 70.0f || LM[offset] < 30.0f) { - contrast *= Cont3[unif]; - } else if (LM[offset] > 60.0f || LM[offset] < 40.0f) { - contrast *= Cont4[unif]; - } else { - contrast *= Cont5[unif]; //(2.0f/k)*Cont5[unif]; - } - - if (contrast > 1.0f) { - contrast = 1.0f; - } - - tempL = 327.68f * (temp * (1.0f - contrast) + LM[offset] * contrast); - // JD: modulation of microcontrast in function of original Luminance and modulation of luminance - temp2 = tempL / (327.68f * LM[offset]); //for highlights - - if (temp2 > 1.0f) { - if (temp2 > 1.70f) { - temp2 = 1.70f; //limit action - } - - if (LM[offset] > 98.0f) { - ncie->sh_p[j][i] = LM[offset] * 327.68f; - } else if (LM[offset] > 95.0f) { - temp3 = temp2 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 92.0f) { - temp3 = temp2 - 1.0f; - temp = (L92[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 90.0f) { - temp3 = temp2 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 87.0f) { - temp3 = temp2 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 83.0f) { - temp3 = temp2 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 80.0f) { - temp3 = temp2 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 75.0f) { - temp3 = temp2 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 70.0f) { - temp3 = temp2 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 63.0f) { - temp3 = temp2 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 58.0f) { - temp3 = temp2 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 42.0f) { - temp3 = temp2 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 37.0f) { - temp3 = temp2 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 30.0f) { - temp3 = temp2 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 25.0f) { - temp3 = temp2 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 20.0f) { - temp3 = temp2 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 17.0f) { - temp3 = temp2 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 13.0f) { - temp3 = temp2 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 10.0f) { - temp3 = temp2 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 5.0f) { - temp3 = temp2 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = temp * LM[offset] * 327.68f; - } else if (LM[offset] > 0.0f) { - ncie->sh_p[j][i] = LM[offset] * 327.68f; - } - } - - temp4 = (327.68f * LM[offset]) / tempL; // - - if (temp4 > 1.0f) { - if (temp4 > 1.7f) { - temp4 = 1.7f; //limit action - } - - if (LM[offset] < 2.0f) { - temp3 = temp4 - 1.0f; - temp = (L98[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 5.0f) { - temp3 = temp4 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 8.0f) { - temp3 = temp4 - 1.0f; - temp = (L92[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 10.0f) { - temp3 = temp4 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 13.0f) { - temp3 = temp4 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 17.0f) { - temp3 = temp4 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 20.0f) { - temp3 = temp4 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 25.0f) { - temp3 = temp4 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 30.0f) { - temp3 = temp4 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 37.0f) { - temp3 = temp4 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 42.0f) { - temp3 = temp4 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 58.0f) { - temp3 = temp4 - 1.0f; - temp = (L58[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 63.0f) { - temp3 = temp4 - 1.0f; - temp = (L63[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 70.0f) { - temp3 = temp4 - 1.0f; - temp = (L70[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 75.0f) { - temp3 = temp4 - 1.0f; - temp = (L75[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 80.0f) { - temp3 = temp4 - 1.0f; - temp = (L80[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 83.0f) { - temp3 = temp4 - 1.0f; - temp = (L83[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 87.0f) { - temp3 = temp4 - 1.0f; - temp = (L87[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 90.0f) { - temp3 = temp4 - 1.0f; - temp = (L90[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 95.0f) { - temp3 = temp4 - 1.0f; - temp = (L95[unif] * temp3) + 1.0f; - ncie->sh_p[j][i] = (LM[offset] * 327.68f) / temp; - } else if (LM[offset] < 100.0f) { - ncie->sh_p[j][i] = LM[offset] * 327.68f; - } - } - - } - - delete [] LM; - t2e.set(); - - if (settings->verbose) { - printf("Micro-contrast %d usec\n", t2e.etime(t1e)); - } - -} - -void ImProcFunctions::deconvsharpeningcam (CieImage* ncie, float** b2) -{ - - if (params->sharpening.enabled == false || params->sharpening.deconvamount < 1) { - return; - } - - int W = ncie->W, H = ncie->H; - - float** tmpI = new float*[H]; - - for (int i = 0; i < H; i++) { - tmpI[i] = new float[W]; - - for (int j = 0; j < W; j++) { - tmpI[i][j] = (float)ncie->sh_p[i][j]; - } - } - - float** tmp = (float**)b2; - -#ifdef _OPENMP - #pragma omp parallel -#endif - { - float damping = params->sharpening.deconvdamping / 5.0; - bool needdamp = params->sharpening.deconvdamping > 0; - - for (int k = 0; k < params->sharpening.deconviter; k++) { - - // apply blur function (gaussian blur) - gaussianBlur (tmpI, tmp, W, H, params->sharpening.deconvradius / scale); - - if (!needdamp) { -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) - if (tmp[i][j] > 0) { - tmp[i][j] = (float)ncie->sh_p[i][j] / tmp[i][j]; - } - } else { - dcdamping (tmp, ncie->sh_p, damping, W, H); - } - - gaussianBlur (tmp, tmp, W, H, params->sharpening.deconvradius / scale); - - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) { - tmpI[i][j] = tmpI[i][j] * tmp[i][j]; - } - } // end for - -// float p2 = params->sharpening.deconvamount / 100.0; - float p2 = params->sharpening.deconvamount / 100.0; - float p1 = 1.0 - p2; - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 0; i < H; i++) - for (int j = 0; j < W; j++) - if(ncie->J_p[i][j] > 8.0f && ncie->J_p[i][j] < 92.0f) { - ncie->sh_p[i][j] = ncie->sh_p[i][j] * p1 + max(tmpI[i][j], 0.0f) * p2; - } - - } // end parallel - - for (int i = 0; i < H; i++) { - delete [] tmpI[i]; - } - - delete [] tmpI; - + MLmicrocontrast(ncie->sh_p, ncie->W, ncie->H); } void ImProcFunctions::sharpeningcam (CieImage* ncie, float** b2) { + if ((!params->sharpening.enabled) || params->sharpening.amount < 1 || ncie->W < 8 || ncie->H < 8) { + return; + } if (params->sharpening.method == "rld") { - deconvsharpeningcam (ncie, b2); + deconvsharpening (ncie->sh_p, b2, ncie->W, ncie->H, params->sharpening); return; } // Rest is UNSHARP MASK - if (params->sharpening.enabled == false || params->sharpening.amount < 1 || ncie->W < 8 || ncie->H < 8) { - return; - } int W = ncie->W, H = ncie->H; float** b3; @@ -1479,10 +965,10 @@ void ImProcFunctions::sharpeningcam (CieImage* ncie, float** b2) { if (params->sharpening.edgesonly == false) { - gaussianBlur (ncie->sh_p, b2, W, H, params->sharpening.radius / scale); + gaussianBlur (ncie->sh_p, b2, W, H, params->sharpening.radius / scale); } else { bilateral (ncie->sh_p, (float**)b3, b2, W, H, params->sharpening.edges_radius / scale, params->sharpening.edges_tolerance, multiThread); - gaussianBlur (b3, b2, W, H, params->sharpening.radius / scale); + gaussianBlur (b3, b2, W, H, params->sharpening.radius / scale); } float** base = ncie->sh_p; @@ -1524,7 +1010,7 @@ void ImProcFunctions::sharpeningcam (CieImage* ncie, float** b2) base = ncieCopy; } - sharpenHaloCtrlcam (ncie, b2, base, W, H); + sharpenHaloCtrl (ncie->sh_p, b2, base, W, H, params->sharpening); } } // end parallel @@ -1547,67 +1033,4 @@ void ImProcFunctions::sharpeningcam (CieImage* ncie, float** b2) } } -void ImProcFunctions::sharpenHaloCtrlcam (CieImage* ncie, float** blurmap, float** base, int W, int H) -{ - - float scale = (100.f - params->sharpening.halocontrol_amount) * 0.01f; - float sharpFac = params->sharpening.amount * 0.01f; - float** nL = base; - -#ifdef _OPENMP - #pragma omp for -#endif - - for (int i = 2; i < H - 2; i++) { - float max1 = 0, max2 = 0, min1 = 0, min2 = 0, maxn, minn, np1, np2, np3, min_, max_, labL; - - for (int j = 2; j < W - 2; j++) { - // compute 3 iterations, only forward - np1 = 2.f * (nL[i - 2][j] + nL[i - 2][j + 1] + nL[i - 2][j + 2] + nL[i - 1][j] + nL[i - 1][j + 1] + nL[i - 1][j + 2] + nL[i] [j] + nL[i] [j + 1] + nL[i] [j + 2]) / 27.f + nL[i - 1][j + 1] / 3.f; - np2 = 2.f * (nL[i - 1][j] + nL[i - 1][j + 1] + nL[i - 1][j + 2] + nL[i] [j] + nL[i] [j + 1] + nL[i] [j + 2] + nL[i + 1][j] + nL[i + 1][j + 1] + nL[i + 1][j + 2]) / 27.f + nL[i] [j + 1] / 3.f; - np3 = 2.f * (nL[i] [j] + nL[i] [j + 1] + nL[i] [j + 2] + nL[i + 1][j] + nL[i + 1][j + 1] + nL[i + 1][j + 2] + nL[i + 2][j] + nL[i + 2][j + 1] + nL[i + 2][j + 2]) / 27.f + nL[i + 1][j + 1] / 3.f; - - // Max/Min of all these deltas and the last two max/min - maxn = max(np1, np2, np3); - minn = min(np1, np2, np3); - max_ = max(max1, max2, maxn); - min_ = min(min1, min2, minn); - - // Shift the queue - max1 = max2; - max2 = maxn; - min1 = min2; - min2 = minn; - labL = ncie->sh_p[i][j]; - - if (max_ < labL) { - max_ = labL; - } - - if (min_ > labL) { - min_ = labL; - } - - // deviation from the environment as measurement - float diff = nL[i][j] - blurmap[i][j]; - - const float upperBound = 2000.f; // WARNING: Duplicated value, it's baaaaaad ! - float delta = params->sharpening.threshold.multiply( - min(ABS(diff), upperBound), // X axis value = absolute value of the difference - sharpFac * diff // Y axis max value = sharpening.amount * signed difference - ); - float newL = labL + delta; - - // applying halo control - if (newL > max_) { - newL = max_ + (newL - max_) * scale; - } else if (newL < min_) { - newL = min_ - (min_ - newL) * scale; - } - - ncie->sh_p[i][j] = newL; - } - } -} - } diff --git a/rtengine/shmap.cc b/rtengine/shmap.cc index 2471b17a4..4f93c80d3 100644 --- a/rtengine/shmap.cc +++ b/rtengine/shmap.cc @@ -88,7 +88,7 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int #pragma omp parallel #endif { - gaussianBlur (map, map, W, H, radius); + gaussianBlur (map, map, W, H, radius); } } @@ -233,7 +233,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip) #pragma omp parallel #endif { - gaussianBlur (map, map, W, H, radius); + gaussianBlur (map, map, W, H, radius); } } @@ -244,7 +244,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip) //experimental dirpyr shmap float thresh = (100.f * radius); //1000; int levrad = 16; - levrad=2;//for retinex - otherwise levrad = 16 + levrad = 2; //for retinex - otherwise levrad = 16 // set up range function // calculate size of Lookup table. That's possible because from a value k for all i>=k rangefn[i] will be exp(-10) // So we use this fact and the automatic clip of lut to reduce the size of lut and the number of calculations to fill the lut @@ -253,6 +253,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip) const int lutSize = (int) thresh * sqrtf(10.f) + 1; thresh *= thresh; LUTf rangefn(lutSize); + for (int i = 0; i < lutSize - 1; i++) { rangefn[i] = xexpf(-min(10.f, (static_cast(i) * i) / thresh )); //*intfactor; } @@ -275,6 +276,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip) scale *= 2; numLevels++; } + //printf("numlev=%d\n",numLevels); float ** dirpyrlo[2]; diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index c2d1278f4..453025fd6 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -901,11 +901,7 @@ static INLINE vdouble xlog1p(vdouble a) { typedef struct { vfloat x, y; } vfloat2; -#if defined( __FMA__ ) && defined( __x86_64__ ) - static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); } -#else - static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); } -#endif + static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); } static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); } @@ -921,6 +917,18 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca } #endif +static INLINE vfloat vselfzero(vmask mask, vfloat x) { + // returns value of x if corresponding mask bits are 1, else returns 0 + // faster than vself(mask, x, ZEROV) + return _mm_and_ps((vfloat)mask, x); +} +static INLINE vfloat vselfnotzero(vmask mask, vfloat x) { + // returns value of x if corresponding mask bits are 0, else returns 0 + // faster than vself(mask, ZEROV, x) + return _mm_andnot_ps((vfloat)mask, x); +} + + static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) { vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1)); return vori2(vandi2(m2, x), vandnoti2(m2, y)); @@ -1171,7 +1179,7 @@ static INLINE vfloat xatan2f(vfloat y, vfloat x) { r = vmulsignf(r, x); r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/2)), x))), r); r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/4)), x))), r); - r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vself(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)M_PI), vcast_vf_f(0.0f)), r); + r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)M_PI)), r); return vself(vorm(vmaskf_isnan(x), vmaskf_isnan(y)), vcast_vf_f(NANf), vmulsignf(r, y)); } @@ -1304,7 +1312,7 @@ static INLINE vfloat xcbrtf(vfloat d) { } static INLINE vfloat LIMV( vfloat a, vfloat b, vfloat c ) { -return _mm_max_ps( b, _mm_min_ps(a,c)); +return vmaxf( b, vminf(a,c)); } static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){ @@ -1312,13 +1320,13 @@ static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){ } static INLINE vfloat SQRV(vfloat a){ - return _mm_mul_ps( a,a ); + return a * a; } static inline void vswap( vmask condition, vfloat &a, vfloat &b) { - vfloat temp = vself(condition, a, b); // the larger of the two - condition = vnotm(condition); // invert the mask - a = vself(condition, a, b); // the smaller of the two + vfloat temp = vself(condition, a, b); // the values which fit to condition + condition = vnotm(condition); // invert the condition + a = vself(condition, a, b); // the values which fit to inverted condition b = temp; } diff --git a/rtgui/sharpening.cc b/rtgui/sharpening.cc index a36b1aa07..09207a883 100644 --- a/rtgui/sharpening.cc +++ b/rtgui/sharpening.cc @@ -41,7 +41,7 @@ Sharpening::Sharpening () : FoldableToolPanel(this, "sharpening", M("TP_SHARPENI pack_start (*hb); rld = new Gtk::VBox (); - dradius = Gtk::manage (new Adjuster (M("TP_SHARPENING_EDRADIUS"), 0.5, 2.5, 0.01, 0.75)); + dradius = Gtk::manage (new Adjuster (M("TP_SHARPENING_EDRADIUS"), 0.4, 2.5, 0.01, 0.75)); damount = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_AMOUNT"), 0.0, 100, 1, 75)); ddamping = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_DAMPING"), 0, 100, 1, 20)); diter = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_ITERATIONS"), 5, 100, 1, 30));