Cleaned gauss code and included some speedups

This commit is contained in:
heckflosse 2016-01-18 23:56:02 +01:00
parent 33ea7156b8
commit a3c20daa46
12 changed files with 2117 additions and 2177 deletions

View File

@ -8,7 +8,7 @@ link_directories ("${PROJECT_SOURCE_DIR}/rtexif" ${EXTRA_LIBDIR} ${GTHREAD_LIBRA
set (CAMCONSTSFILE "camconst.json")
set (RTENGINESOURCEFILES safegtk.cc colortemp.cc curves.cc flatcurves.cc diagonalcurves.cc dcraw.cc iccstore.cc color.cc
dfmanager.cc ffmanager.cc rawimage.cc image8.cc image16.cc imagefloat.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc
dfmanager.cc ffmanager.cc gauss.cc rawimage.cc image8.cc image16.cc imagefloat.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc
loadinitial.cc procparams.cc rawimagesource.cc demosaic_algos.cc shmap.cc simpleprocess.cc refreshmap.cc
fast_demo.cc amaze_demosaic_RT.cc CA_correct_RT.cc cfa_linedn_RT.cc green_equil_RT.cc hilite_recon.cc expo_before_b.cc
stdimagesource.cc myfile.cc iccjpeg.cc hlmultipliers.cc improccoordinator.cc editbuffer.cc coord.cc

View File

@ -65,40 +65,12 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#pragma omp parallel
#endif
{
gaussianBlur<float> (src->a, tmp1->a, src->W, src->H, radius);
gaussianBlur<float> (src->b, tmp1->b, src->W, src->H, radius);
gaussianBlur (src->a, tmp1->a, src->W, src->H, radius);
gaussianBlur (src->b, tmp1->b, src->W, src->H, radius);
}
float chromave = 0.0f;
#ifdef __SSE2__
if( chCurve ) {
// vectorized precalculation of the atan2 values
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ )
{
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(src->b[i][j]), LVFU(src->a[i][j])));
}
for(; j < width; j++) {
fringe[i * width + j] = xatan2f(src->b[i][j], src->a[i][j]);
}
}
}
}
#endif
#ifdef _OPENMP
#pragma omp parallel
#endif
@ -109,6 +81,23 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#endif
for(int i = 0; i < height; i++ ) {
#ifdef __SSE2__
// vectorized per row precalculation of the atan2 values
if (chCurve) {
int k = 0;
for(; k < width - 3; k += 4) {
STVFU(fringe[i * width + k], xatan2f(LVFU(src->b[i][k]), LVFU(src->a[i][k])));
}
for(; k < width; k++) {
fringe[i * width + k] = xatan2f(src->b[i][k], src->a[i][k]);
}
}
#endif // __SSE2__
for(int j = 0; j < width; j++) {
if (chCurve) {
#ifdef __SSE2__
@ -144,19 +133,21 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chromave );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#pragma omp for nowait
#endif
for(int j = 0; j < width * height - 3; j += 4) {
_mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv));
STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv));
}
}
for(int j = width * height - (width * height) % 4; j < width * height; j++) {
fringe[j] = 1.f / (fringe[j] + chromave);
#pragma omp single
for(int j = width * height - (width * height) % 4; j < width * height; j++) {
fringe[j] = 1.f / (fringe[j] + chromave);
}
}
#else
@ -191,8 +182,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@ -218,8 +207,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@ -245,8 +232,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@ -355,7 +340,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@ -366,8 +351,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
for (j = 0; j < width - 3; j += 4) {
sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j < width; j++) {
@ -392,8 +377,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#pragma omp parallel
#endif
{
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
float chromave = 0.0f;
@ -414,7 +399,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
for(int i = 0; i < height; i++ )
{
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j])));
STVFU(fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j])));
}
for(; j < width; j++) {
@ -470,14 +455,14 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chromave + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int j = 0; j < width * height - 3; j += 4) {
_mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv));
STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv));
}
}
@ -599,7 +584,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
int j;
__m128 interav, interbv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif
#ifdef _OPENMP
#pragma omp for
@ -609,11 +594,11 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
STVFU(dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
interav = LVFU(tmaa[i][j]);
interbv = LVFU(tmbb[i][j]);
_mm_storeu_ps(&dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
_mm_storeu_ps(&dst->C_p[i][j], _mm_sqrt_ps(SQRV(interbv) + SQRV(interav)));
STVFU(dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
STVFU(dst->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav)));
}
for(; j < width; j++) {
@ -730,7 +715,7 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@ -741,8 +726,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
for (j = 0; j < width - 3; j += 4) {
sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j < width; j++) {
@ -769,12 +754,12 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
{
//chroma a and b
if(mode == 2) { //choice of gaussian blur
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
//luma sh_p
gaussianBlur<float> (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
gaussianBlur (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
}
if(mode == 1) { //choice of median
@ -859,8 +844,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
int j;
#ifdef __SSE2__
__m128 shfabsv, shmedv;
__m128 shthrv = _mm_set1_ps(shthr);
__m128 onev = _mm_set1_ps(1.0f);
__m128 shthrv = F2V(shthr);
__m128 onev = F2V(1.0f);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for private(shfabs, shmed,i1,j1)
@ -883,14 +868,14 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
for (; j < width - 5; j += 4) {
shfabsv = vabsf(LVFU(src->sh_p[i][j]) - LVFU(tmL[i][j]));
shmedv = _mm_setzero_ps();
shmedv = ZEROV;
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
shmedv += vabsf(LVFU(src->sh_p[i1][j1]) - LVFU(tmL[i1][j1]));
}
_mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV));
}
for (; j < width - 2; j++) {
@ -1082,15 +1067,15 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
#endif
{
int j;
__m128 sumv = _mm_set1_ps( chrommed + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chrommed + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++) {
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
}
for(; j < width; j++) {
@ -1341,7 +1326,7 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#ifdef __SSE2__
int j;
// vfloat2 sincosvalv;
// __m128 piidv = _mm_set1_ps(piid);
// __m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@ -1351,8 +1336,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#ifdef __SSE2__
for (j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&sraa[i][j], LVFU(src->a[i][j]));
_mm_storeu_ps(&srbb[i][j], LVFU(src->b[i][j]));
STVFU(sraa[i][j], LVFU(src->a[i][j]));
STVFU(srbb[i][j], LVFU(src->b[i][j]));
}
for (; j < width; j++) {
@ -1377,12 +1362,12 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
{
//chroma a and b
if(mode >= 2) { //choice of gaussian blur
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
//luma sh_p
gaussianBlur<float> (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
gaussianBlur (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
}
if(mode == 1) { //choice of median
@ -1467,8 +1452,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
int j;
#ifdef __SSE2__
__m128 shfabsv, shmedv;
__m128 shthrv = _mm_set1_ps(shthr);
__m128 onev = _mm_set1_ps(1.0f);
__m128 shthrv = F2V(shthr);
__m128 onev = F2V(1.0f);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for private(shfabs, shmed,i1,j1)
@ -1491,14 +1476,14 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
for (; j < width - 5; j += 4) {
shfabsv = vabsf(LVFU(src->L[i][j]) - LVFU(tmL[i][j]));
shmedv = _mm_setzero_ps();
shmedv = ZEROV;
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
shmedv += vabsf(LVFU(src->L[i1][j1]) - LVFU(tmL[i1][j1]));
}
_mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV));
}
for (; j < width - 2; j++) {
@ -1690,15 +1675,15 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#endif
{
int j;
__m128 sumv = _mm_set1_ps( chrommed + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chrommed + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++) {
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
}
for(; j < width; j++) {

1301
rtengine/gauss.cc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -19,797 +19,8 @@
#ifndef _GAUSS_H_
#define _GAUSS_H_
#include <cstdlib>
#include <cstring>
#include <cmath>
#include "opthelper.h"
#include "stdio.h"
#include "boxblur.h"
// classical filtering if the support window is small:
enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV};
template<class T> void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1)
{
void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr);
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
T temp[W] ALIGNED16;
for (int j = 1; j < W - 1; j++) {
temp[j] = (T)(c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j]);
}
dst[i][0] = src[i][0];
memcpy (dst[i] + 1, temp + 1, (W - 2)*sizeof(T));
dst[i][W - 1] = src[i][W - 1];
}
}
template<class T> void gaussVertical3 (T** src, T** dst, int W, int H, const float c0, const float c1)
{
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < W; i++) {
T temp[H] ALIGNED16;
for (int j = 1; j < H - 1; j++) {
temp[j] = (T)(c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i]);
}
dst[0][i] = src[0][i];
for (int j = 1; j < H - 1; j++) {
dst[j][i] = temp[j];
}
dst[H - 1][i] = src[H - 1][i];
}
}
#ifdef __SSE2__
template<class T> SSEFUNCTION void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
{
vfloat Tv, Tm1v, Tp1v;
vfloat c0v, c1v;
c0v = F2V(c0);
c1v = F2V(c1);
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < W - 3; i += 4) {
Tm1v = LVFU( src[0][i] );
STVFU( dst[0][i], Tm1v);
if (H > 1) {
Tv = LVFU( src[1][i]);
}
for (int j = 1; j < H - 1; j++) {
Tp1v = LVFU( src[j + 1][i]);
STVFU( dst[j][i], c1v * (Tp1v + Tm1v) + Tv * c0v);
Tm1v = Tv;
Tv = Tp1v;
}
STVFU( dst[H - 1][i], LVFU( src[H - 1][i]));
}
// Borders are done without SSE
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = W - (W % 4); i < W; i++) {
dst[0][i] = src[0][i];
for (int j = 1; j < H - 1; j++) {
dst[j][i] = c1 * (src[j - 1][i] + src[j + 1][i]) + c0 * src[j][i];
}
dst[H - 1][i] = src[H - 1][i];
}
}
template<class T> SSEFUNCTION void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
{
float tmp[W][4] ALIGNED16;
vfloat Tv, Tm1v, Tp1v;
vfloat c0v, c1v;
c0v = F2V(c0);
c1v = F2V(c1);
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H - 3; i += 4) {
dst[i][0] = src[i][0];
dst[i + 1][0] = src[i + 1][0];
dst[i + 2][0] = src[i + 2][0];
dst[i + 3][0] = src[i + 3][0];
Tm1v = _mm_set_ps( src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0] );
if (W > 1) {
Tv = _mm_set_ps( src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1] );
}
for (int j = 1; j < W - 1; j++) {
Tp1v = _mm_set_ps( src[i][j + 1], src[i + 1][j + 1], src[i + 2][j + 1], src[i + 3][j + 1] );
STVF( tmp[j][0], c1v * (Tp1v + Tm1v) + Tv * c0v);
Tm1v = Tv;
Tv = Tp1v;
}
for (int j = 1; j < W - 1; j++) {
dst[i + 3][j] = tmp[j][0];
dst[i + 2][j] = tmp[j][1];
dst[i + 1][j] = tmp[j][2];
dst[i][j] = tmp[j][3];
}
dst[i][W - 1] = src[i][W - 1];
dst[i + 1][W - 1] = src[i + 1][W - 1];
dst[i + 2][W - 1] = src[i + 2][W - 1];
dst[i + 3][W - 1] = src[i + 3][W - 1];
}
// Borders are done without SSE
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = H - (H % 4); i < H; i++) {
dst[i][0] = src[i][0];
for (int j = 1; j < W - 1; j++) {
dst[i][j] = c1 * (src[i][j - 1] + src[i][j + 1]) + c0 * src[i][j];
}
dst[i][W - 1] = src[i][W - 1];
}
}
// fast gaussian approximation if the support window is large
template<class T> SSEFUNCTION void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma)
{
if (sigma < 0.25) {
// dont perform filtering
if (src != dst)
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
memcpy (dst[i], src[i], W * sizeof(T));
}
return;
}
if (sigma < 0.6) {
// compute 3x3 kernel
float c1 = exp (-1.0 / (2.0 * sigma * sigma));
float csum = 2.0 * c1 + 1.0;
c1 /= csum;
float c0 = 1.0 / csum;
gaussHorizontal3Sse<T> (src, dst, W, H, c0, c1);
return;
}
// coefficient calculation
float q = 0.98711 * sigma - 0.96330;
if (sigma < 2.5) {
q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
}
float b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
float b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
float b2 = -1.4281 * q * q - 1.26661 * q * q * q;
float b3 = 0.422205 * q * q * q;
float B = 1.0 - (b1 + b2 + b3) / b0;
b1 /= b0;
b2 /= b0;
b3 /= b0;
// From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
float M[3][3];
M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
M[0][1] = (b3 + b1) * (b2 + b3 * b1);
M[0][2] = b3 * (b1 + b3 * b2);
M[1][0] = b1 + b3 * b2;
M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
M[2][2] = b3 * (b1 + b3 * b2);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
}
vfloat Rv;
vfloat Tv, Tm2v, Tm3v;
vfloat Bv, b1v, b2v, b3v;
vfloat temp2W, temp2Wp1;
float tmp[W][4] ALIGNED16;
float tmpV[4] ALIGNED16;
Bv = F2V(B);
b1v = F2V(b1);
b2v = F2V(b2);
b3v = F2V(b3);
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H - 3; i += 4) {
tmpV[0] = src[i + 3][0];
tmpV[1] = src[i + 2][0];
tmpV[2] = src[i + 1][0];
tmpV[3] = src[i][0];
Tv = LVF(tmpV[0]);
Rv = Tv * (Bv + b1v + b2v + b3v);
Tm3v = Rv;
STVF( tmp[0][0], Rv );
tmpV[0] = src[i + 3][1];
tmpV[1] = src[i + 2][1];
tmpV[2] = src[i + 1][1];
tmpV[3] = src[i][1];
Rv = LVF(tmpV[0]) * Bv + Rv * b1v + Tv * (b2v + b3v);
Tm2v = Rv;
STVF( tmp[1][0], Rv );
tmpV[0] = src[i + 3][2];
tmpV[1] = src[i + 2][2];
tmpV[2] = src[i + 1][2];
tmpV[3] = src[i][2];
Rv = LVF(tmpV[0]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
STVF( tmp[2][0], Rv );
for (int j = 3; j < W; j++) {
Tv = Rv;
Rv = _mm_set_ps(src[i][j], src[i + 1][j], src[i + 2][j], src[i + 3][j]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
STVF( tmp[j][0], Rv );
Tm3v = Tm2v;
Tm2v = Tv;
}
Tv = _mm_set_ps(src[i][W - 1], src[i + 1][W - 1], src[i + 2][W - 1], src[i + 3][W - 1]);
temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * ( Tm2v - Tv ) + F2V(M[2][2]) * (Tm3v - Tv);
temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
STVF( tmp[W - 1][0], Rv );
Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
STVF( tmp[W - 2][0], Tm2v );
Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
STVF( tmp[W - 3][0], Tm3v );
Tv = Rv;
Rv = Tm3v;
Tm3v = Tv;
for (int j = W - 4; j >= 0; j--) {
Tv = Rv;
Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
STVF( tmp[j][0], Rv );
Tm3v = Tm2v;
Tm2v = Tv;
}
for (int j = 0; j < W; j++) {
dst[i + 3][j] = tmp[j][0];
dst[i + 2][j] = tmp[j][1];
dst[i + 1][j] = tmp[j][2];
dst[i][j] = tmp[j][3];
}
}
// Borders are done without SSE
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = H - (H % 4); i < H; i++) {
tmp[0][0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0];
tmp[1][0] = B * src[i][1] + b1 * tmp[0][0] + b2 * src[i][0] + b3 * src[i][0];
tmp[2][0] = B * src[i][2] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[i][0];
for (int j = 3; j < W; j++) {
tmp[j][0] = B * src[i][j] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
}
float temp2Wm1 = src[i][W - 1] + M[0][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[0][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[0][2] * (tmp[W - 3][0] - src[i][W - 1]);
float temp2W = src[i][W - 1] + M[1][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[1][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[1][2] * (tmp[W - 3][0] - src[i][W - 1]);
float temp2Wp1 = src[i][W - 1] + M[2][0] * (tmp[W - 1][0] - src[i][W - 1]) + M[2][1] * (tmp[W - 2][0] - src[i][W - 1]) + M[2][2] * (tmp[W - 3][0] - src[i][W - 1]);
tmp[W - 1][0] = temp2Wm1;
tmp[W - 2][0] = B * tmp[W - 2][0] + b1 * tmp[W - 1][0] + b2 * temp2W + b3 * temp2Wp1;
tmp[W - 3][0] = B * tmp[W - 3][0] + b1 * tmp[W - 2][0] + b2 * tmp[W - 1][0] + b3 * temp2W;
for (int j = W - 4; j >= 0; j--) {
tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
}
for (int j = 0; j < W; j++) {
dst[i][j] = tmp[j][0];
}
}
}
#endif
// fast gaussian approximation if the support window is large
template<class T> void gaussHorizontal (T** src, T** dst, int W, int H, double sigma)
{
#ifdef __SSE2__
if (sigma < 70) { // bigger sigma only with double precision
gaussHorizontalSse<T> (src, dst, W, H, sigma);
return;
}
#endif
if (sigma < 0.25) {
// dont perform filtering
if (src != dst)
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
memcpy (dst[i], src[i], W * sizeof(T));
}
return;
}
if (sigma < 0.6) {
// compute 3x3 kernel
double c1 = exp (-1.0 / (2.0 * sigma * sigma));
double csum = 2.0 * c1 + 1.0;
c1 /= csum;
double c0 = 1.0 / csum;
gaussHorizontal3<T> (src, dst, W, H, c0, c1);
return;
}
// coefficient calculation
double q = 0.98711 * sigma - 0.96330;
if (sigma < 2.5) {
q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
}
double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
double b2 = -1.4281 * q * q - 1.26661 * q * q * q;
double b3 = 0.422205 * q * q * q;
double B = 1.0 - (b1 + b2 + b3) / b0;
b1 /= b0;
b2 /= b0;
b3 /= b0;
// From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
double M[3][3];
M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
M[0][1] = (b3 + b1) * (b2 + b3 * b1);
M[0][2] = b3 * (b1 + b3 * b2);
M[1][0] = b1 + b3 * b2;
M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
M[2][2] = b3 * (b1 + b3 * b2);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
}
double temp2[W] ALIGNED16;
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
temp2[0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0];
temp2[1] = B * src[i][1] + b1 * temp2[0] + b2 * src[i][0] + b3 * src[i][0];
temp2[2] = B * src[i][2] + b1 * temp2[1] + b2 * temp2[0] + b3 * src[i][0];
for (int j = 3; j < W; j++) {
temp2[j] = B * src[i][j] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3];
}
double temp2Wm1 = src[i][W - 1] + M[0][0] * (temp2[W - 1] - src[i][W - 1]) + M[0][1] * (temp2[W - 2] - src[i][W - 1]) + M[0][2] * (temp2[W - 3] - src[i][W - 1]);
double temp2W = src[i][W - 1] + M[1][0] * (temp2[W - 1] - src[i][W - 1]) + M[1][1] * (temp2[W - 2] - src[i][W - 1]) + M[1][2] * (temp2[W - 3] - src[i][W - 1]);
double temp2Wp1 = src[i][W - 1] + M[2][0] * (temp2[W - 1] - src[i][W - 1]) + M[2][1] * (temp2[W - 2] - src[i][W - 1]) + M[2][2] * (temp2[W - 3] - src[i][W - 1]);
temp2[W - 1] = temp2Wm1;
temp2[W - 2] = B * temp2[W - 2] + b1 * temp2[W - 1] + b2 * temp2W + b3 * temp2Wp1;
temp2[W - 3] = B * temp2[W - 3] + b1 * temp2[W - 2] + b2 * temp2[W - 1] + b3 * temp2W;
for (int j = W - 4; j >= 0; j--) {
temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3];
}
for (int j = 0; j < W; j++) {
dst[i][j] = (T)temp2[j];
}
}
}
#ifdef __SSE2__
template<class T> SSEFUNCTION void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma)
{
if (sigma < 0.25) {
// dont perform filtering
if (src != dst)
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
memcpy (dst[i], src[i], W * sizeof(T));
}
return;
}
if (sigma < 0.6) {
// compute 3x3 kernel
double c1 = exp (-1.0 / (2.0 * sigma * sigma));
double csum = 2.0 * c1 + 1.0;
c1 /= csum;
double c0 = 1.0 / csum;
gaussVertical3Sse<T> (src, dst, W, H, c0, c1);
return;
}
// coefficient calculation
double q = 0.98711 * sigma - 0.96330;
if (sigma < 2.5) {
q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
}
double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
double b2 = -1.4281 * q * q - 1.26661 * q * q * q;
double b3 = 0.422205 * q * q * q;
double B = 1.0 - (b1 + b2 + b3) / b0;
b1 /= b0;
b2 /= b0;
b3 /= b0;
// From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
double M[3][3];
M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
M[0][1] = (b3 + b1) * (b2 + b3 * b1);
M[0][2] = b3 * (b1 + b3 * b2);
M[1][0] = b1 + b3 * b2;
M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
M[2][2] = b3 * (b1 + b3 * b2);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
M[i][j] *= (1.0 + b2 + (b1 - b3) * b3);
M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 - b1 - b2 - b3);
}
float tmp[H][4] ALIGNED16;
vfloat Rv;
vfloat Tv, Tm2v, Tm3v;
vfloat Bv, b1v, b2v, b3v;
vfloat temp2W, temp2Wp1;
Bv = F2V(B);
b1v = F2V(b1);
b2v = F2V(b2);
b3v = F2V(b3);
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < W - 3; i += 4) {
Tv = LVFU( src[0][i]);
Rv = Tv * (Bv + b1v + b2v + b3v);
Tm3v = Rv;
STVF( tmp[0][0], Rv );
Rv = LVFU(src[1][i]) * Bv + Rv * b1v + Tv * (b2v + b3v);
Tm2v = Rv;
STVF( tmp[1][0], Rv );
Rv = LVFU(src[2][i]) * Bv + Rv * b1v + Tm3v * b2v + Tv * b3v;
STVF( tmp[2][0], Rv );
for (int j = 3; j < H; j++) {
Tv = Rv;
Rv = LVFU(src[j][i]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
STVF( tmp[j][0], Rv );
Tm3v = Tm2v;
Tm2v = Tv;
}
Tv = LVFU(src[H - 1][i]);
temp2Wp1 = Tv + F2V(M[2][0]) * (Rv - Tv) + F2V(M[2][1]) * (Tm2v - Tv) + F2V(M[2][2]) * (Tm3v - Tv);
temp2W = Tv + F2V(M[1][0]) * (Rv - Tv) + F2V(M[1][1]) * (Tm2v - Tv) + F2V(M[1][2]) * (Tm3v - Tv);
Rv = Tv + F2V(M[0][0]) * (Rv - Tv) + F2V(M[0][1]) * (Tm2v - Tv) + F2V(M[0][2]) * (Tm3v - Tv);
STVFU( dst[H - 1][i], Rv );
Tm2v = Bv * Tm2v + b1v * Rv + b2v * temp2W + b3v * temp2Wp1;
STVFU( dst[H - 2][i], Tm2v );
Tm3v = Bv * Tm3v + b1v * Tm2v + b2v * Rv + b3v * temp2W;
STVFU( dst[H - 3][i], Tm3v );
Tv = Rv;
Rv = Tm3v;
Tm3v = Tv;
for (int j = H - 4; j >= 0; j--) {
Tv = Rv;
Rv = LVF(tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v;
STVFU( dst[j][i], Rv );
Tm3v = Tm2v;
Tm2v = Tv;
}
}
// Borders are done without SSE
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = W - (W % 4); i < W; i++) {
tmp[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + b2 * src[0][i] + b3 * src[0][i];
tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
for (int j = 3; j < H; j++) {
tmp[j][0] = B * src[j][i] + b1 * tmp[j - 1][0] + b2 * tmp[j - 2][0] + b3 * tmp[j - 3][0];
}
float temp2Hm1 = src[H - 1][i] + M[0][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[0][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[0][2] * (tmp[H - 3][0] - src[H - 1][i]);
float temp2H = src[H - 1][i] + M[1][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[1][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[1][2] * (tmp[H - 3][0] - src[H - 1][i]);
float temp2Hp1 = src[H - 1][i] + M[2][0] * (tmp[H - 1][0] - src[H - 1][i]) + M[2][1] * (tmp[H - 2][0] - src[H - 1][i]) + M[2][2] * (tmp[H - 3][0] - src[H - 1][i]);
tmp[H - 1][0] = temp2Hm1;
tmp[H - 2][0] = B * tmp[H - 2][0] + b1 * tmp[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
tmp[H - 3][0] = B * tmp[H - 3][0] + b1 * tmp[H - 2][0] + b2 * tmp[H - 1][0] + b3 * temp2H;
for (int j = H - 4; j >= 0; j--) {
tmp[j][0] = B * tmp[j][0] + b1 * tmp[j + 1][0] + b2 * tmp[j + 2][0] + b3 * tmp[j + 3][0];
}
for (int j = 0; j < H; j++) {
dst[j][i] = tmp[j][0];
}
}
}
#endif
template<class T> void gaussVertical (T** src, T** dst, int W, int H, double sigma)
{
#ifdef __SSE2__
if (sigma < 70) { // bigger sigma only with double precision
gaussVerticalSse<T> (src, dst, W, H, sigma);
return;
}
#endif
if (sigma < 0.25) {
// don't perform filtering
if (src != dst)
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H; i++) {
memcpy (dst[i], src[i], W * sizeof(T));
}
return;
}
if (sigma < 0.6) {
// compute 3x3 kernel
double c1 = exp (-1.0 / (2.0 * sigma * sigma));
double csum = 2.0 * c1 + 1.0;
c1 /= csum;
double c0 = 1.0 / csum;
gaussVertical3<T> (src, dst, W, H, c0, c1);
return;
}
// coefficient calculation
double q = 0.98711 * sigma - 0.96330;
if (sigma < 2.5) {
q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
}
double b0 = 1.57825 + 2.44413 * q + 1.4281 * q * q + 0.422205 * q * q * q;
double b1 = 2.44413 * q + 2.85619 * q * q + 1.26661 * q * q * q;
double b2 = -1.4281 * q * q - 1.26661 * q * q * q;
double b3 = 0.422205 * q * q * q;
double B = 1.0 - (b1 + b2 + b3) / b0;
b1 /= b0;
b2 /= b0;
b3 /= b0;
// From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering
double M[3][3];
M[0][0] = -b3 * b1 + 1.0 - b3 * b3 - b2;
M[0][1] = (b3 + b1) * (b2 + b3 * b1);
M[0][2] = b3 * (b1 + b3 * b2);
M[1][0] = b1 + b3 * b2;
M[1][1] = -(b2 - 1.0) * (b2 + b3 * b1);
M[1][2] = -(b3 * b1 + b3 * b3 + b2 - 1.0) * b3;
M[2][0] = b3 * b1 + b2 + b1 * b1 - b2 * b2;
M[2][1] = b1 * b2 + b3 * b2 * b2 - b1 * b3 * b3 - b3 * b3 * b3 - b3 * b2 + b3;
M[2][2] = b3 * (b1 + b3 * b2);
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
}
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
static const int numcols = 8;
double temp2[H][numcols] ALIGNED16;
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int i = 0; i < W - numcols + 1; i += numcols) {
for (int k = 0; k < numcols; k++) {
temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
}
for (int j = 3; j < H; j++) {
for (int k = 0; k < numcols; k++) {
temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
}
}
for (int k = 0; k < numcols; k++) {
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
}
for (int k = 0; k < numcols; k++) {
dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k];
dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
}
for (int j = H - 4; j >= 0; j--) {
for (int k = 0; k < numcols; k++) {
dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k];
}
}
}
#ifdef _OPENMP
#pragma omp single
#endif
// process remaining column
for (int i = W - (W % numcols); i < W; i++) {
temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
for (int j = 3; j < H; j++) {
temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
}
double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1;
dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H;
for (int j = H - 4; j >= 0; j--) {
dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0];
}
}
}
template<class T> void gaussianBlur(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = NULL)
{
if(buffer) { // use iterated boxblur to approximate gaussian blur
// Compute ideal averaging filter width and number of iterations
int n = 1;
double wIdeal = sqrt((12 * sigma * sigma) + 1);
while(wIdeal > W || wIdeal > H) {
n++;
wIdeal = sqrt((12 * sigma * sigma / n) + 1);
}
if(n < 3) {
n = 3;
wIdeal = sqrt((12 * sigma * sigma / n) + 1);
} else if(n > 6) {
n = 6;
}
int wl = wIdeal;
if(wl % 2 == 0) {
wl--;
}
int wu = wl + 2;
double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
int m = round(mIdeal);
int sizes[n];
for(int i = 0; i < n; i++) {
sizes[i] = ((i < m ? wl : wu) - 1) / 2;
}
rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
for(int i = 1; i < n; i++) {
rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
}
} else {
gaussHorizontal<T> (src, dst, W, H, sigma);
gaussVertical<T> (dst, dst, W, H, sigma);
}
}
#endif
#endif

View File

@ -115,21 +115,30 @@ static INLINE vfloat vcast_vf_f(float f)
return _mm_set_ps(f, f, f, f);
}
// Don't use intrinsics here. Newer gcc versions (>= 4.9, maybe also before 4.9) generate better code when not using intrinsics
// example: vaddf(vmulf(a,b),c) will generate an FMA instruction when build for chips with that feature only when vaddf and vmulf don't use intrinsics
static INLINE vfloat vaddf(vfloat x, vfloat y)
{
return _mm_add_ps(x, y);
return x + y;
}
static INLINE vfloat vsubf(vfloat x, vfloat y)
{
return _mm_sub_ps(x, y);
return x - y;
}
static INLINE vfloat vmulf(vfloat x, vfloat y)
{
return _mm_mul_ps(x, y);
return x * y;
}
static INLINE vfloat vdivf(vfloat x, vfloat y)
{
return _mm_div_ps(x, y);
return x / y;
}
// Also don't use intrinsic here: Some chips support FMA instructions with 3 and 4 operands
// 3 operands: a = a*b+c, b = a*b+c, c = a*b+c // destination has to be one of a,b,c
// 4 operands: d = a*b+c // destination does not have to be one of a,b,c
// gcc will use the one which fits best when not using intrinsics. With using intrinsics that's not possible
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) {
return x * y + z;
}
static INLINE vfloat vrecf(vfloat x)
{

View File

@ -73,6 +73,7 @@ class ImProcFunctions
void transformLuminanceOnly (Imagefloat* original, Imagefloat* transformed, int cx, int cy, int oW, int oH, int fW, int fH);
void transformHighQuality (Imagefloat* original, Imagefloat* transformed, int cx, int cy, int sx, int sy, int oW, int oH, int fW, int fH, const LCPMapper *pLCPMap, bool fullImage);
void sharpenHaloCtrl (float** luminance, float** blurmap, float** base, int W, int H, const SharpeningParams &sharpenParam);
void sharpenHaloCtrl (LabImage* lab, float** blurmap, float** base, int W, int H, SharpeningParams &sharpenParam);
void sharpenHaloCtrlcam (CieImage* ncie, float** blurmap, float** base, int W, int H);
void firstAnalysisThread(Imagefloat* original, Glib::ustring wprofile, unsigned int* histogram, int row_from, int row_to);
@ -271,9 +272,9 @@ public:
void Lanczos (const LabImage* src, LabImage* dst, float scale);
void Lanczos (const Image16* src, Image16* dst, float scale);
void deconvsharpening (LabImage* lab, float** buffer, SharpeningParams &sharpenParam);
void deconvsharpeningcam (CieImage* ncie, float** buffer);
void deconvsharpening (float** luminance, float** buffer, int W, int H, const SharpeningParams &sharpenParam);
void MLsharpen (LabImage* lab);// Manuel's clarity / sharpening
void MLmicrocontrast(float** luminance, int W, int H ); //Manuel's microcontrast
void MLmicrocontrast(LabImage* lab ); //Manuel's microcontrast
void MLmicrocontrastcam(CieImage* ncie ); //Manuel's microcontrast

View File

@ -32,7 +32,6 @@ namespace rtengine
SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
{
// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// impulse noise removal
// local variables
@ -41,15 +40,15 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
int height = lab->H;
// buffer for the lowpass image
float ** lpf = new float *[height];
float * lpf[height] ALIGNED16;
lpf[0] = new float [width * height];
// buffer for the highpass image
float ** impish = new float *[height];
char * impish[height] ALIGNED16;
impish[0] = new char [width * height];
for (int i = 0; i < height; i++) {
lpf[i] = new float [width];
//memset (lpf[i], 0, width*sizeof(float));
impish[i] = new float [width];
//memset (impish[i], 0, width*sizeof(unsigned short));
for (int i = 1; i < height; i++) {
lpf[i] = lpf[i - 1] + width;
impish[i] = impish[i - 1] + width;
}
@ -60,12 +59,11 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
const float eps = 1.0;
//rangeblur<unsigned short, unsigned int> (lab->L, lpf, impish /*used as buffer here*/, width, height, thresh, false);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
gaussianBlur<float> (lab->L, lpf, width, height, max(2.0, thresh - 1.0));
gaussianBlur (lab->L, lpf, width, height, max(2.0, thresh - 1.0));
}
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -81,9 +79,9 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
int i1, j1, j;
float hpfabs, hfnbrave;
#ifdef __SSE2__
__m128 hfnbravev, hpfabsv;
__m128 impthrDiv24v = _mm_set1_ps( impthrDiv24 );
__m128 onev = _mm_set1_ps( 1.0f );
vfloat hfnbravev, hpfabsv;
vfloat impthrDiv24v = F2V( impthrDiv24 );
vfloat onev = F2V( 1.0f );
#endif
#ifdef _OPENMP
#pragma omp for
@ -105,46 +103,37 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
#ifdef __SSE2__
for (; j < width - 5; j += 4) {
hfnbravev = _mm_setzero_ps( );
hfnbravev = ZEROV;
hpfabsv = vabsf(LVFU(lab->L[i][j]) - LVFU(lpf[i][j]));
//block average of high pass data
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) {
for (j1 = j - 2; j1 <= j + 2; j1++) {
hfnbravev += vabsf(LVFU(lab->L[i1][j1]) - LVFU(lpf[i1][j1]));
}
}
_mm_storeu_ps(&impish[i][j], vself(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev, _mm_setzero_ps()));
}
for (; j < width - 2; j++) {
hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++) {
hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
}
#else
for (; j < width - 2; j++) {
hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++) {
hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
int mask = _mm_movemask_ps((hfnbravev - hpfabsv) * impthrDiv24v - hpfabsv);
impish[i][j] = (mask & 1);
impish[i][j + 1] = ((mask & 2) >> 1);
impish[i][j + 2] = ((mask & 4) >> 2);
impish[i][j + 3] = ((mask & 8) >> 3);
}
#endif
for (; j < width - 2; j++) {
hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++) {
hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
}
for (; j < width; j++) {
hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
@ -188,10 +177,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = 0; j1 <= j + 2; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -220,10 +205,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -252,10 +233,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 < width; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -277,13 +254,8 @@ SSEFUNCTION void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
}
//now impulsive values have been corrected
for (int i = 0; i < height; i++) {
delete [] lpf[i];
delete [] impish[i];
}
delete [] lpf;
delete [] impish;
delete [] lpf[0];
delete [] impish[0];
}
@ -317,7 +289,7 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
#pragma omp parallel
#endif
{
gaussianBlur<float> (ncie->sh_p, lpf, width, height, max(2.0, thresh - 1.0));
gaussianBlur (ncie->sh_p, lpf, width, height, max(2.0, thresh - 1.0));
}
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -332,9 +304,9 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
int i1, j1, j;
float hpfabs, hfnbrave;
#ifdef __SSE2__
__m128 hfnbravev, hpfabsv;
__m128 impthrDiv24v = _mm_set1_ps( impthrDiv24 );
__m128 onev = _mm_set1_ps( 1.0f );
vfloat hfnbravev, hpfabsv;
vfloat impthrDiv24v = F2V( impthrDiv24 );
vfloat onev = F2V( 1.0f );
#endif
#ifdef _OPENMP
#pragma omp for
@ -357,7 +329,7 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
for (; j < width - 5; j += 4) {
hpfabsv = vabsf(LVFU(ncie->sh_p[i][j]) - LVFU(lpf[i][j]));
hfnbravev = _mm_setzero_ps();
hfnbravev = ZEROV;
//block average of high pass data
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) {
@ -365,38 +337,25 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
hfnbravev += vabsf(LVFU(ncie->sh_p[i1][j1]) - LVFU(lpf[i1][j1]));
}
_mm_storeu_ps(&impish[i][j], vself(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev, _mm_setzero_ps()));
}
}
for (; j < width - 2; j++) {
hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
}
#else
for (; j < width - 2; j++) {
hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
STVFU(impish[i][j], vselfzero(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev));
}
#endif
for (; j < width - 2; j++) {
hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
//block average of high pass data
for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
}
impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
}
for (; j < width; j++) {
hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
@ -422,42 +381,34 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
#pragma omp parallel
#endif
{
int j;
float2 sincosval;
#ifdef __SSE2__
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps( piid );
__m128 tempv;
vfloat piidv = F2V( piid );
vfloat tempv;
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < height; i++) {
int j = 0;
#ifdef __SSE2__
for (j = 0; j < width - 3; j += 4) {
for (; j < width - 3; j += 4) {
sincosvalv = xsincosf(piidv * LVFU(ncie->h_p[i][j]));
tempv = LVFU(ncie->C_p[i][j]);
_mm_storeu_ps(&sraa[i][j], tempv * sincosvalv.y);
_mm_storeu_ps(&srbb[i][j], tempv * sincosvalv.x);
}
for (; j < width; j++) {
sincosval = xsincosf(piid * ncie->h_p[i][j]);
sraa[i][j] = ncie->C_p[i][j] * sincosval.y;
srbb[i][j] = ncie->C_p[i][j] * sincosval.x;
}
#else
for (j = 0; j < width; j++) {
sincosval = xsincosf(piid * ncie->h_p[i][j]);
sraa[i][j] = ncie->C_p[i][j] * sincosval.y;
srbb[i][j] = ncie->C_p[i][j] * sincosval.x;
STVFU(sraa[i][j], tempv * sincosvalv.y);
STVFU(srbb[i][j], tempv * sincosvalv.x);
}
#endif
for (; j < width; j++) {
float2 sincosval = xsincosf(piid * ncie->h_p[i][j]);
sraa[i][j] = ncie->C_p[i][j] * sincosval.y;
srbb[i][j] = ncie->C_p[i][j] * sincosval.x;
}
}
}
@ -488,10 +439,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = 0; j1 <= j + 2; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -520,10 +467,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -552,10 +495,6 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 < width; j1++ ) {
if (i1 == i && j1 == j) {
continue;
}
if (impish[i1][j1]) {
continue;
}
@ -583,41 +522,32 @@ SSEFUNCTION void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh,
#endif
{
#ifdef __SSE2__
__m128 interav, interbv;
__m128 piidv = _mm_set1_ps(piid);
vfloat interav, interbv;
vfloat piidv = F2V(piid);
#endif // __SSE2__
int j;
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ ) {
int j = 0;
#ifdef __SSE2__
for(j = 0; j < width - 3; j += 4) {
for(; j < width - 3; j += 4) {
interav = LVFU(sraa[i][j]);
interbv = LVFU(srbb[i][j]);
_mm_storeu_ps(&ncie->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
_mm_storeu_ps(&ncie->C_p[i][j], _mm_sqrt_ps(SQRV(interbv) + SQRV(interav)));
STVFU(ncie->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
STVFU(ncie->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav)));
}
#endif
for(; j < width; j++) {
float intera = sraa[i][j];
float interb = srbb[i][j];
ncie->h_p[i][j] = (xatan2f(interb, intera)) / piid;
ncie->C_p[i][j] = sqrt(SQR(interb) + SQR(intera));
}
#else
for(j = 0; j < width; j++) {
float intera = sraa[i][j];
float interb = srbb[i][j];
ncie->h_p[i][j] = (xatan2f(interb, intera)) / piid;
ncie->C_p[i][j] = sqrt(SQR(interb) + SQR(intera));
}
#endif
}
}

View File

@ -18,7 +18,7 @@
* adaptation to RawTherapee
* 2015 Jacques Desmis <jdesmis@gmail.com>
* 2015 Ingo Weyrich <heckflosse@i-weyrich.de>
* 2015 Ingo Weyrich <heckflosse67@gmx.de>
* D. J. Jobson, Z. Rahman, and G. A. Woodell. A multi-scale
* Retinex for bridging the gap between color images and the
@ -235,17 +235,19 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
bool higplus = false ;
float elogt;
float hl = deh.baselog;
if(hl >= 2.71828f) {
elogt = 2.71828f + SQR(SQR(hl - 2.71828f));
} else {
elogt = hl;
}
int H_L = height;
int W_L = width;
float *tran[H_L] ALIGNED16;
float *tranBuffer;
int viewmet=0;
int viewmet = 0;
elogt = 2.71828f;//disabled baselog
FlatCurve* shcurve = NULL;//curve L=f(H)
@ -280,116 +282,121 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
if (deh.retinexMethod == "highli" || deh.retinexMethod == "highliplus") {
moderetinex = 3;
}
for(int it=1; it<iter+1; it++) {//iter nb max of iterations
for(int it = 1; it < iter + 1; it++) { //iter nb max of iterations
float aahi = 49.f / 99.f; ////reduce sensibility 50%
float bbhi = 1.f - aahi;
float high;
high = bbhi + aahi * (float) deh.highl;
float grads;
float grad=1.f;
float grad = 1.f;
float sc = 3.f;
if(gradient==0) {
grad=1.f;
sc=3.f;
}
else if(gradient==1) {
grad=0.25f*it+0.75f;
sc=-0.5f*it+4.5f;
}
else if(gradient==2) {
grad=0.5f*it+0.5f;
sc=-0.75f*it+5.75f;
}
else if(gradient==3) {
grad=0.666f*it+0.333f;
sc=-0.75f*it+5.75f;
}
else if(gradient==4) {
grad=0.8f*it+0.2f;
sc=-0.75f*it+5.75f;
}
else if(gradient==5) {
if(moderetinex!=3) {
grad=2.5f*it-1.5f;
if(gradient == 0) {
grad = 1.f;
sc = 3.f;
} else if(gradient == 1) {
grad = 0.25f * it + 0.75f;
sc = -0.5f * it + 4.5f;
} else if(gradient == 2) {
grad = 0.5f * it + 0.5f;
sc = -0.75f * it + 5.75f;
} else if(gradient == 3) {
grad = 0.666f * it + 0.333f;
sc = -0.75f * it + 5.75f;
} else if(gradient == 4) {
grad = 0.8f * it + 0.2f;
sc = -0.75f * it + 5.75f;
} else if(gradient == 5) {
if(moderetinex != 3) {
grad = 2.5f * it - 1.5f;
} else {
float aa = (11.f * high - 1.f) / 4.f;
float bb = 1.f - aa;
grad = aa * it + bb;
}
else {
float aa=(11.f*high-1.f)/4.f;
float bb=1.f-aa;
grad=aa*it+bb;
sc = -0.75f * it + 5.75f;
} else if(gradient == 6) {
if(moderetinex != 3) {
grad = 5.f * it - 4.f;
} else {
float aa = (21.f * high - 1.f) / 4.f;
float bb = 1.f - aa;
grad = aa * it + bb;
}
sc=-0.75f*it+5.75f;
}
else if(gradient==6) {
if(moderetinex!=3) {
grad=5.f*it-4.f;
}
else {
float aa=(21.f*high-1.f)/4.f;
float bb=1.f-aa;
grad=aa*it+bb;
}
sc=-0.75f*it+5.75f;
sc = -0.75f * it + 5.75f;
}
else if(gradient==-1) {
grad=-0.125f*it+1.125f;
sc=3.f;
else if(gradient == -1) {
grad = -0.125f * it + 1.125f;
sc = 3.f;
}
float varx;
float limdx, ilimdx;
if(gradvart!=0) {
if(gradvart==1) {
varx=vart*(-0.125f*it+1.125f);
limdx=limD*(-0.125f*it+1.125f);
ilimdx=1.f/limdx;
}
else if(gradvart==2) {
varx=vart*(-0.2f*it+1.2f);
limdx=limD*(-0.2f*it+1.2f);
ilimdx=1.f/limdx;
}
else if(gradvart==-1) {
varx=vart*(0.125f*it+0.875f);
limdx=limD*(0.125f*it+0.875f);
ilimdx=1.f/limdx;
}
else if(gradvart==-2) {
varx=vart*(0.4f*it+0.6f);
limdx=limD*(0.4f*it+0.6f);
ilimdx=1.f/limdx;
if(gradvart != 0) {
if(gradvart == 1) {
varx = vart * (-0.125f * it + 1.125f);
limdx = limD * (-0.125f * it + 1.125f);
ilimdx = 1.f / limdx;
} else if(gradvart == 2) {
varx = vart * (-0.2f * it + 1.2f);
limdx = limD * (-0.2f * it + 1.2f);
ilimdx = 1.f / limdx;
} else if(gradvart == -1) {
varx = vart * (0.125f * it + 0.875f);
limdx = limD * (0.125f * it + 0.875f);
ilimdx = 1.f / limdx;
} else if(gradvart == -2) {
varx = vart * (0.4f * it + 0.6f);
limdx = limD * (0.4f * it + 0.6f);
ilimdx = 1.f / limdx;
}
} else {
varx = vart;
limdx = limD;
ilimdx = ilimD;
}
else {
varx=vart;
limdx=limD;
ilimdx=ilimD;
}
scal=round(sc);
scal = round(sc);
float strengthx;
float ks=1.f;
float ks = 1.f;
if(gradstr!=0) {
if(gradstr==1) {
if(it <= 3) ks=-0.3f*it+1.6f;
else ks = 0.5f;
}
else if(gradstr==2) {
if(it <= 3) ks=-0.6f*it+2.2f;
else ks = 0.3f;
}
else if(gradstr==-1) {
if(it <= 3) ks=0.2f*it+0.6f;
else ks = 1.2f;
}
else if(gradstr==-2) {
if(it <= 3) ks=0.4f*it+0.2f;
else ks = 1.5f;
if(gradstr != 0) {
if(gradstr == 1) {
if(it <= 3) {
ks = -0.3f * it + 1.6f;
} else {
ks = 0.5f;
}
} else if(gradstr == 2) {
if(it <= 3) {
ks = -0.6f * it + 2.2f;
} else {
ks = 0.3f;
}
} else if(gradstr == -1) {
if(it <= 3) {
ks = 0.2f * it + 0.6f;
} else {
ks = 1.2f;
}
} else if(gradstr == -2) {
if(it <= 3) {
ks = 0.4f * it + 0.2f;
} else {
ks = 1.5f;
}
}
}
strengthx=ks*strength;
retinex_scales( RetinexScales, scal, moderetinex, nei/grad, high );
strengthx = ks * strength;
retinex_scales( RetinexScales, scal, moderetinex, nei / grad, high );
float *src[H_L] ALIGNED16;
float *srcBuffer = new float[H_L * W_L];
@ -402,17 +409,41 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
int shHighlights = deh.highlights;
int shShadows = deh.shadows;
int mapmet=0;
if(deh.mapMethod=="map") mapmet=2;
if(deh.mapMethod=="mapT") mapmet=3;
if(deh.mapMethod=="curv") mapmet=1;
if(deh.mapMethod=="gaus") mapmet=4;
int mapmet = 0;
if(deh.mapMethod == "map") {
mapmet = 2;
}
if(deh.mapMethod == "mapT") {
mapmet = 3;
}
if(deh.mapMethod == "curv") {
mapmet = 1;
}
if(deh.mapMethod == "gaus") {
mapmet = 4;
}
double shradius = (double) deh.radius;
if(deh.viewMethod=="mask") viewmet=1;
if(deh.viewMethod=="tran") viewmet=2;
if(deh.viewMethod=="tran2") viewmet=3;
if(deh.viewMethod=="unsharp") viewmet=4;
if(deh.viewMethod == "mask") {
viewmet = 1;
}
if(deh.viewMethod == "tran") {
viewmet = 2;
}
if(deh.viewMethod == "tran2") {
viewmet = 3;
}
if(deh.viewMethod == "unsharp") {
viewmet = 4;
}
#ifdef _OPENMP
#pragma omp parallel for
@ -431,13 +462,14 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
out[i] = &outBuffer[i * W_L];
}
if(viewmet==3 || viewmet==2) {
if(viewmet == 3 || viewmet == 2) {
tranBuffer = new float[H_L * W_L];
for (int i = 0; i < H_L; i++) {
tran[i] = &tranBuffer[i * W_L];
}
}
const float logBetaGain = xlogf(16384.f);
float pond = logBetaGain / (float) scal;
@ -456,29 +488,36 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
{
for ( int scale = scal - 1; scale >= 0; scale-- ) {
if(scale == scal - 1) {
gaussianBlur<float> (src, out, W_L, H_L, RetinexScales[scale], buffer);
} else {
// reuse result of last iteration
gaussianBlur<float> (out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer);
gaussianBlur (src, out, W_L, H_L, RetinexScales[scale], buffer);
} else { // reuse result of last iteration
gaussianBlur (out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer);
}
//printf("scal=%d RetinexScales=%f\n",scale, RetinexScales[scale]);
printf("..");
if(mapmet==4) shradius /= 1.;
else shradius = 40.;
if(mapmet == 4) {
shradius /= 1.;
} else {
shradius = 40.;
}
// if(shHighlights > 0 || shShadows > 0) {
if(mapmet==3) if(it==1) shmap->updateL (out, shradius, true, 1);//wav Total
if(mapmet==2 && scale >2) if(it==1) shmap->updateL (out, shradius, true, 1);//wav partial
if(mapmet==4) if(it==1) shmap->updateL (out, shradius, false, 1);//gauss
if(mapmet == 3) if(it == 1) {
shmap->updateL (out, shradius, true, 1); //wav Total
}
if(mapmet == 2 && scale > 2) if(it == 1) {
shmap->updateL (out, shradius, true, 1); //wav partial
}
if(mapmet == 4) if(it == 1) {
shmap->updateL (out, shradius, false, 1); //gauss
}
// }
if (shmap) {
h_th = shmap->max_f - deh.htonalwidth * (shmap->max_f - shmap->avg) / 100;
s_th = deh.stonalwidth * (shmap->avg - shmap->min_f) / 100;
}
#ifdef __SSE2__
vfloat pondv = F2V(pond);
vfloat limMinv = F2V(ilimdx);
@ -490,11 +529,15 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H_L; i++) {
if(mapcontlutili) {
int j = 0;
for (; j < W_L; j++) {
if(it==1) out[i][j] = mapcurve[2.f * out[i][j]] / 2.f;
if(it == 1) {
out[i][j] = mapcurve[2.f * out[i][j]] / 2.f;
}
}
}
}
@ -502,14 +545,16 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
}
// if(shHighlights > 0 || shShadows > 0) {
if(((mapmet == 2 && scale >2) || mapmet==3 || mapmet==4) && it==1) {
if(((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) {
#ifdef _OPENMP
#pragma omp for
#endif
for (int i = 0; i < H_L; i++) {
int j = 0;
for (; j < W_L; j++) {
double mapval = 1.0 + shmap->map[i][j];
double factor = 1.0;
@ -519,11 +564,13 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
} else if (mapval < s_th) {
factor = (s_th - (100.0 - shShadows) * (s_th - mapval) / 100.0) / mapval;
}
out[i][j] *= factor;
}
}
}
// }
#ifdef _OPENMP
@ -559,12 +606,13 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
}
}
}
printf(".\n");
if(mapmet > 1) {
if(shmap) {
delete shmap;
}
}
shmap = NULL;
delete [] buffer;
@ -613,12 +661,15 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
}
luminance[i][j] *= (-1.f + 4.f * dehatransmissionCurve[absciss]); //new transmission
if(viewmet==3 || viewmet==2) tran[i][j]=luminance[i][j];
if(viewmet == 3 || viewmet == 2) {
tran[i][j] = luminance[i][j];
}
}
}
// median filter on transmission ==> reduce artifacts
if (deh.medianmap && it==1) {//only one time
if (deh.medianmap && it == 1) { //only one time
int wid = W_L;
int hei = H_L;
float *tmL[hei] ALIGNED16;
@ -686,10 +737,10 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
maxCD = -9999999.f;
minCD = 9999999.f;
// coeff for auto "transmission" with 2 sigma #95% datas
float aza=16300.f/(2.f*stddv);
float azb=-aza*(mean-2.f*stddv);
float bza=16300.f/(2.f*stddv);
float bzb=16300.f-bza*(mean);
float aza = 16300.f / (2.f * stddv);
float azb = -aza * (mean - 2.f * stddv);
float bza = 16300.f / (2.f * stddv);
float bzb = 16300.f - bza * (mean);
@ -718,7 +769,7 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
float str = strengthx;
if(lhutili && it==1) { // S=f(H)
if(lhutili && it == 1) { // S=f(H)
{
float HH = exLuminance[i][j];
float valparam;
@ -733,15 +784,33 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
}
}
if(exLuminance[i][j] > 65535.f*hig && higplus) str *= hig;
if(viewmet==0) luminance[i][j]=clipretinex( cd, 0.f, 32768.f ) * str + (1.f - str) * originalLuminance[i][j];
if(viewmet==1) luminance[i][j] = out[i][j];
if(viewmet==4) luminance[i][j] = (1.f + str) * originalLuminance[i][j] - str* out[i][j];//unsharp
if(viewmet==2) {
if(tran[i][j]<= mean) luminance[i][j] = azb + aza*tran[i][j];//auto values
else luminance[i][j] = bzb + bza*tran[i][j];
if(exLuminance[i][j] > 65535.f * hig && higplus) {
str *= hig;
}
if(viewmet == 0) {
luminance[i][j] = clipretinex( cd, 0.f, 32768.f ) * str + (1.f - str) * originalLuminance[i][j];
}
if(viewmet == 1) {
luminance[i][j] = out[i][j];
}
if(viewmet == 4) {
luminance[i][j] = (1.f + str) * originalLuminance[i][j] - str * out[i][j]; //unsharp
}
if(viewmet == 2) {
if(tran[i][j] <= mean) {
luminance[i][j] = azb + aza * tran[i][j]; //auto values
} else {
luminance[i][j] = bzb + bza * tran[i][j];
}
}
if(viewmet == 3) {
luminance[i][j] = 1000.f + tran[i][j] * 700.f; //arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5
}
if(viewmet==3) luminance[i][j] = 1000.f + tran[i][j]*700.f;//arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5
}
@ -763,11 +832,12 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
Tmax = maxtr;
if (shcurve && it==1) {
if (shcurve && it == 1) {
delete shcurve;
}
}
if(viewmet==3 || viewmet==2) {
if(viewmet == 3 || viewmet == 2) {
delete [] tranBuffer;
tranBuffer = NULL;
}

File diff suppressed because it is too large Load Diff

View File

@ -88,7 +88,7 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int
#pragma omp parallel
#endif
{
gaussianBlur<float> (map, map, W, H, radius);
gaussianBlur (map, map, W, H, radius);
}
}
@ -233,7 +233,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip)
#pragma omp parallel
#endif
{
gaussianBlur<float> (map, map, W, H, radius);
gaussianBlur (map, map, W, H, radius);
}
}
@ -244,7 +244,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip)
//experimental dirpyr shmap
float thresh = (100.f * radius); //1000;
int levrad = 16;
levrad=2;//for retinex - otherwise levrad = 16
levrad = 2; //for retinex - otherwise levrad = 16
// set up range function
// calculate size of Lookup table. That's possible because from a value k for all i>=k rangefn[i] will be exp(-10)
// So we use this fact and the automatic clip of lut to reduce the size of lut and the number of calculations to fill the lut
@ -253,6 +253,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip)
const int lutSize = (int) thresh * sqrtf(10.f) + 1;
thresh *= thresh;
LUTf rangefn(lutSize);
for (int i = 0; i < lutSize - 1; i++) {
rangefn[i] = xexpf(-min(10.f, (static_cast<float>(i) * i) / thresh )); //*intfactor;
}
@ -275,6 +276,7 @@ void SHMap::updateL (float** L, double radius, bool hq, int skip)
scale *= 2;
numLevels++;
}
//printf("numlev=%d\n",numLevels);
float ** dirpyrlo[2];

View File

@ -901,11 +901,7 @@ static INLINE vdouble xlog1p(vdouble a) {
typedef struct {
vfloat x, y;
} vfloat2;
#if defined( __FMA__ ) && defined( __x86_64__ )
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); }
#else
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); }
#endif
static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
@ -921,6 +917,18 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca
}
#endif
static INLINE vfloat vselfzero(vmask mask, vfloat x) {
// returns value of x if corresponding mask bits are 1, else returns 0
// faster than vself(mask, x, ZEROV)
return _mm_and_ps((vfloat)mask, x);
}
static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
// returns value of x if corresponding mask bits are 0, else returns 0
// faster than vself(mask, ZEROV, x)
return _mm_andnot_ps((vfloat)mask, x);
}
static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
return vori2(vandi2(m2, x), vandnoti2(m2, y));
@ -1171,7 +1179,7 @@ static INLINE vfloat xatan2f(vfloat y, vfloat x) {
r = vmulsignf(r, x);
r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/2)), x))), r);
r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(M_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(M_PI/4)), x))), r);
r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vself(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)M_PI), vcast_vf_f(0.0f)), r);
r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)M_PI)), r);
return vself(vorm(vmaskf_isnan(x), vmaskf_isnan(y)), vcast_vf_f(NANf), vmulsignf(r, y));
}
@ -1304,7 +1312,7 @@ static INLINE vfloat xcbrtf(vfloat d) {
}
static INLINE vfloat LIMV( vfloat a, vfloat b, vfloat c ) {
return _mm_max_ps( b, _mm_min_ps(a,c));
return vmaxf( b, vminf(a,c));
}
static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){
@ -1312,13 +1320,13 @@ static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){
}
static INLINE vfloat SQRV(vfloat a){
return _mm_mul_ps( a,a );
return a * a;
}
static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
vfloat temp = vself(condition, a, b); // the larger of the two
condition = vnotm(condition); // invert the mask
a = vself(condition, a, b); // the smaller of the two
vfloat temp = vself(condition, a, b); // the values which fit to condition
condition = vnotm(condition); // invert the condition
a = vself(condition, a, b); // the values which fit to inverted condition
b = temp;
}

View File

@ -41,7 +41,7 @@ Sharpening::Sharpening () : FoldableToolPanel(this, "sharpening", M("TP_SHARPENI
pack_start (*hb);
rld = new Gtk::VBox ();
dradius = Gtk::manage (new Adjuster (M("TP_SHARPENING_EDRADIUS"), 0.5, 2.5, 0.01, 0.75));
dradius = Gtk::manage (new Adjuster (M("TP_SHARPENING_EDRADIUS"), 0.4, 2.5, 0.01, 0.75));
damount = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_AMOUNT"), 0.0, 100, 1, 75));
ddamping = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_DAMPING"), 0, 100, 1, 20));
diter = Gtk::manage (new Adjuster (M("TP_SHARPENING_RLD_ITERATIONS"), 5, 100, 1, 30));