/* * This file is part of RawTherapee. * * Copyright (c) 2004-2010 Gabor Horvath * * RawTherapee is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * RawTherapee is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with RawTherapee. If not, see www.gnu.org/licenses/. */ /* NOTE(review): this copy of the header is damaged. The line breaks are gone and text appears to have been dropped after many less-than signs: the #include directives have no targets, the template parameter lists are missing, and most loop conditions and bodies are cut off (for example "for (int i=0; i* pBuf = ..."). Restore the file from upstream before compiling. The comments added below describe only what is still visible here. */ #ifndef _GAUSS_H_ #define _GAUSS_H_ #include #include #include #include "alignedbuffer.h" #ifdef _OPENMP #include #endif #ifdef __SSE__ #if defined( WIN32 ) && defined(__x86_64__) #include #else #include #endif #endif /* gaussHorizontal3: small-support filter taking two weights, c0 and c1. In the visible part a scratch buffer is acquired from `buffer` and its data is viewed as T*; the loop bounds and the filtering expressions are missing from this text. */ // classical filtering if the support window is small: template void gaussHorizontal3 (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, const float c0, const float c1) { #ifdef _OPENMP #pragma omp for #endif for (int i=0; i* pBuf = buffer.acquire(); T* temp=(T*)pBuf->data; for (int j=1; j /* gaussVertical3: same signature and same visible structure as gaussHorizontal3; presumably the column-wise counterpart, but the body is missing from this text - TODO confirm. */ void gaussVertical3 (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, const float c0, const float c1) { #ifdef _OPENMP #pragma omp for #endif for (int i=0; i* pBuf = buffer.acquire(); T* temp = (T*)pBuf->data; for (int j = 1; j /* gaussVertical3Sse: SSE variant with no scratch-buffer parameter. Two declarations follow, one with force_align_arg_pointer and one without; the preprocessor condition that picks between them is missing from this text. c0 and c1 are broadcast to all four lanes with _mm_set1_ps, and the visible load is an unaligned _mm_loadu_ps from src[1] at column i. */ __attribute__((force_align_arg_pointer)) void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) { #else template void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) { #endif __m128 Tv,Tm1v,Tp1v; __m128 c0v,c1v; c0v = _mm_set1_ps(c0); c1v = _mm_set1_ps(c1); #ifdef _OPENMP #pragma omp for #endif for (int i=0; i1) Tv = _mm_loadu_ps( &src[1][i]); for (int j=1; j /* gaussHorizontal3Sse: SSE variant operating along rows. The visible _mm_set_ps call gathers column 1 of rows i, i+1, i+2 and i+3 into one vector. tmp is a runtime-sized stack array of W x 4 floats aligned to 16 bytes (a compiler extension in C++). */ __attribute__((force_align_arg_pointer)) void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) { 
#else template void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1) { #endif float tmp[W][4] __attribute__ ((aligned (16))); __m128 Tv,Tm1v,Tp1v; __m128 c0v,c1v; c0v = _mm_set1_ps(c0); c1v = _mm_set1_ps(c1); #ifdef _OPENMP #pragma omp for #endif for (int i=0; i1) Tv = _mm_set_ps( src[i][1], src[i+1][1], src[i+2][1], src[i+3][1] ); for (int j=1; j /* gaussHorizontalSse: single-precision SSE recursive filter along rows. sigma below 0.25 takes the no-filtering branch; a second early return (its condition is missing from this text) forwards to a c0/c1 filter. Otherwise q, b0..b3 and B are derived from sigma and M is the 3x3 boundary matrix from the paper cited below. The visible backward pass computes R[j] = B*tmp[j] + b1*R[j+1] + b2*R[j+2] + b3*R[j+3], with a scalar version of the same recurrence further down. NOTE(review): the "#pragma omp for" lines visible in this function are not wrapped in #ifdef _OPENMP, unlike those in gaussHorizontal3 and gaussVertical3. */ __attribute__((force_align_arg_pointer)) void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma) { #else template void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma) { #endif if (sigma<0.25) { // dont perform filtering if (src!=dst) #pragma omp for for (int i = 0; i (src, dst, W, H, c0, c1); return; } // coefficient calculation float q = 0.98711 * sigma - 0.96330; if (sigma<2.5) q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); float b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q; float b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q; float b2 = -1.4281*q*q - 1.26661*q*q*q; float b3 = 0.422205*q*q*q; float B = 1.0 - (b1+b2+b3) / b0; b1 /= b0; b2 /= b0; b3 /= b0; // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering float M[3][3]; M[0][0] = -b3*b1+1.0-b3*b3-b2; M[0][1] = (b3+b1)*(b2+b3*b1); M[0][2] = b3*(b1+b3*b2); M[1][0] = b1+b3*b2; M[1][1] = -(b2-1.0)*(b2+b3*b1); M[1][2] = -(b3*b1+b3*b3+b2-1.0)*b3; M[2][0] = b3*b1+b2+b1*b1-b2*b2; M[2][1] = b1*b2+b3*b2*b2-b1*b3*b3-b3*b3*b3-b3*b2+b3; M[2][2] = b3*(b1+b3*b2); /* NOTE(review): M is multiplied by (1+b2+(b1-b3)*b3) and divided by (1+b1-b2+b3)*(1-b1-b2-b3) here, whereas gaussHorizontal and gaussVertical only divide by (1+b1-b2+b3)*(1+b2+(b1-b3)*b3). The code that uses M is missing from this text, so confirm both scalings are intended. */ for (int i=0; i<3; i++) for (int j=0; j<3; j++) { M[i][j] *= (1.0+b2+(b1-b3)*b3); M[i][j] /= (1.0+b1-b2+b3)*(1.0-b1-b2-b3); } float tmp[W][4] __attribute__ ((aligned (16))); float tmpV[4] __attribute__ ((aligned (16))); __m128 Rv; __m128 Tv,Tm2v,Tm3v; __m128 Bv,b1v,b2v,b3v; __m128 temp2W,temp2Wp1; Bv = _mm_set1_ps(B); b1v = _mm_set1_ps(b1); b2v = _mm_set1_ps(b2); b3v = _mm_set1_ps(b3); #pragma omp for for (int i=0; i=0; j--) { Tv = Rv; Rv = _mm_load_ps(&tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v 
* b3v; _mm_store_ps( &tmp[j][0], Rv ); Tm3v = Tm2v; Tm2v = Tv; } for (int j=0; j=0; j--) tmp[j][0] = B * tmp[j][0] + b1*tmp[j+1][0] + b2*tmp[j+2][0] + b3*tmp[j+3][0]; for (int j=0; j /* gaussHorizontal: row-wise entry point. When __SSE__ is defined and sigma is below 70 it forwards to gaussHorizontalSse and returns. Otherwise q, b0..b3, B and M are computed in double precision and the recursion runs on a scratch buffer (temp2); the first three forward values use src[i][0] in place of samples before the start of the row. */ void gaussHorizontal (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, double sigma) { #ifdef __SSE__ if(sigma < 70) { // bigger sigma only with double precision gaussHorizontalSse (src, dst, W, H, sigma); return; } #endif if (sigma<0.25) { // dont perform filtering if (src!=dst) #pragma omp for for (int i = 0; i (src, dst, buffer, W, H, c0, c1); return; } // coefficient calculation double q = 0.98711 * sigma - 0.96330; if (sigma<2.5) q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); double b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q; double b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q; double b2 = -1.4281*q*q - 1.26661*q*q*q; double b3 = 0.422205*q*q*q; double B = 1.0 - (b1+b2+b3) / b0; b1 /= b0; b2 /= b0; b3 /= b0; // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering double M[3][3]; M[0][0] = -b3*b1+1.0-b3*b3-b2; M[0][1] = (b3+b1)*(b2+b3*b1); M[0][2] = b3*(b1+b3*b2); M[1][0] = b1+b3*b2; M[1][1] = -(b2-1.0)*(b2+b3*b1); M[1][2] = -(b3*b1+b3*b3+b2-1.0)*b3; M[2][0] = b3*b1+b2+b1*b1-b2*b2; M[2][1] = b1*b2+b3*b2*b2-b1*b3*b3-b3*b3*b3-b3*b2+b3; M[2][2] = b3*(b1+b3*b2); for (int i=0; i<3; i++) for (int j=0; j<3; j++) M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3); #pragma omp for for (int i=0; i* pBuf = buffer.acquire(); double* temp2 = pBuf->data; temp2[0] = B * src[i][0] + b1*src[i][0] + b2*src[i][0] + b3*src[i][0]; temp2[1] = B * src[i][1] + b1*temp2[0] + b2*src[i][0] + b3*src[i][0]; temp2[2] = B * src[i][2] + b1*temp2[1] + b2*temp2[0] + b3*src[i][0]; for (int j=3; j=0; j--) temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3]; for (int j=0; j /* gaussVerticalSse: SSE recursive filter along columns. The coefficients are computed in double and then broadcast into float vectors. The visible backward pass stores results directly into dst with _mm_storeu_ps at column i, and columns from W-(W%4) onward go through a scalar loop. */ __attribute__((force_align_arg_pointer)) void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma) { #else template void gaussVerticalSse (T** src, T** 
dst, int W, int H, float sigma) { #endif if (sigma<0.25) { // dont perform filtering if (src!=dst) #pragma omp for for (int i = 0; i (src, dst, W, H, c0, c1); return; } // coefficient calculation double q = 0.98711 * sigma - 0.96330; if (sigma<2.5) q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); double b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q; double b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q; double b2 = -1.4281*q*q - 1.26661*q*q*q; double b3 = 0.422205*q*q*q; double B = 1.0 - (b1+b2+b3) / b0; b1 /= b0; b2 /= b0; b3 /= b0; // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering double M[3][3]; M[0][0] = -b3*b1+1.0-b3*b3-b2; M[0][1] = (b3+b1)*(b2+b3*b1); M[0][2] = b3*(b1+b3*b2); M[1][0] = b1+b3*b2; M[1][1] = -(b2-1.0)*(b2+b3*b1); M[1][2] = -(b3*b1+b3*b3+b2-1.0)*b3; M[2][0] = b3*b1+b2+b1*b1-b2*b2; M[2][1] = b1*b2+b3*b2*b2-b1*b3*b3-b3*b3*b3-b3*b2+b3; M[2][2] = b3*(b1+b3*b2); for (int i=0; i<3; i++) for (int j=0; j<3; j++) { M[i][j] *= (1.0+b2+(b1-b3)*b3); M[i][j] /= (1.0+b1-b2+b3)*(1.0-b1-b2-b3); } float tmp[H][4] __attribute__ ((aligned (16))); __m128 Rv; __m128 Tv,Tm2v,Tm3v; __m128 Bv,b1v,b2v,b3v; __m128 temp2W,temp2Wp1; Bv = _mm_set1_ps(B); b1v = _mm_set1_ps(b1); b2v = _mm_set1_ps(b2); b3v = _mm_set1_ps(b3); #ifdef _OPENMP #pragma omp for #endif for (int i=0; i=0; j--) { Tv = Rv; Rv = _mm_load_ps(&tmp[j][0]) * Bv + Tv * b1v + Tm2v * b2v + Tm3v * b3v; _mm_storeu_ps( &dst[j][i], Rv ); Tm3v = Tm2v; Tm2v = Tv; } } // Borders are done without SSE #pragma omp for for(int i=W-(W%4);i=0; j--) tmp[j][0] = B * tmp[j][0] + b1*tmp[j+1][0] + b2*tmp[j+2][0] + b3*tmp[j+3][0]; for (int j=0; j /* gaussVertical: column-wise entry point; mirrors gaussHorizontal with src[row][i] indexing, forwarding to gaussVerticalSse when __SSE__ is defined and sigma is below 70. */ void gaussVertical (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, double sigma) { #ifdef __SSE__ if(sigma < 70) { // bigger sigma only with double precision gaussVerticalSse (src, dst, W, H, sigma); return; } #endif if (sigma<0.25) { // dont perform filtering if (src!=dst) #pragma omp for for (int i = 0; i (src, dst, 
buffer, W, H, c0, c1); return; } // coefficient calculation double q = 0.98711 * sigma - 0.96330; if (sigma<2.5) q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma); double b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q; double b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q; double b2 = -1.4281*q*q - 1.26661*q*q*q; double b3 = 0.422205*q*q*q; double B = 1.0 - (b1+b2+b3) / b0; b1 /= b0; b2 /= b0; b3 /= b0; // From: Bill Triggs, Michael Sdika: Boundary Conditions for Young-van Vliet Recursive Filtering double M[3][3]; M[0][0] = -b3*b1+1.0-b3*b3-b2; M[0][1] = (b3+b1)*(b2+b3*b1); M[0][2] = b3*(b1+b3*b2); M[1][0] = b1+b3*b2; M[1][1] = -(b2-1.0)*(b2+b3*b1); M[1][2] = -(b3*b1+b3*b3+b2-1.0)*b3; M[2][0] = b3*b1+b2+b1*b1-b2*b2; M[2][1] = b1*b2+b3*b2*b2-b1*b3*b3-b3*b3*b3-b3*b2+b3; M[2][2] = b3*(b1+b3*b2); for (int i=0; i<3; i++) for (int j=0; j<3; j++) M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3); #ifdef _OPENMP #pragma omp for #endif for (int i=0; i* pBuf = buffer.acquire(); double* temp2 = pBuf->data; temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i]; temp2[1] = B * src[1][i] + b1*temp2[0] + b2*src[0][i] + b3*src[0][i]; temp2[2] = B * src[2][i] + b1*temp2[1] + b2*temp2[0] + b3*src[0][i]; for (int j=3; j=0; j--) temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3]; for (int j=0; j /* gaussDerivH: derivative filter along rows. For sigma below 0.6 the existing comment says a symmetric derivative is applied (the expression itself is missing from this text). Otherwise the recursion is seeded with src0 = src[i][1]-src[i][0] and the next inputs are the central differences 0.5*(src[i][2]-src[i][0]) and 0.5*(src[i][3]-src[i][1]). The scratch data is viewed as T* rather than double*. */ void gaussDerivH (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, double sigma) { if (sigma<0.6) { // apply symmetric derivative #ifdef _OPENMP #pragma omp for #endif for (int i=0; i* pBuf = buffer.acquire(); T* temp = (T*)pBuf->data; // double* temp = buffer->data;// replaced by 2 lines above for (int j=1; j* pBuf = buffer.acquire(); T* temp2 = (T*)pBuf->data; // double* temp2 = buffer->data;// replaced by 2 lines above double src0 = (src[i][1]-src[i][0]); temp2[0] = B * src0 + b1*src0 + b2*src0 + b3*src0; temp2[1] = B * 0.5*(src[i][2]-src[i][0]) + b1*temp2[0] + b2*src0 + b3*src0; temp2[2] = B * 0.5*(src[i][3]-src[i][1]) + b1*temp2[1] + 
b2*temp2[0] + b3*src0; for (int j=3; j=0; j--) temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3]; for (int j=0; j /* gaussDerivV: derivative filter along columns, structured like gaussDerivH with src[row][i] indexing. NOTE(review): src0 here is 0.5*(src[1][i]-src[0][i]) while gaussDerivH uses src[i][1]-src[i][0] with no 0.5 factor - confirm which is intended. The end of this function lies beyond the visible text. */ void gaussDerivV (T** src, T** dst, AlignedBufferMP &buffer, int W, int H, double sigma) { if (sigma<0.6) { // apply symmetric derivative #ifdef _OPENMP #pragma omp for #endif for (int j=0; j* pBuf = buffer.acquire(); T* temp = (T*)pBuf->data; // double* temp = buffer->data;// replaced by 2 lines above for (int i = 1; i* pBuf = buffer.acquire(); T* temp2 = (T*)pBuf->data; // double* temp2 = buffer->data;// replaced by 2 lines above double src0 = 0.5*(src[1][i]-src[0][i]); temp2[0] = B * src0 + b1*src0 + b2*src0 + b3*src0; temp2[1] = B * 0.5*(src[2][i]-src[0][i]) + b1*temp2[0] + b2*src0 + b3*src0; temp2[2] = B * 0.5*(src[3][i]-src[1][i]) + b1*temp2[1] + b2*temp2[0] + b3*src0; for (int j=3; j=0; j--) temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3]; for (int j=0; j