From 0edbe74fd3c85a79b1a9c57433ed01d10852745f Mon Sep 17 00:00:00 2001 From: heckflosse Date: Tue, 3 Nov 2015 14:17:56 +0100 Subject: [PATCH] SSE code for vertical part of boxblur --- rtengine/boxblur.h | 97 ++++++++++++++++++++++++++++++++++------------ rtengine/gauss.h | 37 +++++++++++------- 2 files changed, 95 insertions(+), 39 deletions(-) diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h index f1a444b2d..651dfd51a 100644 --- a/rtengine/boxblur.h +++ b/rtengine/boxblur.h @@ -23,16 +23,10 @@ #include #include #include "alignedbuffer.h" -#ifdef _OPENMP -#include -#endif - #include "rt_math.h" #include "opthelper.h" -//using namespace rtengine; - namespace rtengine { @@ -190,6 +184,59 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, } } else { const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache +#ifdef __SSE2__ + vfloat leninitv = F2V( (float)(rady + 1)); + vfloat onev = F2V( 1.f ); + vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv; + +#ifdef _OPENMP + #pragma omp for +#endif + + for (int col = 0; col < W - 7; col += 8) { + lenv = leninitv; + tempv = LVFU(temp[0 * W + col]); + temp1v = LVFU(temp[0 * W + col + 4]); + + for (int i = 1; i <= rady; i++) { + tempv = tempv + LVFU(temp[i * W + col]); + temp1v = temp1v + LVFU(temp[i * W + col + 4]); + } + + tempv = tempv / lenv; + temp1v = temp1v / lenv; + STVFU( dst[0][col], tempv); + STVFU( dst[0][col + 4], temp1v); + + for (int row = 1; row <= rady; row++) { + lenp1v = lenv + onev; + tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; + temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v; + STVFU( dst[row][col], tempv); + STVFU( dst[row][col + 4], temp1v); + lenv = lenp1v; + } + + rlenv = onev / lenv; + + for (int row = rady + 1; row < H - rady; row++) { + tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; + temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ; + STVFU( dst[row][col], tempv); + STVFU( dst[row][col + 4], temp1v); + } + + for (int row = H - rady; row < H; row++) { + lenm1v = lenv - onev; + tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; + temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v; + STVFU( dst[row][col], tempv); + STVFU( dst[row][col + 4], temp1v); + lenv = lenm1v; + } + } + +#else //vertical blur #ifdef _OPENMP #pragma omp for @@ -235,6 +282,7 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, } } +#endif #ifdef _OPENMP #pragma omp single #endif @@ -271,7 +319,6 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H) { -//printf("boxblur\n"); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1) @@ -324,9 +371,9 @@ template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, } else { //vertical blur #ifdef __SSE2__ - __m128 leninitv = _mm_set1_ps( (float)(rady + 1)); - __m128 onev = _mm_set1_ps( 1.0f ); - __m128 tempv, temp1v, lenv, lenp1v, lenm1v, rlenv; + vfloat leninitv = F2V( (float)(rady + 1)); + vfloat onev = F2V( 1.f ); + vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv; int col; for (col = 0; col < W - 7; col += 8) { @@ -341,15 +388,15 @@ template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, tempv = tempv / lenv; temp1v = temp1v / lenv; - _mm_storeu_ps( &dst[0 * W + col], tempv); - _mm_storeu_ps( &dst[0 * W + col + 4], temp1v); + STVFU( dst[0 * W + col], tempv); + STVFU( dst[0 * W + col + 4], temp1v); for (int row = 1; row <= rady; row++) { lenp1v = lenv + onev; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v; - _mm_storeu_ps( &dst[row * W + col], tempv); - _mm_storeu_ps( &dst[row * W + col + 4], temp1v); + STVFU( dst[row * W + col], tempv); + STVFU( dst[row * W + col + 4], temp1v); lenv = lenp1v; } @@ -358,16 +405,16 @@ template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, for (int row = rady + 1; row < H - rady; row++) { tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ; - _mm_storeu_ps( &dst[row * W + col], tempv); - _mm_storeu_ps( &dst[row * W + col + 4], temp1v); + STVFU( dst[row * W + col], tempv); + STVFU( dst[row * W + col + 4], temp1v); } for (int row = H - rady; row < H; row++) { lenm1v = lenv - onev; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v; - _mm_storeu_ps( &dst[row * W + col], tempv); - _mm_storeu_ps( &dst[row * W + col + 4], temp1v); + STVFU( dst[row * W + col], tempv); + STVFU( dst[row * W + col + 4], temp1v); lenv = lenm1v; } } @@ -381,12 +428,12 @@ template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, } tempv = tempv / lenv; - _mm_storeu_ps( &dst[0 * W + col], tempv); + STVFU( dst[0 * W + col], tempv); for (int row = 1; row <= rady; row++) { lenp1v = lenv + onev; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; - _mm_storeu_ps( &dst[row * W + col], tempv); + STVFU( dst[row * W + col], tempv); lenv = lenp1v; } @@ -394,13 +441,13 @@ template SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, for (int row = rady + 1; row < H - rady; row++) { tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; - _mm_storeu_ps( &dst[row * W + col], tempv); + STVFU( dst[row * W + col], tempv); } for (int row = H - rady; row < H; row++) { lenm1v = lenv - onev; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; - _mm_storeu_ps( &dst[row * W + col], tempv); + STVFU( dst[row * W + col], tempv); lenv = lenm1v; } } @@ -994,9 +1041,9 @@ template SSEFUNCTION void boxabsblur (T* src, A* dst, int radx } else { //vertical blur #ifdef __SSE2__ - __m128 leninitv = _mm_set1_ps( (float)(rady + 1)); - __m128 onev = _mm_set1_ps( 1.0f ); - __m128 tempv, lenv, lenp1v, lenm1v, rlenv; + vfloat leninitv = F2V( (float)(rady + 1)); + vfloat onev = F2V( 1.f ); + vfloat tempv, lenv, lenp1v, lenm1v, rlenv; for (int col = 0; col < W - 3; col += 4) { lenv = leninitv; diff --git a/rtengine/gauss.h b/rtengine/gauss.h index 85def3b6a..4136e6f6d 100644 --- a/rtengine/gauss.h +++ b/rtengine/gauss.h @@ -768,32 +768,41 @@ template void gaussianBlur(T** src, T** dst, const int W, const int H, if(buffer) { // use iterated boxblur to approximate gaussian blur // Compute ideal averaging filter width and number of iterations int n = 1; - double wIdeal = sqrt((12*sigma*sigma)+1); + double wIdeal = sqrt((12 * sigma * sigma) + 1); + while(wIdeal > W || wIdeal > H) { n++; - wIdeal = sqrt((12*sigma*sigma/n)+1); + wIdeal = sqrt((12 * sigma * sigma / n) + 1); } - if(n<3) { + if(n < 3) { n = 3; - wIdeal = sqrt((12*sigma*sigma/n)+1); - } else if(n>6) - n=6; + wIdeal = sqrt((12 * sigma * sigma / n) + 1); + } else if(n > 6) { + n = 6; + } int wl = wIdeal; - if(wl%2==0) wl--; - int wu = wl+2; - double mIdeal = (12*sigma*sigma - n*wl*wl - 4*n*wl - 3*n)/(-4*wl - 4); + if(wl % 2 == 0) { + wl--; + } + + int wu = wl + 2; + + double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4); int m = round(mIdeal); int sizes[n]; - for(int i=0; i