SSE code for vertical part of boxblur

2015-11-03 14:17:56 +01:00
parent 5ddc43278c
commit 0edbe74fd3
2 changed files with 95 additions and 39 deletions
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -23,16 +23,10 @@
 #include <string.h>
 #include <math.h>
 #include "alignedbuffer.h"
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 #include "rt_math.h"
 #include "opthelper.h"


-//using namespace rtengine;
-
 namespace rtengine
 {

@@ -190,6 +184,59 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
            }
    } else {
        const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
+#ifdef __SSE2__
+        vfloat  leninitv = F2V( (float)(rady + 1)); // window length for row 0: rows 0..rady
+        vfloat  onev = F2V( 1.f ); // step by which the window length grows/shrinks at the borders
+        vfloat  tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
+        // Vertical box blur: dst[row] = mean of temp rows [row - rady, row + rady], clipped to [0, H).
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+        // 8 columns per iteration as two vectors (at col and col + 4); columns beyond the last full group of 8 are not handled by this loop.
+        for (int col = 0; col < W - 7; col += 8) {
+            lenv = leninitv;
+            tempv = LVFU(temp[0 * W + col]);
+            temp1v = LVFU(temp[0 * W + col + 4]);
+            // sum temp rows 0..rady (row 0 was loaded above)
+            for (int i = 1; i <= rady; i++) {
+                tempv = tempv + LVFU(temp[i * W + col]);
+                temp1v = temp1v + LVFU(temp[i * W + col + 4]);
+            }
+            // turn the sums into means; from here on tempv/temp1v hold the running mean
+            tempv = tempv / lenv;
+            temp1v = temp1v / lenv;
+            STVFU( dst[0][col], tempv);
+            STVFU( dst[0][col + 4], temp1v);
+            // top border: the window gains row (row + rady) and its length grows by one per row
+            for (int row = 1; row <= rady; row++) {
+                lenp1v = lenv + onev;
+                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
+                temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
+                STVFU( dst[row][col], tempv);
+                STVFU( dst[row][col + 4], temp1v);
+                lenv = lenp1v;
+            }
+            // lenv is now 2 * rady + 1 (full window); use its reciprocal in the main loop instead of dividing
+            rlenv = onev / lenv;
+            // interior rows: add the entering row (row + rady) and drop the leaving row (row - rady - 1)
+            for (int row = rady + 1; row < H - rady; row++) {
+                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
+                temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
+                STVFU( dst[row][col], tempv);
+                STVFU( dst[row][col + 4], temp1v);
+            }
+            // bottom border: the window only loses row (row - rady - 1) and its length shrinks by one per row
+            for (int row = H - rady; row < H; row++) {
+                lenm1v = lenv - onev;
+                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
+                temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
+                STVFU( dst[row][col], tempv);
+                STVFU( dst[row][col + 4], temp1v);
+                lenv = lenm1v;
+            }
+        }
+
+#else
        //vertical blur
 #ifdef _OPENMP
        #pragma omp for
@@ -235,6 +282,7 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
            }
        }

+#endif
 #ifdef _OPENMP
        #pragma omp single
 #endif
@@ -271,7 +319,6 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
 template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
 {

-//printf("boxblur\n");
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)

@@ -324,9 +371,9 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
    } else {
        //vertical blur
 #ifdef __SSE2__
-        __m128  leninitv = _mm_set1_ps( (float)(rady + 1));
-        __m128  onev = _mm_set1_ps( 1.0f );
-        __m128  tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
+        vfloat  leninitv = F2V( (float)(rady + 1));
+        vfloat  onev = F2V( 1.f );
+        vfloat  tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
        int col;

        for (col = 0; col < W - 7; col += 8) {
@@ -341,15 +388,15 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,

            tempv = tempv / lenv;
            temp1v = temp1v / lenv;
-            _mm_storeu_ps( &dst[0 * W + col], tempv);
-            _mm_storeu_ps( &dst[0 * W + col + 4], temp1v);
+            STVFU( dst[0 * W + col], tempv);
+            STVFU( dst[0 * W + col + 4], temp1v);

            for (int row = 1; row <= rady; row++) {
                lenp1v = lenv + onev;
                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
                temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
-                _mm_storeu_ps( &dst[row * W + col + 4], temp1v);
+                STVFU( dst[row * W + col], tempv);
+                STVFU( dst[row * W + col + 4], temp1v);
                lenv = lenp1v;
            }

@@ -358,16 +405,16 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
            for (int row = rady + 1; row < H - rady; row++) {
                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
                temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
-                _mm_storeu_ps( &dst[row * W + col + 4], temp1v);
+                STVFU( dst[row * W + col], tempv);
+                STVFU( dst[row * W + col + 4], temp1v);
            }

            for (int row = H - rady; row < H; row++) {
                lenm1v = lenv - onev;
                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
                temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
-                _mm_storeu_ps( &dst[row * W + col + 4], temp1v);
+                STVFU( dst[row * W + col], tempv);
+                STVFU( dst[row * W + col + 4], temp1v);
                lenv = lenm1v;
            }
        }
@@ -381,12 +428,12 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
            }

            tempv = tempv / lenv;
-            _mm_storeu_ps( &dst[0 * W + col], tempv);
+            STVFU( dst[0 * W + col], tempv);

            for (int row = 1; row <= rady; row++) {
                lenp1v = lenv + onev;
                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
+                STVFU( dst[row * W + col], tempv);
                lenv = lenp1v;
            }

@@ -394,13 +441,13 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,

            for (int row = rady + 1; row < H - rady; row++) {
                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
+                STVFU( dst[row * W + col], tempv);
            }

            for (int row = H - rady; row < H; row++) {
                lenm1v = lenv - onev;
                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                _mm_storeu_ps( &dst[row * W + col], tempv);
+                STVFU( dst[row * W + col], tempv);
                lenv = lenm1v;
            }
        }
@@ -994,9 +1041,9 @@ template<class T, class A> SSEFUNCTION void boxabsblur (T* src, A* dst, int radx
    } else {
        //vertical blur
 #ifdef __SSE2__
-        __m128  leninitv = _mm_set1_ps( (float)(rady + 1));
-        __m128  onev = _mm_set1_ps( 1.0f );
-        __m128  tempv, lenv, lenp1v, lenm1v, rlenv;
+        vfloat  leninitv = F2V( (float)(rady + 1));
+        vfloat  onev = F2V( 1.f );
+        vfloat  tempv, lenv, lenp1v, lenm1v, rlenv;

        for (int col = 0; col < W - 3; col += 4) {
            lenv = leninitv;
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@@ -768,32 +768,41 @@ template<class T> void gaussianBlur(T** src, T** dst, const int W, const int H,
    if(buffer) { // use iterated boxblur to approximate gaussian blur
        // Compute ideal averaging filter width and number of iterations
        int n = 1;
-        double wIdeal = sqrt((12*sigma*sigma)+1);
+        double wIdeal = sqrt((12 * sigma * sigma) + 1);
+
        while(wIdeal > W || wIdeal > H) {
            n++;
-            wIdeal = sqrt((12*sigma*sigma/n)+1);
+            wIdeal = sqrt((12 * sigma * sigma / n) + 1);
        }

-        if(n<3) {
+        if(n < 3) {
            n = 3;
-            wIdeal = sqrt((12*sigma*sigma/n)+1);
-        } else if(n>6)
-            n=6;
+            wIdeal = sqrt((12 * sigma * sigma / n) + 1);
+        } else if(n > 6) {
+            n = 6;
+        }

        int wl = wIdeal;
-        if(wl%2==0) wl--;
-        int wu = wl+2;

-        double mIdeal = (12*sigma*sigma - n*wl*wl - 4*n*wl - 3*n)/(-4*wl - 4);
+        if(wl % 2 == 0) {
+            wl--;
+        }
+
+        int wu = wl + 2;
+
+        double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
        int m = round(mIdeal);

        int sizes[n];
-        for(int i=0; i<n; i++) {
-            sizes[i] = ((i<m?wl:wu)-1)/2;
+
+        for(int i = 0; i < n; i++) {
+            sizes[i] = ((i < m ? wl : wu) - 1) / 2;
        }
-        rtengine::boxblur(src,dst,buffer,sizes[0],sizes[0],W,H);
-        for(int i=1; i<n; i++) {
-            rtengine::boxblur(dst,dst,buffer, sizes[i],sizes[i],W,H);
+
+        rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
+
+        for(int i = 1; i < n; i++) {
+            rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
        }

    } else {