SSE code for vertical part of boxblur

This commit is contained in:
heckflosse
2015-11-03 14:17:56 +01:00
parent 5ddc43278c
commit 0edbe74fd3
2 changed files with 95 additions and 39 deletions

View File

@@ -23,16 +23,10 @@
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "alignedbuffer.h" #include "alignedbuffer.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include "rt_math.h" #include "rt_math.h"
#include "opthelper.h" #include "opthelper.h"
//using namespace rtengine;
namespace rtengine namespace rtengine
{ {
@@ -190,6 +184,59 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
} }
} else { } else {
const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef __SSE2__
// Vectorised vertical pass of the box blur: for every column, dst[row][col] receives the
// mean of temp over the rows row-rady .. row+rady, clipped to the rows 0 .. H-1.
// temp is read as a flat buffer with row stride W (temp[row * W + col]); dst is indexed
// as dst[row][col].
// The mean is maintained incrementally while walking down each column, so each output row
// costs one or two loads per vector instead of re-summing the whole window.
// NOTE(review): F2V, LVFU and STVFU are defined outside this view; presumably F2V
// broadcasts a scalar to all lanes and LVFU / STVFU are unaligned vector load / store of
// 4 floats - confirm against opthelper.h.
// NOTE(review): the growing-window phase reads temp row 2 * rady, so this assumes
// H > 2 * rady - TODO confirm the caller guarantees it.

// Window length for output row 0: rows 0 .. rady, i.e. rady + 1 rows.
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
// NOTE(review): these work variables are declared outside the work-sharing loop; they are
// per-thread only if this declaration itself sits inside the parallel region - confirm.
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for
#endif
// Each iteration handles 8 adjacent columns as two vectors (col and col + 4).
// Columns at or beyond the last full group of 8 are not processed by this loop.
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
// Sum rows 0 .. rady to build the initial (top-clipped) window.
tempv = LVFU(temp[0 * W + col]);
temp1v = LVFU(temp[0 * W + col + 4]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVFU(temp[i * W + col]);
temp1v = temp1v + LVFU(temp[i * W + col + 4]);
}
// Turn the sum into a mean; from here on tempv / temp1v hold the running mean.
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0][col], tempv);
STVFU( dst[0][col + 4], temp1v);
// Top border: the window gains row (row + rady) at each step and loses nothing,
// so mean_new = (mean_old * len + incoming) / (len + 1).
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
lenv = lenp1v;
}
// lenv is now 2 * rady + 1 (full window); precompute its reciprocal so the interior
// rows use a multiply instead of a divide.
rlenv = onev / lenv;
// Interior: constant window length; slide by adding the incoming row (row + rady)
// and removing the outgoing row (row - rady - 1), both scaled by 1 / len.
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
}
// Bottom border: the window only loses row (row - rady - 1) at each step,
// so mean_new = (mean_old * len - outgoing) / (len - 1).
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
#else
//vertical blur //vertical blur
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp for #pragma omp for
@@ -235,6 +282,7 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
} }
} }
#endif
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp single #pragma omp single
#endif #endif
@@ -271,7 +319,6 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H) template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
{ {
//printf("boxblur\n");
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1) //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@@ -324,9 +371,9 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
} else { } else {
//vertical blur //vertical blur
#ifdef __SSE2__ #ifdef __SSE2__
__m128 leninitv = _mm_set1_ps( (float)(rady + 1)); vfloat leninitv = F2V( (float)(rady + 1));
__m128 onev = _mm_set1_ps( 1.0f ); vfloat onev = F2V( 1.f );
__m128 tempv, temp1v, lenv, lenp1v, lenm1v, rlenv; vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
int col; int col;
for (col = 0; col < W - 7; col += 8) { for (col = 0; col < W - 7; col += 8) {
@@ -341,15 +388,15 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
tempv = tempv / lenv; tempv = tempv / lenv;
temp1v = temp1v / lenv; temp1v = temp1v / lenv;
_mm_storeu_ps( &dst[0 * W + col], tempv); STVFU( dst[0 * W + col], tempv);
_mm_storeu_ps( &dst[0 * W + col + 4], temp1v); STVFU( dst[0 * W + col + 4], temp1v);
for (int row = 1; row <= rady; row++) { for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev; lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v; temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v); STVFU( dst[row * W + col + 4], temp1v);
lenv = lenp1v; lenv = lenp1v;
} }
@@ -358,16 +405,16 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
for (int row = rady + 1; row < H - rady; row++) { for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ; temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v); STVFU( dst[row * W + col + 4], temp1v);
} }
for (int row = H - rady; row < H; row++) { for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev; lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v; temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v); STVFU( dst[row * W + col + 4], temp1v);
lenv = lenm1v; lenv = lenm1v;
} }
} }
@@ -381,12 +428,12 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
} }
tempv = tempv / lenv; tempv = tempv / lenv;
_mm_storeu_ps( &dst[0 * W + col], tempv); STVFU( dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) { for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev; lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
lenv = lenp1v; lenv = lenp1v;
} }
@@ -394,13 +441,13 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
for (int row = rady + 1; row < H - rady; row++) { for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
} }
for (int row = H - rady; row < H; row++) { for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev; lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
_mm_storeu_ps( &dst[row * W + col], tempv); STVFU( dst[row * W + col], tempv);
lenv = lenm1v; lenv = lenm1v;
} }
} }
@@ -994,9 +1041,9 @@ template<class T, class A> SSEFUNCTION void boxabsblur (T* src, A* dst, int radx
} else { } else {
//vertical blur //vertical blur
#ifdef __SSE2__ #ifdef __SSE2__
__m128 leninitv = _mm_set1_ps( (float)(rady + 1)); vfloat leninitv = F2V( (float)(rady + 1));
__m128 onev = _mm_set1_ps( 1.0f ); vfloat onev = F2V( 1.f );
__m128 tempv, lenv, lenp1v, lenm1v, rlenv; vfloat tempv, lenv, lenp1v, lenm1v, rlenv;
for (int col = 0; col < W - 3; col += 4) { for (int col = 0; col < W - 3; col += 4) {
lenv = leninitv; lenv = leninitv;

View File

@@ -768,32 +768,41 @@ template<class T> void gaussianBlur(T** src, T** dst, const int W, const int H,
if(buffer) { // use iterated boxblur to approximate gaussian blur if(buffer) { // use iterated boxblur to approximate gaussian blur
// Compute ideal averaging filter width and number of iterations // Compute ideal averaging filter width and number of iterations
int n = 1; int n = 1;
double wIdeal = sqrt((12*sigma*sigma)+1); double wIdeal = sqrt((12 * sigma * sigma) + 1);
while(wIdeal > W || wIdeal > H) { while(wIdeal > W || wIdeal > H) {
n++; n++;
wIdeal = sqrt((12*sigma*sigma/n)+1); wIdeal = sqrt((12 * sigma * sigma / n) + 1);
} }
if(n<3) { if(n < 3) {
n = 3; n = 3;
wIdeal = sqrt((12*sigma*sigma/n)+1); wIdeal = sqrt((12 * sigma * sigma / n) + 1);
} else if(n>6) } else if(n > 6) {
n=6; n = 6;
}
int wl = wIdeal; int wl = wIdeal;
if(wl%2==0) wl--;
int wu = wl+2;
double mIdeal = (12*sigma*sigma - n*wl*wl - 4*n*wl - 3*n)/(-4*wl - 4); if(wl % 2 == 0) {
wl--;
}
int wu = wl + 2;
double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
int m = round(mIdeal); int m = round(mIdeal);
int sizes[n]; int sizes[n];
for(int i=0; i<n; i++) {
sizes[i] = ((i<m?wl:wu)-1)/2; for(int i = 0; i < n; i++) {
sizes[i] = ((i < m ? wl : wu) - 1) / 2;
} }
rtengine::boxblur(src,dst,buffer,sizes[0],sizes[0],W,H);
for(int i=1; i<n; i++) { rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
rtengine::boxblur(dst,dst,buffer, sizes[i],sizes[i],W,H);
for(int i = 1; i < n; i++) {
rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
} }
} else { } else {