SSE code for vertical part of boxblur

This commit is contained in:
heckflosse
2015-11-03 14:17:56 +01:00
parent 5ddc43278c
commit 0edbe74fd3
2 changed files with 95 additions and 39 deletions

View File

@@ -23,16 +23,10 @@
#include <string.h>
#include <math.h>
#include "alignedbuffer.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include "rt_math.h"
#include "opthelper.h"
//using namespace rtengine;
namespace rtengine
{
@@ -190,6 +184,59 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
}
} else {
const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef __SSE2__
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for
#endif
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(temp[0 * W + col]);
temp1v = LVFU(temp[0 * W + col + 4]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVFU(temp[i * W + col]);
temp1v = temp1v + LVFU(temp[i * W + col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0][col], tempv);
STVFU( dst[0][col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
#else
//vertical blur
#ifdef _OPENMP
#pragma omp for
@@ -235,6 +282,7 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
}
}
#endif
#ifdef _OPENMP
#pragma omp single
#endif
@@ -271,7 +319,6 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
{
//printf("boxblur\n");
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@@ -324,9 +371,9 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
} else {
//vertical blur
#ifdef __SSE2__
__m128 leninitv = _mm_set1_ps( (float)(rady + 1));
__m128 onev = _mm_set1_ps( 1.0f );
__m128 tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
int col;
for (col = 0; col < W - 7; col += 8) {
@@ -341,15 +388,15 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
tempv = tempv / lenv;
temp1v = temp1v / lenv;
_mm_storeu_ps( &dst[0 * W + col], tempv);
_mm_storeu_ps( &dst[0 * W + col + 4], temp1v);
STVFU( dst[0 * W + col], tempv);
STVFU( dst[0 * W + col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
_mm_storeu_ps( &dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v);
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
lenv = lenp1v;
}
@@ -358,16 +405,16 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
_mm_storeu_ps( &dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v);
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
_mm_storeu_ps( &dst[row * W + col], tempv);
_mm_storeu_ps( &dst[row * W + col + 4], temp1v);
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
lenv = lenm1v;
}
}
@@ -381,12 +428,12 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
}
tempv = tempv / lenv;
_mm_storeu_ps( &dst[0 * W + col], tempv);
STVFU( dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
_mm_storeu_ps( &dst[row * W + col], tempv);
STVFU( dst[row * W + col], tempv);
lenv = lenp1v;
}
@@ -394,13 +441,13 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer,
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
_mm_storeu_ps( &dst[row * W + col], tempv);
STVFU( dst[row * W + col], tempv);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
_mm_storeu_ps( &dst[row * W + col], tempv);
STVFU( dst[row * W + col], tempv);
lenv = lenm1v;
}
}
@@ -994,9 +1041,9 @@ template<class T, class A> SSEFUNCTION void boxabsblur (T* src, A* dst, int radx
} else {
//vertical blur
#ifdef __SSE2__
__m128 leninitv = _mm_set1_ps( (float)(rady + 1));
__m128 onev = _mm_set1_ps( 1.0f );
__m128 tempv, lenv, lenp1v, lenm1v, rlenv;
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, lenv, lenp1v, lenm1v, rlenv;
for (int col = 0; col < W - 3; col += 4) {
lenv = leninitv;

View File

@@ -768,32 +768,41 @@ template<class T> void gaussianBlur(T** src, T** dst, const int W, const int H,
if(buffer) { // use iterated boxblur to approximate gaussian blur
// Compute ideal averaging filter width and number of iterations
int n = 1;
double wIdeal = sqrt((12*sigma*sigma)+1);
double wIdeal = sqrt((12 * sigma * sigma) + 1);
while(wIdeal > W || wIdeal > H) {
n++;
wIdeal = sqrt((12*sigma*sigma/n)+1);
wIdeal = sqrt((12 * sigma * sigma / n) + 1);
}
if(n<3) {
if(n < 3) {
n = 3;
wIdeal = sqrt((12*sigma*sigma/n)+1);
} else if(n>6)
n=6;
wIdeal = sqrt((12 * sigma * sigma / n) + 1);
} else if(n > 6) {
n = 6;
}
int wl = wIdeal;
if(wl%2==0) wl--;
int wu = wl+2;
double mIdeal = (12*sigma*sigma - n*wl*wl - 4*n*wl - 3*n)/(-4*wl - 4);
if(wl % 2 == 0) {
wl--;
}
int wu = wl + 2;
double mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
int m = round(mIdeal);
int sizes[n];
for(int i=0; i<n; i++) {
sizes[i] = ((i<m?wl:wu)-1)/2;
for(int i = 0; i < n; i++) {
sizes[i] = ((i < m ? wl : wu) - 1) / 2;
}
rtengine::boxblur(src,dst,buffer,sizes[0],sizes[0],W,H);
for(int i=1; i<n; i++) {
rtengine::boxblur(dst,dst,buffer, sizes[i],sizes[i],W,H);
rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
for(int i = 1; i < n; i++) {
rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
}
} else {