Speedup for gauss5x5 and gauss7x7

This commit is contained in:
Ingo Weyrich 2019-08-21 17:29:59 +02:00
parent 0c1caf6c36
commit a0f95fe9e6

View File

@ -30,8 +30,12 @@ void compute7x7kernel(float sigma, float kernel[7][7]) {
float sum = 0.f;
for (int i = -3; i <= 3; ++i) {
for (int j = -3; j <= 3; ++j) {
kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
sum += kernel[i + 3][j + 3];
if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 1.15)) {
kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
sum += kernel[i + 3][j + 3];
} else {
kernel[i + 3][j + 3] = 0.f;
}
}
}
@ -47,8 +51,12 @@ void compute5x5kernel(float sigma, float kernel[5][5]) {
float sum = 0.f;
for (int i = -2; i <= 2; ++i) {
for (int j = -2; j <= 2; ++j) {
kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
sum += kernel[i + 2][j + 2];
if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 0.84)) {
kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
sum += kernel[i + 2][j + 2];
} else {
kernel[i + 2][j + 2] = 0.f;
}
}
}
@ -247,15 +255,14 @@ template<class T> void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** REST
float kernel[7][7];
compute7x7kernel(sigma, kernel);
#ifdef __SSE2__
vfloat kernelv[7][7] ALIGNED16;
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 7; ++j) {
kernelv[i][j] = F2V(kernel[i][j]);
}
}
const vfloat onev = F2V(1.f);
#endif
const float c31 = kernel[0][2];
const float c30 = kernel[0][3];
const float c22 = kernel[1][1];
const float c21 = kernel[1][2];
const float c20 = kernel[1][3];
const float c11 = kernel[2][2];
const float c10 = kernel[2][3];
const float c00 = kernel[3][3];
#ifdef _OPENMP
#pragma omp for schedule(dynamic, 16) nowait
@ -263,26 +270,54 @@ template<class T> void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** REST
for (int i = 3; i < H - 3; ++i) {
dst[i][0] = dst[i][1] = dst[i][2] = 1.f;
int j = 3;
#ifdef __SSE2__
for (; j < W - 6; j += 4) {
vfloat valv = ZEROV;
for (int k = -3; k <= 3; ++k) {
for (int l = -3; l <= 3; ++l) {
valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
}
}
STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
}
#endif
for (; j < W - 3; ++j) {
float val = 0;
for (int k = -3; k <= 3; ++k) {
for (int l = -3; l <= 3; ++l) {
val += src[i + k][j + l] * kernel[k + 3][l + 3];
}
}
dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
// I tried hand written SSE code but gcc vectorizes better
for (int j = 3; j < W - 3; ++j) {
float val = src[i - 3][j - 1] * c31;
val += src[i - 3][j - 0] * c30;
val += src[i - 3][j + 1] * c31;
val += src[i - 2][j - 2] * c22;
val += src[i - 2][j - 1] * c21;
val += src[i - 2][j - 0] * c20;
val += src[i - 2][j + 1] * c21;
val += src[i - 2][j + 2] * c22;
val += src[i - 1][j - 3] * c31;
val += src[i - 1][j - 2] * c21;
val += src[i - 1][j - 1] * c11;
val += src[i - 1][j - 0] * c10;
val += src[i - 1][j + 1] * c11;
val += src[i - 1][j + 2] * c21;
val += src[i - 1][j + 3] * c31;
val += src[i][j - 3] * c30;
val += src[i][j - 2] * c20;
val += src[i][j - 1] * c10;
val += src[i][j - 0] * c00;
val += src[i][j + 1] * c10;
val += src[i][j + 2] * c20;
val += src[i][j + 3] * c30;
val += src[i + 1][j - 3] * c31;
val += src[i + 1][j - 2] * c21;
val += src[i + 1][j - 1] * c11;
val += src[i + 1][j - 0] * c10;
val += src[i + 1][j + 1] * c11;
val += src[i + 1][j + 2] * c21;
val += src[i + 1][j + 3] * c31;
val += src[i + 2][j - 2] * c22;
val += src[i + 2][j - 1] * c21;
val += src[i + 2][j - 0] * c20;
val += src[i + 2][j + 1] * c21;
val += src[i + 2][j + 2] * c22;
val += src[i + 3][j - 1] * c31;
val += src[i + 3][j - 0] * c30;
val += src[i + 3][j + 1] * c31;
val = val > 0.f ? val : 1.f;
dst[i][j] = std::max(divBuffer[i][j] / val, 0.f);
}
dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f;
}
@ -311,15 +346,11 @@ template<class T> void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** REST
float kernel[5][5];
compute5x5kernel(sigma, kernel);
#ifdef __SSE2__
vfloat kernelv[5][5] ALIGNED16;
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 5; ++j) {
kernelv[i][j] = F2V(kernel[i][j]);
}
}
const vfloat onev = F2V(1.f);
#endif
const float c21 = kernel[0][1];
const float c20 = kernel[0][2];
const float c11 = kernel[1][1];
const float c10 = kernel[1][2];
const float c00 = kernel[2][2];
#ifdef _OPENMP
#pragma omp for schedule(dynamic, 16) nowait
@ -327,26 +358,36 @@ template<class T> void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** REST
for (int i = 2; i < H - 2; ++i) {
dst[i][0] = dst[i][1] = 1.f;
int j = 2;
#ifdef __SSE2__
for (; j < W - 5; j += 4) {
vfloat valv = ZEROV;
for (int k = -2; k <= 2; ++k) {
for (int l = -2; l <= 2; ++l) {
valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
}
}
STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
}
#endif
for (; j < W - 2; ++j) {
float val = 0;
for (int k = -2; k <= 2; ++k) {
for (int l = -2; l <= 2; ++l) {
val += src[i + k][j + l] * kernel[k + 2][l + 2];
}
}
dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
// I tried hand written SSE code but gcc vectorizes better
for (int j = 2; j < W - 2; ++j) {
float val = src[i - 2][j - 1] * c21;
val += src[i - 2][j - 0] * c20;
val += src[i - 2][j + 1] * c21;
val += src[i - 1][j - 2] * c21;
val += src[i - 1][j - 1] * c11;
val += src[i - 1][j - 0] * c10;
val += src[i - 1][j + 1] * c11;
val += src[i - 1][j + 2] * c21;
val += src[i - 0][j - 2] * c20;
val += src[i - 0][j - 1] * c10;
val += src[i - 0][j - 0] * c00;
val += src[i - 0][j + 1] * c10;
val += src[i - 0][j + 2] * c20;
val += src[i + 1][j - 2] * c21;
val += src[i + 1][j - 1] * c11;
val += src[i + 1][j - 0] * c10;
val += src[i + 1][j + 1] * c11;
val += src[i + 1][j + 2] * c21;
val += src[i + 2][j - 1] * c21;
val += src[i + 2][j - 0] * c20;
val += src[i + 2][j + 1] * c21;
val = val > 0.f ? val : 1.f;
dst[i][j] = std::max(divBuffer[i][j] / val, 0.f);
}
dst[i][W - 2] = dst[i][W - 1] = 1.f;
}
@ -374,39 +415,66 @@ template<class T> void gauss7x7mult (T** RESTRICT src, T** RESTRICT dst, const i
float kernel[7][7];
compute7x7kernel(sigma, kernel);
#ifdef __SSE2__
vfloat kernelv[7][7] ALIGNED16;
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 7; ++j) {
kernelv[i][j] = F2V(kernel[i][j]);
}
}
#endif
const float c31 = kernel[0][2];
const float c30 = kernel[0][3];
const float c22 = kernel[1][1];
const float c21 = kernel[1][2];
const float c20 = kernel[1][3];
const float c11 = kernel[2][2];
const float c10 = kernel[2][3];
const float c00 = kernel[3][3];
#ifdef _OPENMP
#pragma omp for schedule(dynamic, 16)
#endif
for (int i = 3; i < H - 3; ++i) {
int j = 3;
#ifdef __SSE2__
for (; j < W - 6; j += 4) {
vfloat valv = ZEROV;
for (int k = -3; k <= 3; ++k) {
for (int l = -3; l <= 3; ++l) {
valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
}
}
STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
}
#endif
for (; j < W - 3; ++j) {
float val = 0;
for (int k = -3; k <= 3; ++k) {
for (int l = -3; l <= 3; ++l) {
val += src[i + k][j + l] * kernel[k + 3][l + 3];
}
}
// I tried hand written SSE code but gcc vectorizes better
for (int j = 3; j < W - 3; ++j) {
float val = src[i - 3][j - 1] * c31;
val += src[i - 3][j - 0] * c30;
val += src[i - 3][j + 1] * c31;
val += src[i - 2][j - 2] * c22;
val += src[i - 2][j - 1] * c21;
val += src[i - 2][j - 0] * c20;
val += src[i - 2][j + 1] * c21;
val += src[i - 2][j + 2] * c22;
val += src[i - 1][j - 3] * c31;
val += src[i - 1][j - 2] * c21;
val += src[i - 1][j - 1] * c11;
val += src[i - 1][j - 0] * c10;
val += src[i - 1][j + 1] * c11;
val += src[i - 1][j + 2] * c21;
val += src[i - 1][j + 3] * c31;
val += src[i][j - 3] * c30;
val += src[i][j - 2] * c20;
val += src[i][j - 1] * c10;
val += src[i][j - 0] * c00;
val += src[i][j + 1] * c10;
val += src[i][j + 2] * c20;
val += src[i][j + 3] * c30;
val += src[i + 1][j - 3] * c31;
val += src[i + 1][j - 2] * c21;
val += src[i + 1][j - 1] * c11;
val += src[i + 1][j - 0] * c10;
val += src[i + 1][j + 1] * c11;
val += src[i + 1][j + 2] * c21;
val += src[i + 1][j + 3] * c31;
val += src[i + 2][j - 2] * c22;
val += src[i + 2][j - 1] * c21;
val += src[i + 2][j - 0] * c20;
val += src[i + 2][j + 1] * c21;
val += src[i + 2][j + 2] * c22;
val += src[i + 3][j - 1] * c31;
val += src[i + 3][j - 0] * c30;
val += src[i + 3][j + 1] * c31;
dst[i][j] *= val;
}
}
@ -417,39 +485,46 @@ template<class T> void gauss5x5mult (T** RESTRICT src, T** RESTRICT dst, const i
float kernel[5][5];
compute5x5kernel(sigma, kernel);
#ifdef __SSE2__
vfloat kernelv[5][5] ALIGNED16;
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 5; ++j) {
kernelv[i][j] = F2V(kernel[i][j]);
}
}
#endif
const float c21 = kernel[0][1];
const float c20 = kernel[0][2];
const float c11 = kernel[1][1];
const float c10 = kernel[1][2];
const float c00 = kernel[2][2];
#ifdef _OPENMP
#pragma omp for schedule(dynamic, 16)
#endif
for (int i = 2; i < H - 2; ++i) {
int j = 2;
#ifdef __SSE2__
for (; j < W - 5; j += 4) {
vfloat valv = ZEROV;
for (int k = -2; k <= 2; ++k) {
for (int l = -2; l <= 2; ++l) {
valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
}
}
STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
}
#endif
for (; j < W - 2; ++j) {
float val = 0;
for (int k = -2; k <= 2; ++k) {
for (int l = -2; l <= 2; ++l) {
val += src[i + k][j + l] * kernel[k + 2][l + 2];
}
}
// I tried hand written SSE code but gcc vectorizes better
for (int j = 2; j < W - 2; ++j) {
float val = src[i - 2][j - 1] * c21;
val += src[i - 2][j - 0] * c20;
val += src[i - 2][j + 1] * c21;
val += src[i - 1][j - 2] * c21;
val += src[i - 1][j - 1] * c11;
val += src[i - 1][j - 0] * c10;
val += src[i - 1][j + 1] * c11;
val += src[i - 1][j + 2] * c21;
val += src[i - 0][j - 2] * c20;
val += src[i - 0][j - 1] * c10;
val += src[i - 0][j - 0] * c00;
val += src[i - 0][j + 1] * c10;
val += src[i - 0][j + 2] * c20;
val += src[i + 1][j - 2] * c21;
val += src[i + 1][j - 1] * c11;
val += src[i + 1][j - 0] * c10;
val += src[i + 1][j + 1] * c11;
val += src[i + 1][j + 2] * c21;
val += src[i + 2][j - 1] * c21;
val += src[i + 2][j - 0] * c20;
val += src[i + 2][j + 1] * c21;
dst[i][j] *= val;
}
}
@ -1390,6 +1465,8 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
{
static constexpr auto GAUSS_SKIP = 0.25;
static constexpr auto GAUSS_3X3_LIMIT = 0.6;
static constexpr auto GAUSS_5X5_LIMIT = 0.84;
static constexpr auto GAUSS_7X7_LIMIT = 1.15;
static constexpr auto GAUSS_DOUBLE = 25.0;
if(buffer) {
@ -1488,9 +1565,9 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
if (sigma < GAUSS_DOUBLE) {
switch (gausstype) {
case GAUSS_MULT : {
if (sigma < 0.84 && src != dst) {
if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
gauss5x5mult(src, dst, W, H, sigma);
} else if (sigma < 1.15f && src != dst) {
} else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
gauss7x7mult(src, dst, W, H, sigma);
} else {
gaussHorizontalSse<T> (src, src, W, H, sigma);
@ -1500,9 +1577,9 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
}
case GAUSS_DIV : {
if (sigma < 0.84f && src != dst) {
if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
gauss5x5div (src, dst, buffer2, W, H, sigma);
} else if (sigma < 1.15f && src != dst) {
} else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
gauss7x7div (src, dst, buffer2, W, H, sigma);
} else {
gaussHorizontalSse<T> (src, dst, W, H, sigma);
@ -1527,14 +1604,26 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
if (sigma < GAUSS_DOUBLE) {
switch (gausstype) {
case GAUSS_MULT : {
gaussHorizontal<T> (src, src, W, H, sigma);
gaussVerticalmult<T> (src, dst, W, H, sigma);
if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
gauss5x5mult(src, dst, W, H, sigma);
} else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
gauss7x7mult(src, dst, W, H, sigma);
} else {
gaussHorizontal<T> (src, src, W, H, sigma);
gaussVerticalmult<T> (src, dst, W, H, sigma);
}
break;
}
case GAUSS_DIV : {
gaussHorizontal<T> (src, dst, W, H, sigma);
gaussVerticaldiv<T> (dst, dst, buffer2, W, H, sigma);
if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
gauss5x5div (src, dst, buffer2, W, H, sigma);
} else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
gauss7x7div (src, dst, buffer2, W, H, sigma);
} else {
gaussHorizontal<T> (src, dst, W, H, sigma);
gaussVerticaldiv<T> (dst, dst, buffer2, W, H, sigma);
}
break;
}