Use std::vector Instead of Manually Managed Memory

This commit is contained in:
Michael Jaschob 2023-07-27 14:31:53 -06:00
parent 5b2811f42b
commit 31bbe91760
No known key found for this signature in database
GPG Key ID: 7739AE8FCCDD1F6A

View File

@ -19,6 +19,7 @@
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <vector>
#include "gauss.h" #include "gauss.h"
@ -1119,8 +1120,6 @@ template<class T> void gaussVerticalSsediv (T** src, T** dst, T** divBuffer, con
template<class T> void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma) template<class T> void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma)
{ {
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
double b1, b2, b3, B, M[3][3]; double b1, b2, b3, B, M[3][3];
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M); calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
@ -1131,40 +1130,45 @@ template<class T> void gaussVertical (T** src, T** dst, const int W, const int H
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
static const int numcols = 8; static const int numcols = 8;
double *temp2 ALIGNED16 = new double[H * numcols]; std::vector<double> temp2(H * numcols);
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
auto coord =
[](const int &x, const int &y) -> int {
return x * numcols + y;
};
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp for nowait #pragma omp for nowait
#endif #endif
for (unsigned int i = 0; i < static_cast<unsigned>(std::max(0, W - numcols + 1)); i += numcols) { for (unsigned int i = 0; i < static_cast<unsigned>(std::max(0, W - numcols + 1)); i += numcols) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(0, k)] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(1, k)] = B * src[1][i + k] + b1 * temp2[coord(0, k)] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k]; temp2[coord(2, k)] = B * src[2][i + k] + b1 * temp2[coord(1, k)] + b2 * temp2[coord(0, k)] + b3 * src[0][i + k];
} }
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k); temp2[coord(j, k)] = B * src[j][i + k] + b1 * temp2[coord(j - 1, k)] + b2 * temp2[coord(j - 2, k)] + b3 * temp2[coord(j - 3, k)];
} }
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[0][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[0][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[1][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[1][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[2][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[2][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[H - 1][i + k] = TEMP2(H - 1, k) = temp2Hm1[k]; dst[H - 1][i + k] = temp2[coord(H - 1, k)] = temp2Hm1[k];
dst[H - 2][i + k] = TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k]; dst[H - 2][i + k] = temp2[coord(H - 2, k)] = B * temp2[coord(H - 2, k)] + b1 * temp2[coord(H - 1, k)] + b2 * temp2H[k] + b3 * temp2Hp1[k];
dst[H - 3][i + k] = TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k]; dst[H - 3][i + k] = temp2[coord(H - 3, k)] = B * temp2[coord(H - 3, k)] + b1 * temp2[coord(H - 2, k)] + b2 * temp2[coord(H - 1, k)] + b3 * temp2H[k];
} }
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[j][i + k] = TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k); dst[j][i + k] = temp2[coord(j, k)] = B * temp2[coord(j, k)] + b1 * temp2[coord(j + 1, k)] + b2 * temp2[coord(j + 2, k)] + b3 * temp2[coord(j + 3, k)];
} }
} }
} }
@ -1175,35 +1179,31 @@ template<class T> void gaussVertical (T** src, T** dst, const int W, const int H
// process remaining columns // process remaining columns
for (int i = W - (W % numcols); i < W; i++) { for (int i = W - (W % numcols); i < W; i++) {
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(0, 0)] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(1, 0)] = B * src[1][i] + b1 * temp2[coord(0, 0)] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i]; temp2[coord(2, 0)] = B * src[2][i] + b1 * temp2[coord(1, 0)] + b2 * temp2[coord(0, 0)] + b3 * src[0][i];
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0); temp2[coord(j, 0)] = B * src[j][i] + b1 * temp2[coord(j - 1, 0)] + b2 * temp2[coord(j - 2, 0)] + b3 * temp2[coord(j - 3, 0)];
} }
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[0][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[0][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2H = src[H - 1][i] + M[1][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[1][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[1][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[2][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[2][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
dst[H - 1][i] = TEMP2(H - 1, 0) = temp2Hm1; dst[H - 1][i] = temp2[coord(H - 1, 0)] = temp2Hm1;
dst[H - 2][i] = TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1; dst[H - 2][i] = temp2[coord(H - 2, 0)] = B * temp2[coord(H - 2, 0)] + b1 * temp2[coord(H - 1, 0)] + b2 * temp2H + b3 * temp2Hp1;
dst[H - 3][i] = TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H; dst[H - 3][i] = temp2[coord(H - 3, 0)] = B * temp2[coord(H - 3, 0)] + b1 * temp2[coord(H - 2, 0)] + b2 * temp2[coord(H - 1, 0)] + b3 * temp2H;
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
dst[j][i] = TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0); dst[j][i] = temp2[coord(j, 0)] = B * temp2[coord(j, 0)] + b1 * temp2[coord(j + 1, 0)] + b2 * temp2[coord(j + 2, 0)] + b3 * temp2[coord(j + 3, 0)];
} }
} }
delete [] temp2;
} }
#ifndef __SSE2__ #ifndef __SSE2__
template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma) template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma)
{ {
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
double b1, b2, b3, B, M[3][3]; double b1, b2, b3, B, M[3][3];
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M); calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
@ -1214,40 +1214,45 @@ template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
static const int numcols = 8; static const int numcols = 8;
double *temp2 ALIGNED16 = new double[H * numcols]; std::vector<double> temp2(H * numcols);
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
auto coord =
[](const int &x, const int &y) -> int {
return x * numcols + y;
};
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp for nowait #pragma omp for nowait
#endif #endif
for (int i = 0; i < W - numcols + 1; i += numcols) { for (int i = 0; i < W - numcols + 1; i += numcols) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(0, k)] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(1, k)] = B * src[1][i + k] + b1 * temp2[coord(0, k)] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k]; temp2[coord(2, k)] = B * src[2][i + k] + b1 * temp2[coord(1, k)] + b2 * temp2[coord(0, k)] + b3 * src[0][i + k];
} }
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k); temp2[coord(j, k)] = B * src[j][i + k] + b1 * temp2[coord(j - 1, k)] + b2 * temp2[coord(j - 2, k)] + b3 * temp2[coord(j - 3, k)];
} }
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[0][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[0][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[1][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[1][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[2][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[2][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[H - 1][i + k] = rtengine::max(divBuffer[H - 1][i + k] / (TEMP2(H - 1, k) = temp2Hm1[k]), 0.0); dst[H - 1][i + k] = rtengine::max(divBuffer[H - 1][i + k] / (temp2[coord(H - 1, k)] = temp2Hm1[k]), 0.0);
dst[H - 2][i + k] = rtengine::max(divBuffer[H - 2][i + k] / (TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0); dst[H - 2][i + k] = rtengine::max(divBuffer[H - 2][i + k] / (temp2[coord(H - 2, k)] = B * temp2[coord(H - 2, k)] + b1 * temp2[coord(H - 1, k)] + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0);
dst[H - 3][i + k] = rtengine::max(divBuffer[H - 3][i + k] / (TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k]), 0.0); dst[H - 3][i + k] = rtengine::max(divBuffer[H - 3][i + k] / (temp2[coord(H - 3, k)] = B * temp2[coord(H - 3, k)] + b1 * temp2[coord(H - 2, k)] + b2 * temp2[coord(H - 1, k)] + b3 * temp2H[k]), 0.0);
} }
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[j][i + k] = rtengine::max(divBuffer[j][i + k] / (TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k)), 0.0); dst[j][i + k] = rtengine::max(divBuffer[j][i + k] / (temp2[coord(j, k)] = B * temp2[coord(j, k)] + b1 * temp2[coord(j + 1, k)] + b2 * temp2[coord(j + 2, k)] + b3 * temp2[coord(j + 3, k)]), 0.0);
} }
} }
} }
@ -1258,34 +1263,30 @@ template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const
// process remaining columns // process remaining columns
for (int i = W - (W % numcols); i < W; i++) { for (int i = W - (W % numcols); i < W; i++) {
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(0, 0)] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(1, 0)] = B * src[1][i] + b1 * temp2[coord(0, 0)] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i]; temp2[coord(2, 0)] = B * src[2][i] + b1 * temp2[coord(1, 0)] + b2 * temp2[coord(0, 0)] + b3 * src[0][i];
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0); temp2[coord(j, 0)] = B * src[j][i] + b1 * temp2[coord(j - 1, 0)] + b2 * temp2[coord(j - 2, 0)] + b3 * temp2[coord(j - 3, 0)];
} }
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[0][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[0][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2H = src[H - 1][i] + M[1][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[1][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[1][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[2][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[2][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
dst[H - 1][i] = rtengine::max(divBuffer[H - 1][i] / (TEMP2(H - 1, 0) = temp2Hm1), 0.0); dst[H - 1][i] = rtengine::max(divBuffer[H - 1][i] / (temp2[coord(H - 1, 0)] = temp2Hm1), 0.0);
dst[H - 2][i] = rtengine::max(divBuffer[H - 2][i] / (TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1), 0.0); dst[H - 2][i] = rtengine::max(divBuffer[H - 2][i] / (temp2[coord(H - 2, 0)] = B * temp2[coord(H - 2, 0)] + b1 * temp2[coord(H - 1, 0)] + b2 * temp2H + b3 * temp2Hp1), 0.0);
dst[H - 3][i] = rtengine::max(divBuffer[H - 3][i] / (TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H), 0.0); dst[H - 3][i] = rtengine::max(divBuffer[H - 3][i] / (temp2[coord(H - 3, 0)] = B * temp2[coord(H - 3, 0)] + b1 * temp2[coord(H - 2, 0)] + b2 * temp2[coord(H - 1, 0)] + b3 * temp2H), 0.0);
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
dst[j][i] = rtengine::max(divBuffer[j][i] / (TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0)), 0.0); dst[j][i] = rtengine::max(divBuffer[j][i] / (temp2[coord(j, 0)] = B * temp2[coord(j, 0)] + b1 * temp2[coord(j + 1, 0)] + b2 * temp2[coord(j + 2, 0)] + b3 * temp2[coord(j + 3, 0)]), 0.0);
} }
} }
delete [] temp2;
} }
template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma) template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma)
{ {
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
double b1, b2, b3, B, M[3][3]; double b1, b2, b3, B, M[3][3];
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M); calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
@ -1296,40 +1297,45 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H) // process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
static const int numcols = 8; static const int numcols = 8;
double *temp2 ALIGNED16 = new double[H * numcols]; std::vector<double> temp2(H * numcols);
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols]; double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
auto coord =
[](const int &x, const int &y) -> int {
return x * numcols + y;
};
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp for nowait #pragma omp for nowait
#endif #endif
for (int i = 0; i < W - numcols + 1; i += numcols) { for (int i = 0; i < W - numcols + 1; i += numcols) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(0, k)] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k]; temp2[coord(1, k)] = B * src[1][i + k] + b1 * temp2[coord(0, k)] + b2 * src[0][i + k] + b3 * src[0][i + k];
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k]; temp2[coord(2, k)] = B * src[2][i + k] + b1 * temp2[coord(1, k)] + b2 * temp2[coord(0, k)] + b3 * src[0][i + k];
} }
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k); temp2[coord(j, k)] = B * src[j][i + k] + b1 * temp2[coord(j - 1, k)] + b2 * temp2[coord(j - 2, k)] + b3 * temp2[coord(j - 3, k)];
} }
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[0][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[0][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[1][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[1][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]); temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[coord(H - 1, k)] - src[H - 1][i + k]) + M[2][1] * (temp2[coord(H - 2, k)] - src[H - 1][i + k]) + M[2][2] * (temp2[coord(H - 3, k)] - src[H - 1][i + k]);
} }
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[H - 1][i + k] *= TEMP2(H - 1, k) = temp2Hm1[k]; dst[H - 1][i + k] *= temp2[coord(H - 1, k)] = temp2Hm1[k];
dst[H - 2][i + k] *= TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k]; dst[H - 2][i + k] *= temp2[coord(H - 2, k)] = B * temp2[coord(H - 2, k)] + b1 * temp2[coord(H - 1, k)] + b2 * temp2H[k] + b3 * temp2Hp1[k];
dst[H - 3][i + k] *= TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k]; dst[H - 3][i + k] *= temp2[coord(H - 3, k)] = B * temp2[coord(H - 3, k)] + b1 * temp2[coord(H - 2, k)] + b2 * temp2[coord(H - 1, k)] + b3 * temp2H[k];
} }
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
for (int k = 0; k < numcols; k++) { for (int k = 0; k < numcols; k++) {
dst[j][i + k] *= (TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k)); dst[j][i + k] *= (temp2[coord(j, k)] = B * temp2[coord(j, k)] + b1 * temp2[coord(j + 1, k)] + b2 * temp2[coord(j + 2, k)] + b3 * temp2[coord(j + 3, k)]);
} }
} }
} }
@ -1340,28 +1346,26 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
// process remaining columns // process remaining columns
for (int i = W - (W % numcols); i < W; i++) { for (int i = W - (W % numcols); i < W; i++) {
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(0, 0)] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i]; temp2[coord(1, 0)] = B * src[1][i] + b1 * temp2[coord(0, 0)] + b2 * src[0][i] + b3 * src[0][i];
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i]; temp2[coord(2, 0)] = B * src[2][i] + b1 * temp2[coord(1, 0)] + b2 * temp2[coord(0, 0)] + b3 * src[0][i];
for (int j = 3; j < H; j++) { for (int j = 3; j < H; j++) {
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0); temp2[coord(j, 0)] = B * src[j][i] + b1 * temp2[coord(j - 1, 0)] + b2 * temp2[coord(j - 2, 0)] + b3 * temp2[coord(j - 3, 0)];
} }
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[0][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[0][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2H = src[H - 1][i] + M[1][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[1][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[1][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]); double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[coord(H - 1, 0)] - src[H - 1][i]) + M[2][1] * (temp2[coord(H - 2, 0)] - src[H - 1][i]) + M[2][2] * (temp2[coord(H - 3, 0)] - src[H - 1][i]);
dst[H - 1][i] *= TEMP2(H - 1, 0) = temp2Hm1; dst[H - 1][i] *= temp2[coord(H - 1, 0)] = temp2Hm1;
dst[H - 2][i] *= TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1; dst[H - 2][i] *= temp2[coord(H - 2, 0)] = B * temp2[coord(H - 2, 0)] + b1 * temp2[coord(H - 1, 0)] + b2 * temp2H + b3 * temp2Hp1;
dst[H - 3][i] *= TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H; dst[H - 3][i] *= temp2[coord(H - 3, 0)] = B * temp2[coord(H - 3, 0)] + b1 * temp2[coord(H - 2, 0)] + b2 * temp2[coord(H - 1, 0)] + b3 * temp2H;
for (int j = H - 4; j >= 0; j--) { for (int j = H - 4; j >= 0; j--) {
dst[j][i] *= (TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0)); dst[j][i] *= (temp2[coord(j, 0)] = B * temp2[coord(j, 0)] + b1 * temp2[coord(j + 1, 0)] + b2 * temp2[coord(j + 2, 0)] + b3 * temp2[coord(j + 3, 0)]);
} }
} }
delete [] temp2;
} }
#endif #endif