Allocate on Heap, Not Stack, in gaussVertical and Friends
This commit is contained in:
parent b68a6e9581
commit 5b2811f42b
@@ -1119,6 +1119,8 @@ template<class T> void gaussVerticalSsediv (T** src, T** dst, T** divBuffer, con
|
||||
|
||||
template<class T> void gaussVertical (T** src, T** dst, const int W, const int H, const double sigma)
|
||||
{
|
||||
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
|
||||
|
||||
double b1, b2, b3, B, M[3][3];
|
||||
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
|
||||
|
||||
@@ -1129,7 +1131,7 @@ template<class T> void gaussVertical (T** src, T** dst, const int W, const int H
|
||||
|
||||
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
|
||||
static const int numcols = 8;
|
||||
double temp2[H][numcols] ALIGNED16;
|
||||
double *temp2 ALIGNED16 = new double[H * numcols];
|
||||
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for nowait
|
||||
@@ -1137,32 +1139,32 @@ template<class T> void gaussVertical (T** src, T** dst, const int W, const int H
|
||||
|
||||
for (unsigned int i = 0; i < static_cast<unsigned>(std::max(0, W - numcols + 1)); i += numcols) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
|
||||
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k];
|
||||
}
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
|
||||
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k);
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[H - 1][i + k] = temp2[H - 1][k] = temp2Hm1[k];
|
||||
dst[H - 2][i + k] = temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
|
||||
dst[H - 3][i + k] = temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
|
||||
dst[H - 1][i + k] = TEMP2(H - 1, k) = temp2Hm1[k];
|
||||
dst[H - 2][i + k] = TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k];
|
||||
dst[H - 3][i + k] = TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k];
|
||||
}
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[j][i + k] = temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k];
|
||||
dst[j][i + k] = TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1173,31 +1175,35 @@ template<class T> void gaussVertical (T** src, T** dst, const int W, const int H
|
||||
|
||||
// process remaining columns
|
||||
for (int i = W - (W % numcols); i < W; i++) {
|
||||
temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
|
||||
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i];
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
|
||||
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0);
|
||||
}
|
||||
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
|
||||
dst[H - 1][i] = temp2[H - 1][0] = temp2Hm1;
|
||||
dst[H - 2][i] = temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
|
||||
dst[H - 3][i] = temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H;
|
||||
dst[H - 1][i] = TEMP2(H - 1, 0) = temp2Hm1;
|
||||
dst[H - 2][i] = TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1;
|
||||
dst[H - 3][i] = TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H;
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
dst[j][i] = temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0];
|
||||
dst[j][i] = TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0);
|
||||
}
|
||||
}
|
||||
|
||||
delete [] temp2;
|
||||
}
|
||||
|
||||
#ifndef __SSE2__
|
||||
template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const int W, const int H, const double sigma)
|
||||
{
|
||||
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
|
||||
|
||||
double b1, b2, b3, B, M[3][3];
|
||||
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
|
||||
|
||||
@@ -1208,7 +1214,7 @@ template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const
|
||||
|
||||
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
|
||||
static const int numcols = 8;
|
||||
double temp2[H][numcols] ALIGNED16;
|
||||
double *temp2 ALIGNED16 = new double[H * numcols];
|
||||
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for nowait
|
||||
@@ -1216,32 +1222,32 @@ template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const
|
||||
|
||||
for (int i = 0; i < W - numcols + 1; i += numcols) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
|
||||
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k];
|
||||
}
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
|
||||
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k);
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[H - 1][i + k] = rtengine::max(divBuffer[H - 1][i + k] / (temp2[H - 1][k] = temp2Hm1[k]), 0.0);
|
||||
dst[H - 2][i + k] = rtengine::max(divBuffer[H - 2][i + k] / (temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0);
|
||||
dst[H - 3][i + k] = rtengine::max(divBuffer[H - 3][i + k] / (temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k]), 0.0);
|
||||
dst[H - 1][i + k] = rtengine::max(divBuffer[H - 1][i + k] / (TEMP2(H - 1, k) = temp2Hm1[k]), 0.0);
|
||||
dst[H - 2][i + k] = rtengine::max(divBuffer[H - 2][i + k] / (TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k]), 0.0);
|
||||
dst[H - 3][i + k] = rtengine::max(divBuffer[H - 3][i + k] / (TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k]), 0.0);
|
||||
}
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[j][i + k] = rtengine::max(divBuffer[j][i + k] / (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]), 0.0);
|
||||
dst[j][i + k] = rtengine::max(divBuffer[j][i + k] / (TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k)), 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1252,30 +1258,34 @@ template<class T> void gaussVerticaldiv (T** src, T** dst, T** divBuffer, const
|
||||
|
||||
// process remaining columns
|
||||
for (int i = W - (W % numcols); i < W; i++) {
|
||||
temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
|
||||
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i];
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
|
||||
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0);
|
||||
}
|
||||
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
|
||||
dst[H - 1][i] = rtengine::max(divBuffer[H - 1][i] / (temp2[H - 1][0] = temp2Hm1), 0.0);
|
||||
dst[H - 2][i] = rtengine::max(divBuffer[H - 2][i] / (temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1), 0.0);
|
||||
dst[H - 3][i] = rtengine::max(divBuffer[H - 3][i] / (temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H), 0.0);
|
||||
dst[H - 1][i] = rtengine::max(divBuffer[H - 1][i] / (TEMP2(H - 1, 0) = temp2Hm1), 0.0);
|
||||
dst[H - 2][i] = rtengine::max(divBuffer[H - 2][i] / (TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1), 0.0);
|
||||
dst[H - 3][i] = rtengine::max(divBuffer[H - 3][i] / (TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H), 0.0);
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
dst[j][i] = rtengine::max(divBuffer[j][i] / (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]), 0.0);
|
||||
dst[j][i] = rtengine::max(divBuffer[j][i] / (TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0)), 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
delete [] temp2;
|
||||
}
|
||||
|
||||
template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const int H, const double sigma)
|
||||
{
|
||||
#define TEMP2(X, Y) temp2[(X) * numcols + (Y)]
|
||||
|
||||
double b1, b2, b3, B, M[3][3];
|
||||
calculateYvVFactors<double>(sigma, b1, b2, b3, B, M);
|
||||
|
||||
@@ -1286,7 +1296,7 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
|
||||
|
||||
// process 'numcols' columns for better usage of L1 cpu cache (especially faster for large values of H)
|
||||
static const int numcols = 8;
|
||||
double temp2[H][numcols] ALIGNED16;
|
||||
double *temp2 ALIGNED16 = new double[H * numcols];
|
||||
double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for nowait
|
||||
@@ -1294,32 +1304,32 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
|
||||
|
||||
for (int i = 0; i < W - numcols + 1; i += numcols) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
|
||||
TEMP2(0, k) = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(1, k) = B * src[1][i + k] + b1 * TEMP2(0, k) + b2 * src[0][i + k] + b3 * src[0][i + k];
|
||||
TEMP2(2, k) = B * src[2][i + k] + b1 * TEMP2(1, k) + b2 * TEMP2(0, k) + b3 * src[0][i + k];
|
||||
}
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
|
||||
TEMP2(j, k) = B * src[j][i + k] + b1 * TEMP2(j - 1, k) + b2 * TEMP2(j - 2, k) + b3 * TEMP2(j - 3, k);
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
|
||||
temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[0][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[0][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2H[k] = src[H - 1][i + k] + M[1][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[1][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[1][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (TEMP2(H - 1, k) - src[H - 1][i + k]) + M[2][1] * (TEMP2(H - 2, k) - src[H - 1][i + k]) + M[2][2] * (TEMP2(H - 3, k) - src[H - 1][i + k]);
|
||||
}
|
||||
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[H - 1][i + k] *= temp2[H - 1][k] = temp2Hm1[k];
|
||||
dst[H - 2][i + k] *= temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
|
||||
dst[H - 3][i + k] *= temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
|
||||
dst[H - 1][i + k] *= TEMP2(H - 1, k) = temp2Hm1[k];
|
||||
dst[H - 2][i + k] *= TEMP2(H - 2, k) = B * TEMP2(H - 2, k) + b1 * TEMP2(H - 1, k) + b2 * temp2H[k] + b3 * temp2Hp1[k];
|
||||
dst[H - 3][i + k] *= TEMP2(H - 3, k) = B * TEMP2(H - 3, k) + b1 * TEMP2(H - 2, k) + b2 * TEMP2(H - 1, k) + b3 * temp2H[k];
|
||||
}
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
for (int k = 0; k < numcols; k++) {
|
||||
dst[j][i + k] *= (temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k]);
|
||||
dst[j][i + k] *= (TEMP2(j, k) = B * TEMP2(j, k) + b1 * TEMP2(j + 1, k) + b2 * TEMP2(j + 2, k) + b3 * TEMP2(j + 3, k));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1330,26 +1340,28 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
|
||||
|
||||
// process remaining columns
|
||||
for (int i = W - (W % numcols); i < W; i++) {
|
||||
temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[1][0] = B * src[1][i] + b1 * temp2[0][0] + b2 * src[0][i] + b3 * src[0][i];
|
||||
temp2[2][0] = B * src[2][i] + b1 * temp2[1][0] + b2 * temp2[0][0] + b3 * src[0][i];
|
||||
TEMP2(0, 0) = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(1, 0) = B * src[1][i] + b1 * TEMP2(0, 0) + b2 * src[0][i] + b3 * src[0][i];
|
||||
TEMP2(2, 0) = B * src[2][i] + b1 * TEMP2(1, 0) + b2 * TEMP2(0, 0) + b3 * src[0][i];
|
||||
|
||||
for (int j = 3; j < H; j++) {
|
||||
temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
|
||||
TEMP2(j, 0) = B * src[j][i] + b1 * TEMP2(j - 1, 0) + b2 * TEMP2(j - 2, 0) + b3 * TEMP2(j - 3, 0);
|
||||
}
|
||||
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
|
||||
double temp2Hm1 = src[H - 1][i] + M[0][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[0][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[0][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2H = src[H - 1][i] + M[1][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[1][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[1][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
double temp2Hp1 = src[H - 1][i] + M[2][0] * (TEMP2(H - 1, 0) - src[H - 1][i]) + M[2][1] * (TEMP2(H - 2, 0) - src[H - 1][i]) + M[2][2] * (TEMP2(H - 3, 0) - src[H - 1][i]);
|
||||
|
||||
dst[H - 1][i] *= temp2[H - 1][0] = temp2Hm1;
|
||||
dst[H - 2][i] *= temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
|
||||
dst[H - 3][i] *= temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H;
|
||||
dst[H - 1][i] *= TEMP2(H - 1, 0) = temp2Hm1;
|
||||
dst[H - 2][i] *= TEMP2(H - 2, 0) = B * TEMP2(H - 2, 0) + b1 * TEMP2(H - 1, 0) + b2 * temp2H + b3 * temp2Hp1;
|
||||
dst[H - 3][i] *= TEMP2(H - 3, 0) = B * TEMP2(H - 3, 0) + b1 * TEMP2(H - 2, 0) + b2 * TEMP2(H - 1, 0) + b3 * temp2H;
|
||||
|
||||
for (int j = H - 4; j >= 0; j--) {
|
||||
dst[j][i] *= (temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0]);
|
||||
dst[j][i] *= (TEMP2(j, 0) = B * TEMP2(j, 0) + b1 * TEMP2(j + 1, 0) + b2 * TEMP2(j + 2, 0) + b3 * TEMP2(j + 3, 0));
|
||||
}
|
||||
}
|
||||
|
||||
delete [] temp2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user