merge with dev

This commit is contained in:
Desmis 2020-01-10 17:46:06 +01:00
commit bcbeb221ed

View File

@ -131,6 +131,9 @@ inline void gauss3x3div (float** RESTRICT src, float** RESTRICT dst, float** RES
const float c00 = kernel[1][1]; const float c00 = kernel[1][1];
for (int i = 1; i < tileSize - 1; i++) { for (int i = 1; i < tileSize - 1; i++) {
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 1; j < tileSize - 1; j++) { for (int j = 1; j < tileSize - 1; j++) {
const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -151,6 +154,9 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES
for (int i = 2; i < tileSize - 2; ++i) { for (int i = 2; i < tileSize - 2; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 2; j < tileSize - 2; ++j) { for (int j = 2; j < tileSize - 2; ++j) {
const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) + const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
@ -177,6 +183,9 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST
for (int i = 3; i < tileSize - 3; ++i) { for (int i = 3; i < tileSize - 3; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 3; j < tileSize - 3; ++j) { for (int j = 3; j < tileSize - 3; ++j) {
const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) + const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
@ -199,6 +208,9 @@ inline void gauss3x3mult(float** RESTRICT src, float** RESTRICT dst, const int t
const float c00 = kernel[1][1]; const float c00 = kernel[1][1];
for (int i = 1; i < tileSize - 1; i++) { for (int i = 1; i < tileSize - 1; i++) {
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 1; j < tileSize - 1; j++) { for (int j = 1; j < tileSize - 1; j++) {
const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -220,6 +232,9 @@ inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int
for (int i = 2; i < tileSize - 2; ++i) { for (int i = 2; i < tileSize - 2; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 2; j < tileSize - 2; ++j) { for (int j = 2; j < tileSize - 2; ++j) {
const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) + const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
@ -246,6 +261,9 @@ inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int t
for (int i = 3; i < tileSize - 3; ++i) { for (int i = 3; i < tileSize - 3; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
#ifdef __clang__
#pragma clang loop vectorize(assume_safety)
#endif
for (int j = 3; j < tileSize - 3; ++j) { for (int j = 3; j < tileSize - 3; ++j) {
const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) + const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +