Dehaze: further speedup, #5456

This commit is contained in:
Ingo Weyrich
2019-09-19 20:56:33 +02:00
parent 991fc94d89
commit 3ab379ad0a
3 changed files with 442 additions and 260 deletions

View File

@@ -204,15 +204,15 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0][col], tempv);
STVFU( dst[0][col + 4], temp1v);
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
@@ -221,16 +221,16 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
@@ -312,6 +312,221 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
}
inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
{
//box blur using rowbuffers and linebuffers instead of a full size buffer
if (radius == 0) {
if (src != dst) {
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int row = 0; row < H; row++) {
for (int col = 0; col < W; col++) {
dst[row][col] = src[row][col];
}
}
}
return;
}
constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef _OPENMP
#pragma omp parallel if (multiThread)
#endif
{
float* const buffer = new float[std::max(W, 8 * H)];
//horizontal blur
float* const lineBuffer = buffer;
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; row++) {
float len = radius + 1;
float tempval = src[row][0];
lineBuffer[0] = tempval;
for (int j = 1; j <= radius; j++) {
tempval += src[row][j];
}
tempval /= len;
dst[row][0] = tempval;
for (int col = 1; col <= radius; col++) {
lineBuffer[col] = src[row][col];
dst[row][col] = tempval = (tempval * len + src[row][col + radius]) / (len + 1);
len ++;
}
for (int col = radius + 1; col < W - radius; col++) {
lineBuffer[col] = src[row][col];
dst[row][col] = tempval = tempval + (src[row][col + radius] - lineBuffer[col - radius - 1]) / len;
}
for (int col = W - radius; col < W; col++) {
dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1);
len --;
}
}
//vertical blur
#ifdef __SSE2__
vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer;
vfloat leninitv = F2V(radius + 1);
vfloat onev = F2V(1.f);
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(dst[0][col]);
temp1v = LVFU(dst[0][col + 4]);
rowBuffer[0][0] = tempv;
rowBuffer[0][1] = temp1v;
for (int i = 1; i <= radius; i++) {
tempv = tempv + LVFU(dst[i][col]);
temp1v = temp1v + LVFU(dst[i][col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= radius; row++) {
rowBuffer[row][0] = LVFU(dst[row][col]);
rowBuffer[row][1] = LVFU(dst[row][col + 4]);
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = radius + 1; row < H - radius; row++) {
rowBuffer[row][0] = LVFU(dst[row][col]);
rowBuffer[row][1] = LVFU(dst[row][col + 4]);
tempv = tempv + (LVFU(dst[row + radius][col]) - rowBuffer[row - radius - 1][0]) * rlenv ;
temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - rowBuffer[row - radius - 1][1]) * rlenv ;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
}
for (int row = H - radius; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - rowBuffer[row - radius - 1][0]) / lenm1v;
temp1v = (temp1v * lenv - rowBuffer[row - radius - 1][1]) / lenm1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
#else
float (* const rowBuffer)[8] = (float(*)[8]) buffer;
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - numCols + 1; col += 8) {
float len = radius + 1;
for(int k = 0; k < numCols; k++) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int i = 1; i <= radius; i++) {
for(int k = 0; k < numCols; k++) {
dst[0][col + k] += dst[i][col + k];
}
}
for(int k = 0; k < numCols; k++) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; row++) {
for(int k = 0; k < numCols; k++) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
len ++;
}
for (int row = radius + 1; row < H - radius; row++) {
for(int k = 0; k < numCols; k++) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) / len;
}
}
for (int row = H - radius; row < H; row++) {
for(int k = 0; k < numCols; k++) {
dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
}
len --;
}
}
#endif
//vertical blur, remaining columns
#ifdef _OPENMP
#pragma omp single
#endif
{
const int remaining = W % numCols;
if (remaining > 0) {
float (* const rowBuffer)[8] = (float(*)[8]) buffer;
const int col = W - remaining;
float len = radius + 1;
for(int k = 0; k < remaining; k++) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int i = 1; i <= radius; i++) {
for(int k = 0; k < remaining; k++) {
dst[0][col + k] += dst[i][col + k];
}
}
for(int k = 0; k < remaining; k++) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; row++) {
for(int k = 0; k < remaining; k++) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[(row - 1)][col + k] * len + dst[row + radius][col + k]) / (len + 1);
len ++;
}
}
const float rlen = 1.f / len;
for (int row = radius + 1; row < H - radius; row++) {
for(int k = 0; k < remaining; k++) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = dst[(row - 1)][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
}
}
for (int row = H - radius; row < H; row++) {
for(int k = 0; k < remaining; k++) {
dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
len --;
}
}
}
}
delete [] buffer;
}
}
template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
{
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@@ -382,15 +597,15 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0 * W + col], tempv);
STVFU( dst[0 * W + col + 4], temp1v);
STVFU(dst[0 * W + col], tempv);
STVFU(dst[0 * W + col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenp1v;
}
@@ -399,16 +614,16 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenm1v;
}
}
@@ -422,12 +637,12 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
}
tempv = tempv / lenv;
STVFU( dst[0 * W + col], tempv);
STVFU(dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
lenv = lenp1v;
}
@@ -435,13 +650,13 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
lenv = lenm1v;
}
}