From 0b5b9b0f3e0cca4cbdfbd0dc24a0e5d2caa8f4b8 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich
Date: Thu, 21 Nov 2019 15:37:55 +0100
Subject: [PATCH] findHotDeadPixels: further speedup and cleanup, #5531

---
 rtengine/badpixels.cc | 167 ++++++++++++------------------------------
 1 file changed, 45 insertions(+), 122 deletions(-)

diff --git a/rtengine/badpixels.cc b/rtengine/badpixels.cc
index 578459189..0ae63a618 100644
--- a/rtengine/badpixels.cc
+++ b/rtengine/badpixels.cc
@@ -22,13 +22,39 @@
 #include "pixelsmap.h"
 #include "rawimage.h"
 #include "rawimagesource.h"
-#define BENCHMARK
+//#define BENCHMARK
 #include "StopWatch.h"
+
 namespace
 {
 unsigned fc(const unsigned int cfa[2][2], int r, int c) {
     return cfa[r & 1][c & 1];
 }
+
+inline void sum5x5(const array2D<float>& in, int col, float &sum) {
+#ifdef __SSE2__
+    // sum up 5*4 = 20 values using SSE
+    // 10 fabs function calls and 10 float additions with SSE
+    const vfloat sumv = (vabsf(LVFU(in[0][col])) + vabsf(LVFU(in[1][col]))) +
+                        (vabsf(LVFU(in[2][col])) + vabsf(LVFU(in[3][col]))) +
+                        vabsf(LVFU(in[4][col]));
+    // horizontally add the values and add the result to hfnbrave
+    sum += vhadd(sumv);
+
+    // add remaining 5 values of last column
+    sum += (fabsf(in[0][col + 4]) + fabsf(in[1][col + 4])) +
+           (fabsf(in[2][col + 4]) + fabsf(in[3][col + 4])) +
+           fabsf(in[4][col + 4]);
+#else
+    // 25 fabs function calls and 25 float additions without SSE
+    for (int nn = col; nn < col + 5; ++nn) {
+        sum += (fabsf(in[0][nn]) + fabsf(in[1][nn])) +
+               (fabsf(in[2][nn]) + fabsf(in[3][nn])) +
+               fabsf(in[4][nn]);
+    }
+#endif
+
+}
 }
 
 namespace rtengine
@@ -446,8 +472,8 @@ int RawImageSource::interpolateBadPixelsXtrans(const PixelsMap &bitmapBads)
 /* Search for hot or dead pixels in the image and update the map
  * For each pixel compare its value to the average of similar color surrounding
  * (Taken from Emil Martinec idea)
- * (Optimized by Ingo Weyrich 2013, 2015 and 2019)
- */
+ * (Optimized by Ingo Weyrich 2013, 2015, and 2019)
+*/
 int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, const bool findHotPixels, const bool findDeadPixels) const
 {
     BENCHFUN
@@ -460,11 +486,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
 #pragma omp parallel reduction(+:counter)
 #endif
     {
-        array2D<float> cfablur(W, 5);
-        // zero left and right border
-        for (int i = 0; i < 5; ++i) {
-            cfablur[i][0] = cfablur[i][1] = cfablur[i][W - 2] = cfablur[i][W - 1];
-        }
+        array2D<float> cfablur(W, 5, ARRAY2D_CLEAR_DATA);
         int firstRow = -1;
         int lastRow = -1;
 
@@ -473,28 +495,13 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
 #pragma omp for schedule(static) nowait
 #endif
 
-        for (int i = 2; i < H - 2; i++) {
+        for (int i = 2; i < H - 2; ++i) {
             if (firstRow == -1) {
                 firstRow = i;
-                if (firstRow == 2) {
-                    for (int i = 0; i < 2; ++i) {
-                        for (int j = 0; j < W; ++j) {
-                            cfablur[i][j] = 0.f;
-                        }
-                    }
-                } else {
+                if (firstRow > 2) {
                     for (int row = firstRow - 2; row < firstRow; ++row) {
-                        int destRow = row % 5;
-                        int j = 2;
-                        #ifdef __SSE2__
-                        for (; j < W - 5; j += 4) {
-                            const vfloat tempv = median(LVFU(rawData[row - 2][j - 2]), LVFU(rawData[row - 2][j]), LVFU(rawData[row - 2][j + 2]),
-                                                        LVFU(rawData[row][j - 2]), LVFU(rawData[row][j]), LVFU(rawData[row][j + 2]),
-                                                        LVFU(rawData[row + 2][j - 2]), LVFU(rawData[row + 2][j]), LVFU(rawData[row + 2][j + 2]));
-                            STVFU(cfablur[destRow][j], LVFU(rawData[row][j]) - tempv);
-                        }
-                        #endif
-                        for (; j < W - 2; j++) {
+                        const int destRow = row % 5;
+                        for (int j = 2; j < W - 2; ++j) {
                             const float temp = median(rawData[row - 2][j - 2], rawData[row - 2][j], rawData[row - 2][j + 2],
                                                       rawData[row][j - 2], rawData[row][j], rawData[row][j + 2],
                                                       rawData[row + 2][j - 2], rawData[row + 2][j], rawData[row + 2][j + 2]);
@@ -505,16 +512,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
             }
             lastRow = i;
             const int destRow = i % 5;
-            int j = 2;
-#ifdef __SSE2__
-            for (; j < W - 5; j += 4) {
-                const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]),
-                                            LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]),
-                                            LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2]));
-                STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv);
-            }
-#endif
-            for (; j < W - 2; j++) {
+            for (int j = 2; j < W - 2; ++j) {
                 const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2],
                                           rawData[i][j - 2], rawData[i][j], rawData[i][j + 2],
                                           rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]);
@@ -523,60 +521,26 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
 
             if (i - 1 > firstRow) {
                 const int rr = i - 2;
-                const int rrm2 = (rr - 2) % 5;
-                const int rrm1 = (rr - 1) % 5;
                 const int rr0 = rr % 5;
-                const int rrp1 = (rr + 1) % 5;
-                const int rrp2 = (rr + 2) % 5;
                 for (int cc = 2; cc < W - 2; ++cc) {
                     //evaluate pixel for heat/death
                     float pixdev = cfablur[rr0][cc];
 
-                    if (pixdev == 0.f) {
+                    if (!findDeadPixels && pixdev <= 0.f) {
                         continue;
                     }
 
-                    if ((!findDeadPixels) && pixdev < 0) {
-                        continue;
-                    }
-
-                    if ((!findHotPixels) && pixdev > 0) {
+                    if (!findHotPixels && pixdev >= 0.f) {
                         continue;
                     }
 
                     pixdev = fabsf(pixdev);
                     float hfnbrave = -pixdev;
-
-#ifdef __SSE2__
-                    // sum up 5*4 = 20 values using SSE
-                    // 10 fabs function calls and 10 float additions with SSE
-                    vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2]));
-                    vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2]));
-                    sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2]));
-                    // horizontally add the values and add the result to hfnbrave
-                    hfnbrave += vhadd(sum1 + sum2);
-
-                    // add remaining 5 values of last column
-                    hfnbrave += fabsf(cfablur[rrm2][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrm1][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rr0][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrp1][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrp2][cc + 2]);
-#else
-
-                    // 25 fabs function calls and 25 float additions without SSE
-                    for (int nn = cc - 2; nn <= cc + 2; ++nn) {
-                        hfnbrave += fabsf(cfablur[rrm2][nn]);
-                        hfnbrave += fabsf(cfablur[rrm1][nn]);
-                        hfnbrave += fabsf(cfablur[rr0][nn]);
-                        hfnbrave += fabsf(cfablur[rrp1][nn]);
-                        hfnbrave += fabsf(cfablur[rrp2][nn]);
-                    }
-#endif
+                    sum5x5(cfablur, cc - 2, hfnbrave);
                     if (pixdev > varthresh * hfnbrave) {
                         // mark the pixel as "bad"
                         bpMap.set(cc, rr);
-                        counter++;
+                        ++counter;
                     }
                 } //end of pixel evaluation
             }
@@ -592,16 +556,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
                         cfablur[destRow][j] = 0.f;
                     }
                 } else {
-                    int j = 2;
-#ifdef __SSE2__
-                    for (; j < W - 5; j += 4) {
-                        const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]),
-                                                    LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]),
-                                                    LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2]));
-                        STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv);
-                    }
-#endif
-                    for (; j < W - 2; j++) {
+                    for (int j = 2; j < W - 2; ++j) {
                         const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2],
                                                   rawData[i][j - 2], rawData[i][j], rawData[i][j + 2],
                                                   rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]);
@@ -609,64 +564,32 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons
                     }
                 }
 
-                const int rrm2 = (rr - 2) % 5;
-                const int rrm1 = (rr - 1) % 5;
                 const int rr0 = rr % 5;
-                const int rrp1 = (rr + 1) % 5;
-                const int rrp2 = (rr + 2) % 5;
                 for (int cc = 2; cc < W - 2; ++cc) {
                     //evaluate pixel for heat/death
                     float pixdev = cfablur[rr0][cc];
 
-                    if (pixdev == 0.f) {
+                    if (!findDeadPixels && pixdev <= 0.f) {
                         continue;
                     }
 
-                    if ((!findDeadPixels) && pixdev < 0) {
-                        continue;
-                    }
-
-                    if ((!findHotPixels) && pixdev > 0) {
+                    if (!findHotPixels && pixdev >= 0.f) {
                         continue;
                     }
 
                     pixdev = fabsf(pixdev);
                     float hfnbrave = -pixdev;
-
-#ifdef __SSE2__
-                    // sum up 5*4 = 20 values using SSE
-                    // 10 fabs function calls and 10 float additions with SSE
-                    vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2]));
-                    vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2]));
-                    sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2]));
-                    // horizontally add the values and add the result to hfnbrave
-                    hfnbrave += vhadd(sum1 + sum2);
-
-                    // add remaining 5 values of last column
-                    hfnbrave += fabsf(cfablur[rrm2][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrm1][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rr0][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrp1][cc + 2]);
-                    hfnbrave += fabsf(cfablur[rrp2][cc + 2]);
-#else
-                    // 25 fabs function calls and 25 float additions without SSE
-                    for (int nn = cc - 2; nn <= cc + 2; ++nn) {
-                        hfnbrave += fabsf(cfablur[rrm2][nn]);
-                        hfnbrave += fabsf(cfablur[rrm1][nn]);
-                        hfnbrave += fabsf(cfablur[rr0][nn]);
-                        hfnbrave += fabsf(cfablur[rrp1][nn]);
-                        hfnbrave += fabsf(cfablur[rrp2][nn]);
-                    }
-#endif
+                    sum5x5(cfablur, cc - 2, hfnbrave);
                     if (pixdev > varthresh * hfnbrave) {
                         // mark the pixel as "bad"
                         bpMap.set(cc, rr);
-                        counter++;
+                        ++counter;
                     }
                 }//end of pixel evaluation
             }
         }
     }//end of parallel processing
+
     return counter;
 }
 