From 3eb1d241c91371fb58b52ba9c8ebb1e72bdad2f7 Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Mon, 18 Nov 2019 20:13:12 +0100 Subject: [PATCH 1/5] findHotDeadPixels: speedup and reduced memory usage --- rtengine/badpixels.cc | 272 +++++++++++++++++++++++++++++------------- 1 file changed, 187 insertions(+), 85 deletions(-) diff --git a/rtengine/badpixels.cc b/rtengine/badpixels.cc index 2710cb28d..578459189 100644 --- a/rtengine/badpixels.cc +++ b/rtengine/badpixels.cc @@ -22,7 +22,8 @@ #include "pixelsmap.h" #include "rawimage.h" #include "rawimagesource.h" - +#define BENCHMARK +#include "StopWatch.h" namespace { unsigned fc(const unsigned int cfa[2][2], int r, int c) { @@ -445,126 +446,227 @@ int RawImageSource::interpolateBadPixelsXtrans(const PixelsMap &bitmapBads) /* Search for hot or dead pixels in the image and update the map * For each pixel compare its value to the average of similar color surrounding * (Taken from Emil Martinec idea) - * (Optimized by Ingo Weyrich 2013 and 2015) + * (Optimized by Ingo Weyrich 2013, 2015 and 2019) */ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, const bool findHotPixels, const bool findDeadPixels) const { + BENCHFUN const float varthresh = (20.0 * (thresh / 100.0) + 1.0) / 24.f; - // allocate temporary buffer - float* cfablur = new float[H * W]; - // counter for dead or hot pixels int counter = 0; #ifdef _OPENMP - #pragma omp parallel + #pragma omp parallel reduction(+:counter) #endif { + array2D cfablur(W, 5); + // zero left and right border + for (int i = 0; i < 5; ++i) { + cfablur[i][0] = cfablur[i][1] = cfablur[i][W - 2] = cfablur[i][W - 1]; + } + int firstRow = -1; + int lastRow = -1; + #ifdef _OPENMP - #pragma omp for schedule(dynamic,16) nowait + // note, static scheduling is important in this implementation + #pragma omp for schedule(static) nowait #endif for (int i = 2; i < H - 2; i++) { - for (int j = 2; j < W - 2; j++) { + if (firstRow == -1) { + firstRow = i; + if (firstRow == 2) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < W; ++j) { + cfablur[i][j] = 0.f; + } + } + } else { + for (int row = firstRow - 2; row < firstRow; ++row) { + int destRow = row % 5; + int j = 2; + #ifdef __SSE2__ + for (; j < W - 5; j += 4) { + const vfloat tempv = median(LVFU(rawData[row - 2][j - 2]), LVFU(rawData[row - 2][j]), LVFU(rawData[row - 2][j + 2]), + LVFU(rawData[row][j - 2]), LVFU(rawData[row][j]), LVFU(rawData[row][j + 2]), + LVFU(rawData[row + 2][j - 2]), LVFU(rawData[row + 2][j]), LVFU(rawData[row + 2][j + 2])); + STVFU(cfablur[destRow][j], LVFU(rawData[row][j]) - tempv); + } + #endif + for (; j < W - 2; j++) { + const float temp = median(rawData[row - 2][j - 2], rawData[row - 2][j], rawData[row - 2][j + 2], + rawData[row][j - 2], rawData[row][j], rawData[row][j + 2], + rawData[row + 2][j - 2], rawData[row + 2][j], rawData[row + 2][j + 2]); + cfablur[destRow][j] = rawData[row][j] - temp; + } + } + } + } + lastRow = i; + const int destRow = i % 5; + int j = 2; +#ifdef __SSE2__ + for (; j < W - 5; j += 4) { + const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]), + LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]), + LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2])); + STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv); + } +#endif + for (; j < W - 2; j++) { const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2], rawData[i][j - 2], rawData[i][j], rawData[i][j + 2], rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]); - cfablur[i * W + j] = rawData[i][j] - temp; - } - } - - // process borders. Former version calculated the median using mirrored border which does not make sense because the original pixel loses weight - // Setting the difference between pixel and median for border pixels to zero should do the job not worse then former version -#ifdef _OPENMP - #pragma omp single -#endif - { - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < W; ++j) { - cfablur[i * W + j] = 0.f; - } + cfablur[destRow][j] = rawData[i][j] - temp; } - for (int i = 2; i < H - 2; ++i) { - for (int j = 0; j < 2; ++j) { - cfablur[i * W + j] = 0.f; - } + if (i - 1 > firstRow) { + const int rr = i - 2; + const int rrm2 = (rr - 2) % 5; + const int rrm1 = (rr - 1) % 5; + const int rr0 = rr % 5; + const int rrp1 = (rr + 1) % 5; + const int rrp2 = (rr + 2) % 5; + for (int cc = 2; cc < W - 2; ++cc) { + //evaluate pixel for heat/death + float pixdev = cfablur[rr0][cc]; - for (int j = W - 2; j < W; ++j) { - cfablur[i * W + j] = 0.f; - } - } + if (pixdev == 0.f) { + continue; + } - for (int i = H - 2; i < H; ++i) { - for (int j = 0; j < W; ++j) { - cfablur[i * W + j] = 0.f; - } - } - } + if ((!findDeadPixels) && pixdev < 0) { + continue; + } -#ifdef _OPENMP - #pragma omp barrier // barrier because of nowait clause above + if ((!findHotPixels) && pixdev > 0) { + continue; + } - #pragma omp for reduction(+:counter) schedule(dynamic,16) -#endif - - //cfa pixel heat/death evaluation - for (int rr = 2; rr < H - 2; ++rr) { - for (int cc = 2, rrmWpcc = rr * W + 2; cc < W - 2; ++cc, ++rrmWpcc) { - //evaluate pixel for heat/death - float pixdev = cfablur[rrmWpcc]; - - if (pixdev == 0.f) { - continue; - } - - if ((!findDeadPixels) && pixdev < 0) { - continue; - } - - if ((!findHotPixels) && pixdev > 0) { - continue; - } - - pixdev = fabsf(pixdev); - float hfnbrave = -pixdev; + pixdev = fabsf(pixdev); + float hfnbrave = -pixdev; #ifdef __SSE2__ - // sum up 5*4 = 20 values using SSE - // 10 fabs function calls and 10 float additions with SSE - vfloat sum = vabsf(LVFU(cfablur[(rr - 2) * W + cc - 2])) + vabsf(LVFU(cfablur[(rr - 1) * W + cc - 2])); - sum += vabsf(LVFU(cfablur[(rr) * W + cc - 2])); - sum += vabsf(LVFU(cfablur[(rr + 1) * W + cc - 2])); - sum += vabsf(LVFU(cfablur[(rr + 2) * W + cc - 2])); - // horizontally add the values and add the result to hfnbrave - hfnbrave += vhadd(sum); - - // add remaining 5 values of last column - for (int mm = rr - 2; mm <= rr + 2; ++mm) { - hfnbrave += fabsf(cfablur[mm * W + cc + 2]); - } + // sum up 5*4 = 20 values using SSE + // 10 fabs function calls and 10 float additions with SSE + vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2])); + vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2])); + sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2])); + // horizontally add the values and add the result to hfnbrave + hfnbrave += vhadd(sum1 + sum2); + // add remaining 5 values of last column + hfnbrave += fabsf(cfablur[rrm2][cc + 2]); + hfnbrave += fabsf(cfablur[rrm1][cc + 2]); + hfnbrave += fabsf(cfablur[rr0][cc + 2]); + hfnbrave += fabsf(cfablur[rrp1][cc + 2]); + hfnbrave += fabsf(cfablur[rrp2][cc + 2]); #else - // 25 fabs function calls and 25 float additions without SSE - for (int mm = rr - 2; mm <= rr + 2; ++mm) { + // 25 fabs function calls and 25 float additions without SSE for (int nn = cc - 2; nn <= cc + 2; ++nn) { - hfnbrave += fabsf(cfablur[mm * W + nn]); + hfnbrave += fabsf(cfablur[rrm2][nn]); + hfnbrave += fabsf(cfablur[rrm1][nn]); + hfnbrave += fabsf(cfablur[rr0][nn]); + hfnbrave += fabsf(cfablur[rrp1][nn]); + hfnbrave += fabsf(cfablur[rrp2][nn]); + } +#endif + if (pixdev > varthresh * hfnbrave) { + // mark the pixel as "bad" + bpMap.set(cc, rr); + counter++; + } + } //end of pixel evaluation + } + } + + if (lastRow > 0 && lastRow < H - 2) { + //cfa pixel heat/death evaluation + for (int rr = lastRow - 1; rr < lastRow + 1; ++rr) { + const int i = rr + 2; + const int destRow = i % 5; + if (i >= H - 2) { + for (int j = 2; j < W - 2; j++) { + cfablur[destRow][j] = 0.f; + } + } else { + int j = 2; +#ifdef __SSE2__ + for (; j < W - 5; j += 4) { + const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]), + LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]), + LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2])); + STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv); + } +#endif + for (; j < W - 2; j++) { + const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2], + rawData[i][j - 2], rawData[i][j], rawData[i][j + 2], + rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]); + cfablur[destRow][j] = rawData[i][j] - temp; } } -#endif + const int rrm2 = (rr - 2) % 5; + const int rrm1 = (rr - 1) % 5; + const int rr0 = rr % 5; + const int rrp1 = (rr + 1) % 5; + const int rrp2 = (rr + 2) % 5; + for (int cc = 2; cc < W - 2; ++cc) { + //evaluate pixel for heat/death + float pixdev = cfablur[rr0][cc]; - if (pixdev > varthresh * hfnbrave) { - // mark the pixel as "bad" - bpMap.set(cc, rr); - counter++; - } - }//end of pixel evaluation + if (pixdev == 0.f) { + continue; + } + + if ((!findDeadPixels) && pixdev < 0) { + continue; + } + + if ((!findHotPixels) && pixdev > 0) { + continue; + } + + pixdev = fabsf(pixdev); + float hfnbrave = -pixdev; + +#ifdef __SSE2__ + // sum up 5*4 = 20 values using SSE + // 10 fabs function calls and 10 float additions with SSE + vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2])); + vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2])); + sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2])); + // horizontally add the values and add the result to hfnbrave + hfnbrave += vhadd(sum1 + sum2); + + // add remaining 5 values of last column + hfnbrave += fabsf(cfablur[rrm2][cc + 2]); + hfnbrave += fabsf(cfablur[rrm1][cc + 2]); + hfnbrave += fabsf(cfablur[rr0][cc + 2]); + hfnbrave += fabsf(cfablur[rrp1][cc + 2]); + hfnbrave += fabsf(cfablur[rrp2][cc + 2]); +#else + // 25 fabs function calls and 25 float additions without SSE + for (int nn = cc - 2; nn <= cc + 2; ++nn) { + hfnbrave += fabsf(cfablur[rrm2][nn]); + hfnbrave += fabsf(cfablur[rrm1][nn]); + hfnbrave += fabsf(cfablur[rr0][nn]); + hfnbrave += fabsf(cfablur[rrp1][nn]); + hfnbrave += fabsf(cfablur[rrp2][nn]); + } +#endif + if (pixdev > varthresh * hfnbrave) { + // mark the pixel as "bad" + bpMap.set(cc, rr); + counter++; + } + }//end of pixel evaluation + } } }//end of parallel processing - delete [] cfablur; return counter; } From 0b5b9b0f3e0cca4cbdfbd0dc24a0e5d2caa8f4b8 Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Thu, 21 Nov 2019 15:37:55 +0100 Subject: [PATCH 2/5] findHotDeadPixels: further speedup and cleanup, #5531 --- rtengine/badpixels.cc | 167 ++++++++++++------------------------------ 1 file changed, 45 insertions(+), 122 deletions(-) diff --git a/rtengine/badpixels.cc b/rtengine/badpixels.cc index 578459189..0ae63a618 100644 --- a/rtengine/badpixels.cc +++ b/rtengine/badpixels.cc @@ -22,13 +22,39 @@ #include "pixelsmap.h" #include "rawimage.h" #include "rawimagesource.h" -#define BENCHMARK +//#define BENCHMARK #include "StopWatch.h" + namespace { unsigned fc(const unsigned int cfa[2][2], int r, int c) { return cfa[r & 1][c & 1]; } + +inline void sum5x5(const array2D& in, int col, float &sum) { +#ifdef __SSE2__ + // sum up 5*4 = 20 values using SSE + // 10 fabs function calls and 10 float additions with SSE + const vfloat sumv = (vabsf(LVFU(in[0][col])) + vabsf(LVFU(in[1][col]))) + + (vabsf(LVFU(in[2][col])) + vabsf(LVFU(in[3][col]))) + + vabsf(LVFU(in[4][col])); + // horizontally add the values and add the result to hfnbrave + sum += vhadd(sumv); + + // add remaining 5 values of last column + sum += (fabsf(in[0][col + 4]) + fabsf(in[1][col + 4])) + + (fabsf(in[2][col + 4]) + fabsf(in[3][col + 4])) + + fabsf(in[4][col + 4]); +#else + // 25 fabs function calls and 25 float additions without SSE + for (int nn = col; nn < col + 5; ++nn) { + sum += (fabsf(in[0][nn]) + fabsf(in[1][nn])) + + (fabsf(in[2][nn]) + fabsf(in[3][nn])) + + fabsf(in[4][nn]); + } +#endif + +} } namespace rtengine @@ -446,8 +472,8 @@ int RawImageSource::interpolateBadPixelsXtrans(const PixelsMap &bitmapBads) /* Search for hot or dead pixels in the image and update the map * For each pixel compare its value to the average of similar color surrounding * (Taken from Emil Martinec idea) - * (Optimized by Ingo Weyrich 2013, 2015 and 2019) - */ + * (Optimized by Ingo Weyrich 2013, 2015, and 2019) +*/ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, const bool findHotPixels, const bool findDeadPixels) const { BENCHFUN @@ -460,11 +486,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons #pragma omp parallel reduction(+:counter) #endif { - array2D cfablur(W, 5); - // zero left and right border - for (int i = 0; i < 5; ++i) { - cfablur[i][0] = cfablur[i][1] = cfablur[i][W - 2] = cfablur[i][W - 1]; - } + array2D cfablur(W, 5, ARRAY2D_CLEAR_DATA); int firstRow = -1; int lastRow = -1; @@ -473,28 +495,13 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons #pragma omp for schedule(static) nowait #endif - for (int i = 2; i < H - 2; i++) { + for (int i = 2; i < H - 2; ++i) { if (firstRow == -1) { firstRow = i; - if (firstRow == 2) { - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < W; ++j) { - cfablur[i][j] = 0.f; - } - } - } else { + if (firstRow > 2) { for (int row = firstRow - 2; row < firstRow; ++row) { - int destRow = row % 5; - int j = 2; - #ifdef __SSE2__ - for (; j < W - 5; j += 4) { - const vfloat tempv = median(LVFU(rawData[row - 2][j - 2]), LVFU(rawData[row - 2][j]), LVFU(rawData[row - 2][j + 2]), - LVFU(rawData[row][j - 2]), LVFU(rawData[row][j]), LVFU(rawData[row][j + 2]), - LVFU(rawData[row + 2][j - 2]), LVFU(rawData[row + 2][j]), LVFU(rawData[row + 2][j + 2])); - STVFU(cfablur[destRow][j], LVFU(rawData[row][j]) - tempv); - } - #endif - for (; j < W - 2; j++) { + const int destRow = row % 5; + for (int j = 2; j < W - 2; ++j) { const float temp = median(rawData[row - 2][j - 2], rawData[row - 2][j], rawData[row - 2][j + 2], rawData[row][j - 2], rawData[row][j], rawData[row][j + 2], rawData[row + 2][j - 2], rawData[row + 2][j], rawData[row + 2][j + 2]); @@ -505,16 +512,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons } lastRow = i; const int destRow = i % 5; - int j = 2; -#ifdef __SSE2__ - for (; j < W - 5; j += 4) { - const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]), - LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]), - LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2])); - STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv); - } -#endif - for (; j < W - 2; j++) { + for (int j = 2; j < W - 2; ++j) { const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2], rawData[i][j - 2], rawData[i][j], rawData[i][j + 2], rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]); @@ -523,60 +521,26 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons if (i - 1 > firstRow) { const int rr = i - 2; - const int rrm2 = (rr - 2) % 5; - const int rrm1 = (rr - 1) % 5; const int rr0 = rr % 5; - const int rrp1 = (rr + 1) % 5; - const int rrp2 = (rr + 2) % 5; for (int cc = 2; cc < W - 2; ++cc) { //evaluate pixel for heat/death float pixdev = cfablur[rr0][cc]; - if (pixdev == 0.f) { + if (!findDeadPixels && pixdev <= 0.f) { continue; } - if ((!findDeadPixels) && pixdev < 0) { - continue; - } - - if ((!findHotPixels) && pixdev > 0) { + if (!findHotPixels && pixdev >= 0.f) { continue; } pixdev = fabsf(pixdev); float hfnbrave = -pixdev; - -#ifdef __SSE2__ - // sum up 5*4 = 20 values using SSE - // 10 fabs function calls and 10 float additions with SSE - vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2])); - vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2])); - sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2])); - // horizontally add the values and add the result to hfnbrave - hfnbrave += vhadd(sum1 + sum2); - - // add remaining 5 values of last column - hfnbrave += fabsf(cfablur[rrm2][cc + 2]); - hfnbrave += fabsf(cfablur[rrm1][cc + 2]); - hfnbrave += fabsf(cfablur[rr0][cc + 2]); - hfnbrave += fabsf(cfablur[rrp1][cc + 2]); - hfnbrave += fabsf(cfablur[rrp2][cc + 2]); -#else - - // 25 fabs function calls and 25 float additions without SSE - for (int nn = cc - 2; nn <= cc + 2; ++nn) { - hfnbrave += fabsf(cfablur[rrm2][nn]); - hfnbrave += fabsf(cfablur[rrm1][nn]); - hfnbrave += fabsf(cfablur[rr0][nn]); - hfnbrave += fabsf(cfablur[rrp1][nn]); - hfnbrave += fabsf(cfablur[rrp2][nn]); - } -#endif + sum5x5(cfablur, cc - 2, hfnbrave); if (pixdev > varthresh * hfnbrave) { // mark the pixel as "bad" bpMap.set(cc, rr); - counter++; + ++counter; } } //end of pixel evaluation } @@ -592,16 +556,7 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons cfablur[destRow][j] = 0.f; } } else { - int j = 2; -#ifdef __SSE2__ - for (; j < W - 5; j += 4) { - const vfloat tempv = median(LVFU(rawData[i - 2][j - 2]), LVFU(rawData[i - 2][j]), LVFU(rawData[i - 2][j + 2]), - LVFU(rawData[i][j - 2]), LVFU(rawData[i][j]), LVFU(rawData[i][j + 2]), - LVFU(rawData[i + 2][j - 2]), LVFU(rawData[i + 2][j]), LVFU(rawData[i + 2][j + 2])); - STVFU(cfablur[destRow][j], LVFU(rawData[i][j]) - tempv); - } -#endif - for (; j < W - 2; j++) { + for (int j = 2; j < W - 2; ++j) { const float temp = median(rawData[i - 2][j - 2], rawData[i - 2][j], rawData[i - 2][j + 2], rawData[i][j - 2], rawData[i][j], rawData[i][j + 2], rawData[i + 2][j - 2], rawData[i + 2][j], rawData[i + 2][j + 2]); @@ -609,64 +564,32 @@ int RawImageSource::findHotDeadPixels(PixelsMap &bpMap, const float thresh, cons } } - const int rrm2 = (rr - 2) % 5; - const int rrm1 = (rr - 1) % 5; const int rr0 = rr % 5; - const int rrp1 = (rr + 1) % 5; - const int rrp2 = (rr + 2) % 5; for (int cc = 2; cc < W - 2; ++cc) { //evaluate pixel for heat/death float pixdev = cfablur[rr0][cc]; - if (pixdev == 0.f) { + if (!findDeadPixels && pixdev <= 0.f) { continue; } - if ((!findDeadPixels) && pixdev < 0) { - continue; - } - - if ((!findHotPixels) && pixdev > 0) { + if (!findHotPixels && pixdev >= 0.f) { continue; } pixdev = fabsf(pixdev); float hfnbrave = -pixdev; - -#ifdef __SSE2__ - // sum up 5*4 = 20 values using SSE - // 10 fabs function calls and 10 float additions with SSE - vfloat sum1 = vabsf(LVFU(cfablur[rrm2][cc - 2])) + vabsf(LVFU(cfablur[rrm1][cc - 2])); - vfloat sum2 = vabsf(LVFU(cfablur[rr0][cc - 2])) + vabsf(LVFU(cfablur[rrp1][cc - 2])); - sum1 += vabsf(LVFU(cfablur[rrp2][cc - 2])); - // horizontally add the values and add the result to hfnbrave - hfnbrave += vhadd(sum1 + sum2); - - // add remaining 5 values of last column - hfnbrave += fabsf(cfablur[rrm2][cc + 2]); - hfnbrave += fabsf(cfablur[rrm1][cc + 2]); - hfnbrave += fabsf(cfablur[rr0][cc + 2]); - hfnbrave += fabsf(cfablur[rrp1][cc + 2]); - hfnbrave += fabsf(cfablur[rrp2][cc + 2]); -#else - // 25 fabs function calls and 25 float additions without SSE - for (int nn = cc - 2; nn <= cc + 2; ++nn) { - hfnbrave += fabsf(cfablur[rrm2][nn]); - hfnbrave += fabsf(cfablur[rrm1][nn]); - hfnbrave += fabsf(cfablur[rr0][nn]); - hfnbrave += fabsf(cfablur[rrp1][nn]); - hfnbrave += fabsf(cfablur[rrp2][nn]); - } -#endif + sum5x5(cfablur, cc - 2, hfnbrave); if (pixdev > varthresh * hfnbrave) { // mark the pixel as "bad" bpMap.set(cc, rr); - counter++; + ++counter; } }//end of pixel evaluation } } }//end of parallel processing + return counter; } From 9bd53a5cc91d16d363802bf32d12bb8babd5af81 Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Thu, 21 Nov 2019 16:23:39 +0100 Subject: [PATCH 3/5] Dual demosaic: reduce memory usage by width * height * 4 byte --- rtengine/dual_demosaic_RT.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/rtengine/dual_demosaic_RT.cc b/rtengine/dual_demosaic_RT.cc index b5839ee8b..0d9999e18 100644 --- a/rtengine/dual_demosaic_RT.cc +++ b/rtengine/dual_demosaic_RT.cc @@ -33,7 +33,7 @@ #include "../rtgui/options.h" -//#define BENCHMARK +#define BENCHMARK #include "StopWatch.h" using namespace std; @@ -66,14 +66,9 @@ void RawImageSource::dual_demosaic_RT(bool isBayer, const procparams::RAWParams return; } - array2D redTmp(winw, winh); - array2D greenTmp(winw, winh); - array2D blueTmp(winw, winh); array2D L(winw, winh); if (isBayer) { - vng4_demosaic(rawData, redTmp, greenTmp, blueTmp); - if (raw.bayersensor.method == procparams::RAWParams::BayerSensor::getMethodString(procparams::RAWParams::BayerSensor::Method::AMAZEVNG4) || raw.bayersensor.method == procparams::RAWParams::BayerSensor::getMethodString(procparams::RAWParams::BayerSensor::Method::PIXELSHIFT)) { amaze_demosaic_RT(0, 0, winw, winh, rawData, red, green, blue, options.chunkSizeAMAZE, options.measure); } else if (raw.bayersensor.method == procparams::RAWParams::BayerSensor::getMethodString(procparams::RAWParams::BayerSensor::Method::DCBVNG4) ) { @@ -87,7 +82,6 @@ void RawImageSource::dual_demosaic_RT(bool isBayer, const procparams::RAWParams } else { xtrans_interpolate (1, false, options.chunkSizeXT, options.measure); } - fast_xtrans_interpolate(rawData, redTmp, greenTmp, blueTmp); } const float xyz_rgb[3][3] = { // XYZ from RGB @@ -114,6 +108,17 @@ void RawImageSource::dual_demosaic_RT(bool isBayer, const procparams::RAWParams buildBlendMask(L, blend, winw, winh, contrastf, 1.f, autoContrast); contrast = contrastf * 100.f; + array2D& redTmp = L; // L is not needed anymore => reuse it + array2D greenTmp(winw, winh); + array2D blueTmp(winw, winh); + + if (isBayer) { + vng4_demosaic(rawData, redTmp, greenTmp, blueTmp); + } else { + fast_xtrans_interpolate(rawData, redTmp, greenTmp, blueTmp); + } + + // the following is split into 3 loops intentionally to avoid cache conflicts on CPUs with only 4-way cache #ifdef _OPENMP #pragma omp parallel for From 8b5533cfcb153c56d5755c83874b8a2f243aa87b Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Thu, 21 Nov 2019 16:28:45 +0100 Subject: [PATCH 4/5] Dual demosaic: disable timing code --- rtengine/dual_demosaic_RT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtengine/dual_demosaic_RT.cc b/rtengine/dual_demosaic_RT.cc index 0d9999e18..69d1a189a 100644 --- a/rtengine/dual_demosaic_RT.cc +++ b/rtengine/dual_demosaic_RT.cc @@ -33,7 +33,7 @@ #include "../rtgui/options.h" -#define BENCHMARK +//#define BENCHMARK #include "StopWatch.h" using namespace std; From ff9eeb4744d6ec571a8e335129c0106135304933 Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Thu, 21 Nov 2019 18:09:14 +0100 Subject: [PATCH 5/5] Capture Sharpening missing from Parameters to save partial processing profile dialog box, fixes #5487 --- rtgui/partialpastedlg.cc | 18 ++++++++++++++++++ rtgui/partialpastedlg.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/rtgui/partialpastedlg.cc b/rtgui/partialpastedlg.cc index a7d2bd0d1..22f608ae4 100644 --- a/rtgui/partialpastedlg.cc +++ b/rtgui/partialpastedlg.cc @@ -136,6 +136,8 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren raw_ca_avoid_colourshift = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_RAWCACORR_AVOIDCOLORSHIFT"))); //--- filmNegative = Gtk::manage (new Gtk::CheckButton (M("PARTIALPASTE_FILMNEGATIVE")) ); + //--- + captureSharpening = Gtk::manage (new Gtk::CheckButton (M("TP_PDSHARPENING_LABEL")) ); Gtk::VBox* vboxes[8]; Gtk::HSeparator* hseps[8]; @@ -253,6 +255,7 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren vboxes[7]->pack_start (*raw_ca_avoid_colourshift, Gtk::PACK_SHRINK, 2); vboxes[7]->pack_start (*Gtk::manage (new Gtk::HSeparator ()), Gtk::PACK_SHRINK, 0); vboxes[7]->pack_start (*filmNegative, Gtk::PACK_SHRINK, 2); + vboxes[7]->pack_start (*captureSharpening, Gtk::PACK_SHRINK, 2); Gtk::VBox* vbCol1 = Gtk::manage (new Gtk::VBox ()); Gtk::VBox* vbCol2 = Gtk::manage (new Gtk::VBox ()); @@ -402,6 +405,8 @@ PartialPasteDlg::PartialPasteDlg (const Glib::ustring &title, Gtk::Window* paren raw_ca_avoid_colourshiftconn = raw_ca_avoid_colourshift->signal_toggled().connect (sigc::bind (sigc::mem_fun(*raw, &Gtk::CheckButton::set_inconsistent), true)); //--- filmNegativeConn = filmNegative->signal_toggled().connect (sigc::bind (sigc::mem_fun(*raw, &Gtk::CheckButton::set_inconsistent), true)); + //--- + captureSharpeningConn = captureSharpening->signal_toggled().connect (sigc::bind (sigc::mem_fun(*raw, &Gtk::CheckButton::set_inconsistent), true)); add_button (M("GENERAL_OK"), Gtk::RESPONSE_OK); add_button (M("GENERAL_CANCEL"), Gtk::RESPONSE_CANCEL); @@ -474,6 +479,7 @@ void PartialPasteDlg::rawToggled () ConnectionBlocker raw_caredblueBlocker(raw_caredblueConn); ConnectionBlocker raw_ca_avoid_colourshiftBlocker(raw_ca_avoid_colourshiftconn); ConnectionBlocker filmNegativeBlocker(filmNegativeConn); + ConnectionBlocker captureSharpeningBlocker(captureSharpeningConn); raw->set_inconsistent (false); @@ -503,6 +509,7 @@ void PartialPasteDlg::rawToggled () raw_caredblue->set_active (raw->get_active ()); raw_ca_avoid_colourshift->set_active (raw->get_active ()); filmNegative->set_active (raw->get_active()); + captureSharpening->set_active (raw->get_active()); } void PartialPasteDlg::basicToggled () @@ -981,6 +988,17 @@ void PartialPasteDlg::applyPaste (rtengine::procparams::ProcParams* dstPP, Param filterPE.filmNegative.blueRatio = falsePE.filmNegative.blueRatio; } + if (!captureSharpening->get_active ()) { + filterPE.pdsharpening.enabled = falsePE.pdsharpening.enabled; + filterPE.pdsharpening.contrast = falsePE.pdsharpening.contrast; + filterPE.pdsharpening.autoContrast = falsePE.pdsharpening.autoContrast; + filterPE.pdsharpening.autoRadius = falsePE.pdsharpening.autoRadius; + filterPE.pdsharpening.deconvradius = falsePE.pdsharpening.deconvradius; + filterPE.pdsharpening.deconvradiusOffset = falsePE.pdsharpening.deconvradiusOffset; + filterPE.pdsharpening.deconviter = falsePE.pdsharpening.deconviter; + filterPE.pdsharpening.deconvitercheck = falsePE.pdsharpening.deconvitercheck; + } + if (dstPE) { *dstPE = filterPE; } diff --git a/rtgui/partialpastedlg.h b/rtgui/partialpastedlg.h index da6c9251a..1403e7c1b 100644 --- a/rtgui/partialpastedlg.h +++ b/rtgui/partialpastedlg.h @@ -141,6 +141,7 @@ public: Gtk::CheckButton* ff_ClipControl; Gtk::CheckButton* filmNegative; + Gtk::CheckButton* captureSharpening; sigc::connection everythingConn, basicConn, detailConn, colorConn, lensConn, compositionConn, metaConn, rawConn, advancedConn; @@ -153,6 +154,7 @@ public: sigc::connection df_fileConn, df_AutoSelectConn, ff_fileConn, ff_AutoSelectConn, ff_BlurRadiusConn, ff_BlurTypeConn, ff_ClipControlConn; sigc::connection raw_caredblueConn, raw_ca_autocorrectConn, raw_ca_avoid_colourshiftconn, raw_hotpix_filtConn, raw_deadpix_filtConn, raw_pdaf_lines_filterConn, raw_linenoiseConn, raw_greenthreshConn, raw_ccStepsConn, raw_methodConn, raw_borderConn, raw_imagenumConn, raw_dcb_iterationsConn, raw_lmmse_iterationsConn, raw_pixelshiftConn, raw_dcb_enhanceConn, raw_exposConn, raw_blackConn; sigc::connection filmNegativeConn; + sigc::connection captureSharpeningConn; public: PartialPasteDlg (const Glib::ustring &title, Gtk::Window* parent);