diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index 2b0404c85..7437fda35 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -36,7 +36,8 @@ #include "procparams.h" #include "sleef.c" #include "opthelper.h" - +#define BENCHMARK +#include "StopWatch.h" #ifdef _OPENMP #include #endif @@ -3886,12 +3887,12 @@ const float d65_white[3] = { 0.950456, 1, 1.088754 }; void RawImageSource::cielab (const float (*rgb)[3], float* l, float* a, float *b, const int width, const int height, const int labWidth, const float xyz_cam[3][3]) { - static LUTf cbrt(0x10000); + static LUTf cbrt(0x14000); static bool cbrtinit = false; if (!rgb) { if(!cbrtinit) { - for (int i = 0; i < 0x10000; i++) { + for (int i = 0; i < 0x14000; i++) { double r = i / 65535.0; cbrt[i] = r > 0.008856f ? std::cbrt(r) : 7.787f * r + 16.f / 116.f; } @@ -4014,10 +4015,11 @@ void RawImageSource::xtransborder_interpolate (int border) Frank Markesteijn's algorithm for Fuji X-Trans sensors adapted to RT by Ingo Weyrich 2014 */ - +// override CLIP function to test unclipped output +#define CLIP(x) (x) void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) { - + BENCHFUN constexpr int ts = 114; /* Tile Size */ constexpr int tsh = ts / 2; /* half of Tile Size */ @@ -4140,6 +4142,7 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) uint8_t (*homo)[ts][ts] = (uint8_t (*)[ts][ts]) (lab); // we can reuse the lab-buffer because they are not used together s_minmaxgreen (*greenminmaxtile)[tsh] = (s_minmaxgreen(*)[tsh]) (lab); // we can reuse the lab-buffer because they are not used together uint8_t (*homosum)[ts][ts] = (uint8_t (*)[ts][ts]) (drv); // we can reuse the drv-buffer because they are not used together + uint8_t (*homosummax)[ts] = (uint8_t (*)[ts]) homo[ndir - 1]; // we can reuse the homo-buffer because they are not used together #ifdef _OPENMP #pragma omp for collapse(2) schedule(dynamic) nowait @@ -4284,9 +4287,13 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) } } } else { - for (int col = leftstart; col < mcol; col += coloffset, coloffset ^= 3) { + short *hexmod[2]; + hexmod[0] = allhex[0][row % 3][leftstart % 3]; + hexmod[1] = allhex[0][row % 3][(leftstart + coloffset) % 3]; + + for (int col = leftstart, hexindex = 0; col < mcol; col += coloffset, coloffset ^= 3, hexindex ^= 1) { float *pix = &rawData[row][col]; - short *hex = allhex[0][row % 3][col % 3]; + short *hex = hexmod[hexindex]; float color[4]; color[0] = 0.6796875f * (pix[hex[1]] + pix[hex[0]]) - 0.1796875f * (pix[2 * hex[1]] + pix[2 * hex[0]]); @@ -4335,9 +4342,12 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) } } else { int f = fcol(row, leftstart); + short *hexmod[2]; + hexmod[0] = allhex[1][row % 3][leftstart % 3]; + hexmod[1] = allhex[1][row % 3][(leftstart + coloffset) % 3]; - for (int col = leftstart; col < mcol - 2; col += coloffset, coloffset ^= 3, f = f ^ (coloffset & 2) ) { - short *hex = allhex[1][row % 3][col % 3]; + for (int col = leftstart, hexindex = 0; col < mcol - 2; col += coloffset, coloffset ^= 3, f = f ^ (coloffset & 2), hexindex ^= 1 ) { + short *hex = hexmod[hexindex]; for (int d = 3; d < 6; d++) { float (*rix)[3] = &rgb[(d - 2) ^ 1][row - top][col - left]; @@ -4448,9 +4458,13 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) for (int row = topstart; row < mrow - 2; row++) { if ((row - sgrow) % 3) { - for (int col = leftstart, coloffset = coloffsetstart; col < mcol - 2; col += coloffset, coloffset ^= 3) { + short *hexmod[2]; + hexmod[0] = allhex[1][row % 3][leftstart % 3]; + hexmod[1] = allhex[1][row % 3][(leftstart + coloffsetstart) % 3]; + + for (int col = leftstart, coloffset = coloffsetstart, hexindex = 0; col < mcol - 2; col += coloffset, coloffset ^= 3, hexindex ^= 1) { float (*rix)[3] = &rgb[0][row - top][col - left]; - short *hex = allhex[1][row % 3][col % 3]; + short *hex = hexmod[hexindex]; for (int d = 0; d < ndir; d += 2, rix += ts * ts) { if (hex[d] + hex[d + 1]) { @@ -4582,7 +4596,7 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) for (int d = 0; d < ndir; d++) for (int v = -1; v <= 1; v++) for (int h = -1; h <= 1; h++) { - homo[d][row][col] += (drv[d][row + v - 5][col + h - 5] <= tr ? 1 : 0) ; + homo[d][row][col] += (drv[d][row + v - 5][col + h - 5] <= tr ? 1 : 0); } } @@ -4643,21 +4657,63 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) } } - /* Average the most homogenous pixels for the final result: */ + // calculate maximum of homogeneity maps per pixel. Vectorized calculation is a tiny bit faster than on the fly calculation in next step +#ifdef __SSE2__ + vint maskv = _mm_set1_epi8(31); +#endif + + for (int row = MIN(top, 8); row < mrow - 8; row++) { + int col = startcol; +#ifdef __SSE2__ + int endcol = row < mrow - 9 ? mcol - 8 : mcol - 23; + + for (; col < endcol; col += 16) { + vint maxval1 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[0][row][col]), _mm_loadu_si128((vint*)&homosum[1][row][col])); + vint maxval2 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[2][row][col]), _mm_loadu_si128((vint*)&homosum[3][row][col])); + + if(ndir > 4) { + vint maxval3 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[4][row][col]), _mm_loadu_si128((vint*)&homosum[5][row][col])); + vint maxval4 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[6][row][col]), _mm_loadu_si128((vint*)&homosum[7][row][col])); + maxval1 = _mm_max_epu8(maxval1, maxval3); + maxval1 = _mm_max_epu8(maxval1, maxval4); + } + + maxval1 = _mm_max_epu8(maxval1, maxval2); + // there is no shift intrinsic for epu8. Shift using epi32 and mask the wrong bits out + vint subv = _mm_srli_epi32( maxval1, 3 ); + subv = _mm_and_si128(subv, maskv); + maxval1 = _mm_subs_epu8(maxval1, subv); + _mm_storeu_si128((vint*)&homosummax[row][col], maxval1); + } + +#endif + + for (; col < mcol - 8; col ++) { + uint8_t maxval = homosum[0][row][col]; + + for(int d = 1; d < ndir; d++) { + maxval = maxval < homosum[d][row][col] ? homosum[d][row][col] : maxval; + } + + maxval -= maxval >> 3; + homosummax[row][col] = maxval; + } + } + + + /* Average the most homogeneous pixels for the final result: */ + uint8_t hm[8]; + for (int row = MIN(top, 8); row < mrow - 8; row++) for (int col = MIN(left, 8); col < mcol - 8; col++) { - uint8_t hm[8]; - uint8_t maxval = 0; int d = 0; for (; d < 4; d++) { hm[d] = homosum[d][row][col]; - maxval = (maxval < hm[d] ? hm[d] : maxval); } for (; d < ndir; d++) { hm[d] = homosum[d][row][col]; - maxval = (maxval < hm[d] ? hm[d] : maxval); if (hm[d - 4] < hm[d]) { hm[d - 4] = 0; @@ -4666,9 +4722,10 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) } } - maxval -= maxval >> 3; float avg[4] = {0.f}; + uint8_t maxval = homosummax[row][col]; + for (d = 0; d < ndir; d++) if (hm[d] >= maxval) { FORC3 avg[c] += rgb[d][row][col][c]; @@ -4698,7 +4755,7 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab) } } - +#undef CLIP void RawImageSource::fast_xtrans_interpolate () { if (settings->verbose) {