From 66382743cd0ab251e361b9615712aaeb9f36f19c Mon Sep 17 00:00:00 2001 From: "U-coolermaster2\\cuniek" Date: Mon, 27 Feb 2017 19:41:23 +0100 Subject: [PATCH 1/5] Improved DCB, less macroblicking on diagonals, much faster code --- rtengine/demosaic_algos.cc | 403 ++++++++++++++++++++----------------- rtengine/rawimagesource.h | 26 +-- 2 files changed, 232 insertions(+), 197 deletions(-) diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index e641ed777..0f0e4841f 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -1,3 +1,4 @@ +/* /* * This file is part of RawTherapee. * @@ -37,7 +38,7 @@ #include "sleef.c" #include "opthelper.h" #include "median.h" -//#define BENCHMARK +#define BENCHMARK #include "StopWatch.h" #ifdef _OPENMP #include @@ -3251,7 +3252,7 @@ void RawImageSource::refinement_lassus(int PassCount) * the code is open source (BSD licence) */ -#define TILESIZE 256 +#define TILESIZE 192 #define TILEBORDER 10 #define CACHESIZE (TILESIZE+2*TILEBORDER) @@ -3279,7 +3280,7 @@ inline void RawImageSource::dcb_initTileLimits(int &colMin, int &rowMin, int &co } } -void RawImageSource::fill_raw( float (*cache )[4], int x0, int y0, float** rawData) +void RawImageSource::fill_raw( float (*cache )[3], int x0, int y0, float** rawData) { int rowMin, colMin, rowMax, colMax; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 0); @@ -3290,7 +3291,7 @@ void RawImageSource::fill_raw( float (*cache )[4], int x0, int y0, float** rawDa } } -void RawImageSource::fill_border( float (*cache )[4], int border, int x0, int y0) +void RawImageSource::fill_border( float (*cache )[3], int border, int x0, int y0) { unsigned row, col, y, x, f, c; float sum[8]; @@ -3325,93 +3326,55 @@ void RawImageSource::fill_border( float (*cache )[4], int border, int x0, int y0 } } } + // saves red and blue -void RawImageSource::copy_to_buffer( float (*buffer)[3], float (*image)[4]) + +// change buffer[3] -> buffer[2], possibly to buffer[1] if split +// into two loops, one for R and another for B, could also be smaller because +// there is no need for green pixels pass +// this would decrease the amount of needed memory +// from megapixels*2 records to megapixels*0.5 +// also don't know if float is needed as data is 1-65536 integer (I believe!!) +// comment from Ingo: float is needed because rawdata in rt is float +void RawImageSource::copy_to_buffer( float (*buffer)[2], float (*image)[3]) { for (int indx = 0; indx < CACHESIZE * CACHESIZE; indx++) { buffer[indx][0] = image[indx][0]; //R - buffer[indx][2] = image[indx][2]; //B + buffer[indx][1] = image[indx][2]; //B } } // restores red and blue -void RawImageSource::restore_from_buffer(float (*image)[4], float (*buffer)[3]) + +// other comments like in copy_to_buffer +void RawImageSource::restore_from_buffer(float (*image)[3], float (*buffer)[2]) { for (int indx = 0; indx < CACHESIZE * CACHESIZE; indx++) { image[indx][0] = buffer[indx][0]; //R - image[indx][2] = buffer[indx][2]; //B + image[indx][2] = buffer[indx][1]; //B } } // First pass green interpolation -void RawImageSource::dcb_hid(float (*image)[4], float (*bufferH)[3], float (*bufferV)[3], int x0, int y0) + +// remove entirely: bufferH and bufferV +void RawImageSource::dcb_hid(float (*image)[3], int x0, int y0) { - const int u = CACHESIZE, v = 2 * CACHESIZE; - int rowMin, colMin, rowMax, colMax; + const int u = CACHESIZE; + int rowMin, colMin, rowMax, colMax, c; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2); - // green pixels - for (int row = rowMin; row < rowMax; row++) { - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) { - assert(indx - u >= 0 && indx + u < u * u); - bufferH[indx][1] = (image[indx - 1][1] + image[indx + 1][1]) * 0.5f; - bufferV[indx][1] = (image[indx + u][1] + image[indx - u][1]) * 0.5f; - } - } - - // red in blue pixel, blue in red pixel +// simple green bilinear in R and B pixels for (int row = rowMin; row < rowMax; row++) - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { + for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx - u - 1 >= 0 && indx + u + 1 < u * u && c >= 0 && c < 3); - bufferH[indx][c] = ( 4.f * bufferH[indx][1] - - bufferH[indx + u + 1][1] - bufferH[indx + u - 1][1] - bufferH[indx - u + 1][1] - bufferH[indx - u - 1][1] - + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f; - bufferV[indx][c] = ( 4.f * bufferV[indx][1] - - bufferV[indx + u + 1][1] - bufferV[indx + u - 1][1] - bufferV[indx - u + 1][1] - bufferV[indx - u - 1][1] - + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f; - } - - // red or blue in green pixels - for (int row = rowMin; row < rowMax; row++) - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) { - assert(indx - u >= 0 && indx + u < u * u && c >= 0 && c < 3 && d >= 0 && d < 3); - bufferH[indx][c] = (image[indx + 1][c] + image[indx - 1][c]) * 0.5f; - bufferH[indx][d] = (2.f * bufferH[indx][1] - bufferH[indx + u][1] - bufferH[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f; - bufferV[indx][c] = (2.f * bufferV[indx][1] - bufferV[indx + 1][1] - bufferV[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f; - bufferV[indx][d] = (image[indx + u][d] + image[indx - u][d]) * 0.5f; - } - - // Decide green pixels - for (int row = rowMin; row < rowMax; row++) - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col), d = 2 - c; col < colMax; col += 2, indx += 2) { - float current = max(image[indx + v][c], image[indx - v][c], image[indx - 2][c], image[indx + 2][c]) - - min(image[indx + v][c], image[indx - v][c], image[indx - 2][c], image[indx + 2][c]) + - max(image[indx + 1 + u][d], image[indx + 1 - u][d], image[indx - 1 + u][d], image[indx - 1 - u][d]) - - min(image[indx + 1 + u][d], image[indx + 1 - u][d], image[indx - 1 + u][d], image[indx - 1 - u][d]); - - float currentH = max(bufferH[indx + v][d], bufferH[indx - v][d], bufferH[indx - 2][d], bufferH[indx + 2][d]) - - min(bufferH[indx + v][d], bufferH[indx - v][d], bufferH[indx - 2][d], bufferH[indx + 2][d]) + - max(bufferH[indx + 1 + u][c], bufferH[indx + 1 - u][c], bufferH[indx - 1 + u][c], bufferH[indx - 1 - u][c]) - - min(bufferH[indx + 1 + u][c], bufferH[indx + 1 - u][c], bufferH[indx - 1 + u][c], bufferH[indx - 1 - u][c]); - - float currentV = max(bufferV[indx + v][d], bufferV[indx - v][d], bufferV[indx - 2][d], bufferV[indx + 2][d]) - - min(bufferV[indx + v][d], bufferV[indx - v][d], bufferV[indx - 2][d], bufferV[indx + 2][d]) + - max(bufferV[indx + 1 + u][c], bufferV[indx + 1 - u][c], bufferV[indx - 1 + u][c], bufferV[indx - 1 - u][c]) - - min(bufferV[indx + 1 + u][c], bufferV[indx + 1 - u][c], bufferV[indx - 1 + u][c], bufferV[indx - 1 - u][c]); - - assert(indx >= 0 && indx < u * u); - - if (ABS(current - currentH) < ABS(current - currentV)) { - image[indx][1] = bufferH[indx][1]; - } else { - image[indx][1] = bufferV[indx][1]; - } - } + image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]); + } } -// missing colors are interpolated -void RawImageSource::dcb_color(float (*image)[4], int x0, int y0) +// missing colours are interpolated +void RawImageSource::dcb_color(float (*image)[3], int x0, int y0) { const int u = CACHESIZE; int rowMin, colMin, rowMax, colMax; @@ -3421,22 +3384,39 @@ void RawImageSource::dcb_color(float (*image)[4], int x0, int y0) for (int row = rowMin; row < rowMax; row++) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx >= 0 && indx < u * u && c >= 0 && c < 4); + + +//Jacek comment: one multiplication less + image[indx][c] = image[indx][1] + + ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] + - (image[indx + u + 1][1] + image[indx + u - 1][1] + image[indx - u + 1][1] + image[indx - u - 1][1]) ) * 0.25f; + +/* original image[indx][c] = ( 4.f * image[indx][1] - image[indx + u + 1][1] - image[indx + u - 1][1] - image[indx - u + 1][1] - image[indx - u - 1][1] + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f; - } +*/ + } // red or blue in green pixels for (int row = rowMin; row < rowMax; row++) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) { assert(indx >= 0 && indx < u * u && c >= 0 && c < 4); + +//Jacek comment: two multiplications (in total) less + image[indx][c] = image[indx][1] + (image[indx + 1][c] + image[indx - 1][c] - (image[indx + 1][1] + image[indx - 1][1])) * 0.5f; + image[indx][d] = image[indx][1] + (image[indx + u][d] + image[indx - u][d] - (image[indx + u][1] + image[indx - u][1])) * 0.5f; + + +/* original image[indx][c] = (2.f * image[indx][1] - image[indx + 1][1] - image[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f; image[indx][d] = (2.f * image[indx][1] - image[indx + u][1] - image[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f; - } +*/ + } } // green correction -void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0) +void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0) { const int u = CACHESIZE, v = 2 * CACHESIZE; int rowMin, colMin, rowMax, colMax; @@ -3445,8 +3425,16 @@ void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0) for (int row = rowMin; row < rowMax; row++) { for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx - v >= 0 && indx + v < u * u); - image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f + + +//Jacek comment: one multiplication less + image[indx][1] = image[indx][c] + + (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1] + - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f; + +/* original + image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f + image[indx][c] - ( image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c]) * 0.25f; +*/ } } } @@ -3456,9 +3444,12 @@ void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0) // 1 = vertical // 0 = horizontal // saved in image[][3] -void RawImageSource::dcb_map(float (*image)[4], int x0, int y0) + +// seems at least 2 persons implemented some code, as this one has different coding style, could be unified +// I don't know if *pix is faster than a loop working on image[] directly +void RawImageSource::dcb_map(float (*image)[3], uint8_t *map, int x0, int y0) { - const int u = 4 * CACHESIZE; + const int u = 3 * CACHESIZE; int rowMin, colMin, rowMax, colMax; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2); @@ -3468,36 +3459,41 @@ void RawImageSource::dcb_map(float (*image)[4], int x0, int y0) assert(indx >= 0 && indx < u * u); - if ( *pix > ( pix[-4] + pix[+4] + pix[-u] + pix[+u]) / 4 ) { - image[indx][3] = ((min(pix[-4], pix[+4]) + pix[-4] + pix[+4] ) < (min(pix[-u], pix[+u]) + pix[-u] + pix[+u])); + // comparing 4 * a to (b+c+d+e) instead of a to (b+c+d+e)/4 is faster because divisions are slow + if ( 4 * (*pix) > ( (pix[-3] + pix[+3]) + (pix[-u] + pix[+u])) ) { + map[indx] = ((min(pix[-3], pix[+3]) + (pix[-3] + pix[+3]) ) < (min(pix[-u], pix[+u]) + (pix[-u] + pix[+u]))); } else { - image[indx][3] = ((max(pix[-4], pix[+4]) + pix[-4] + pix[+4] ) > (max(pix[-u], pix[+u]) + pix[-u] + pix[+u])); + map[indx] = ((max(pix[-3], pix[+3]) + (pix[-3] + pix[+3]) ) > (max(pix[-u], pix[+u]) + (pix[-u] + pix[+u]))); } } } } // interpolated green pixels are corrected using the map -void RawImageSource::dcb_correction(float (*image)[4], int x0, int y0) +void RawImageSource::dcb_correction(float (*image)[3], uint8_t *map, int x0, int y0) { const int u = CACHESIZE, v = 2 * CACHESIZE; int rowMin, colMin, rowMax, colMax; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2); for (int row = rowMin; row < rowMax; row++) { - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) { - float current = 4.f * image[indx][3] + - 2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3]) + - image[indx + v][3] + image[indx - v][3] + image[indx + 2][3] + image[indx - 2][3]; + for (int indx = row * CACHESIZE + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1); indx < row * CACHESIZE + colMax; indx += 2) { +// for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) { + float current = 4 * map[indx] + + 2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1]) + + map[indx + v] + map[indx - v] + map[indx + 2] + map[indx - 2]; assert(indx >= 0 && indx < u * u); - image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) * 0.5f + current * (image[indx - u][1] + image[indx + u][1]) * 0.5f ) * 0.0625f; + image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) + current * (image[indx - u][1] + image[indx + u][1]) ) * 0.03125f; +// image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) * 0.5f + current * (image[indx - u][1] + image[indx + u][1]) * 0.5f ) * 0.0625f; } } } // R and B smoothing using green contrast, all pixels except 2 pixel wide border -void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0) + +// again code with *pix, is this kind of calculating faster in C, than this what was commented? +void RawImageSource::dcb_pp(float (*image)[3], int x0, int y0) { const int u = CACHESIZE; int rowMin, colMin, rowMax, colMax; @@ -3505,10 +3501,10 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0) for (int row = rowMin; row < rowMax; row++) for (int col = colMin, indx = row * CACHESIZE + col; col < colMax; col++, indx++) { - //int r1 = ( image[indx-1][0] + image[indx+1][0] + image[indx-u][0] + image[indx+u][0] + image[indx-u-1][0] + image[indx+u+1][0] + image[indx-u+1][0] + image[indx+u-1][0])/8; - //int g1 = ( image[indx-1][1] + image[indx+1][1] + image[indx-u][1] + image[indx+u][1] + image[indx-u-1][1] + image[indx+u+1][1] + image[indx-u+1][1] + image[indx+u-1][1])/8; - //int b1 = ( image[indx-1][2] + image[indx+1][2] + image[indx-u][2] + image[indx+u][2] + image[indx-u-1][2] + image[indx+u+1][2] + image[indx-u+1][2] + image[indx+u-1][2])/8; - float (*pix)[4] = image + (indx - u - 1); +// float r1 = image[indx-1][0] + image[indx+1][0] + image[indx-u][0] + image[indx+u][0] + image[indx-u-1][0] + image[indx+u+1][0] + image[indx-u+1][0] + image[indx+u-1][0]; +// float g1 = image[indx-1][1] + image[indx+1][1] + image[indx-u][1] + image[indx+u][1] + image[indx-u-1][1] + image[indx+u+1][1] + image[indx-u+1][1] + image[indx+u-1][1]; +// float b1 = image[indx-1][2] + image[indx+1][2] + image[indx-u][2] + image[indx+u][2] + image[indx-u-1][2] + image[indx+u+1][2] + image[indx-u+1][2] + image[indx+u-1][2]; + float (*pix)[3] = image + (indx - u - 1); float r1 = (*pix)[0]; float g1 = (*pix)[1]; float b1 = (*pix)[2]; @@ -3543,8 +3539,8 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0) r1 *= 0.125f; g1 *= 0.125f; b1 *= 0.125f; - r1 = r1 + ( image[indx][1] - g1 ); - b1 = b1 + ( image[indx][1] - g1 ); + r1 += ( image[indx][1] - g1 ); + b1 += ( image[indx][1] - g1 ); assert(indx >= 0 && indx < u * u); image[indx][0] = r1; @@ -3554,70 +3550,90 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0) // interpolated green pixels are corrected using the map // with correction -void RawImageSource::dcb_correction2(float (*image)[4], int x0, int y0) +void RawImageSource::dcb_correction2(float (*image)[3], uint8_t *map, int x0, int y0) { const int u = CACHESIZE, v = 2 * CACHESIZE; int rowMin, colMin, rowMax, colMax; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 4); for (int row = rowMin; row < rowMax; row++) { - for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { - float current = 4.f * image[indx][3] + - 2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3]) + - image[indx + v][3] + image[indx - v][3] + image[indx + 2][3] + image[indx - 2][3]; + for (int indx = row * CACHESIZE + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1)); indx < row * CACHESIZE + colMax; indx += 2) { + // map values are uint8_t either 0 or 1. Adding them using integer instructions is perfectly valid and fast. Final result is converted to float then + float current = 4 * map[indx] + + 2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1]) + + map[indx + v] + map[indx - v] + map[indx + 2] + map[indx - 2]; assert(indx >= 0 && indx < u * u); + +// Jacek comment: works now, and has 3 float mults and 9 float adds + image[indx][1] = image[indx][c] + + ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c])) + + current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f; + + + // 4 float mults and 9 float adds + // Jacek comment: not mathematically identical to original +/* image[indx][1] = 16.f * image[indx][c] + + ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) + - (image[indx + 2][c] + image[indx - 2][c])) + + current * ((image[indx - u][1] + image[indx + u][1]) - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f; +*/ + // 7 float mults and 10 float adds + // original code +/* image[indx][1] = ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) * 0.5f + image[indx][c] - (image[indx + 2][c] + image[indx - 2][c]) * 0.5f) + current * ((image[indx - u][1] + image[indx + u][1]) * 0.5f + image[indx][c] - (image[indx + v][c] + image[indx - v][c]) * 0.5f)) * 0.0625f; - } +*/ + } } } // image refinement -void RawImageSource::dcb_refinement(float (*image)[4], int x0, int y0) +void RawImageSource::dcb_refinement(float (*image)[3], uint8_t *map, int x0, int y0) { const int u = CACHESIZE, v = 2 * CACHESIZE, w = 3 * CACHESIZE; int rowMin, colMin, rowMax, colMax; dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 4); - float f[5], g1, g2; + float f0, f1, f2, g1, h0, h1, h2, g2, current; for (int row = rowMin; row < rowMax; row++) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { - float current = 4.f * image[indx][3] + - 2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3]) - + image[indx + v][3] + image[indx - v][3] + image[indx - 2][3] + image[indx + 2][3]; - f[0] = (float)(image[indx - u][1] + image[indx + u][1]) / (2.f + 2.f * image[indx][c]); - f[1] = 2.f * image[indx - u][1] / (2 + image[indx - v][c] + image[indx][c]); - f[2] = (float)(image[indx - u][1] + image[indx - w][1]) / (2.f + 2.f * image[indx - v][c]); - f[3] = 2.f * image[indx + u][1] / (2 + image[indx + v][c] + image[indx][c]); - f[4] = (float)(image[indx + u][1] + image[indx + w][1]) / (2.f + 2.f * image[indx + v][c]); + float current = 4 * map[indx] + + 2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1]) + + map[indx + v] + map[indx - v] + map[indx - 2] + map[indx + 2]; - g1 = (f[0] + f[1] + f[2] + f[3] + f[4] - max(f[1], f[2], f[3], f[4]) - min(f[1], f[2], f[3], f[4])) / 3.f; + float currPix = image[indx][c]; - f[0] = (float)(image[indx - 1][1] + image[indx + 1][1]) / (2.f + 2.f * image[indx][c]); - f[1] = 2.f * image[indx - 1][1] / (2 + image[indx - 2][c] + image[indx][c]); - f[2] = (float)(image[indx - 1][1] + image[indx - 3][1]) / (2.f + 2.f * image[indx - 2][c]); - f[3] = 2.f * image[indx + 1][1] / (2 + image[indx + 2][c] + image[indx][c]); - f[4] = (float)(image[indx + 1][1] + image[indx + 3][1]) / (2.f + 2.f * image[indx + 2][c]); + f0 = (float)(image[indx - u][1] + image[indx + u][1]) / (1.f + 2.f * currPix); + f1 = 2.f * image[indx - u][1] / (1.f + image[indx - v][c] + currPix); + f2 = 2.f * image[indx + u][1] / (1.f + image[indx + v][c] + currPix); - g2 = (f[0] + f[1] + f[2] + f[3] + f[4] - max(f[1], f[2], f[3], f[4]) - min(f[1], f[2], f[3], f[4])) / 3.f; + g1 = f0 + f1 + f2; + h0 = (float)(image[indx - 1][1] + image[indx + 1][1]) / (1.f + 2.f * currPix); + h1 = 2.f * image[indx - 1][1] / (1.f + image[indx - 2][c] + currPix); + h2 = 2.f * image[indx + 1][1] / (1.f + image[indx + 2][c] + currPix); + + g2 = h0 + h1 + h2; + + // new green value assert(indx >= 0 && indx < u * u); - image[indx][1] = (2.f + image[indx][c]) * (current * g1 + (16.f - current) * g2) * 0.0625f; + currPix *= (current * g1 + (16.f - current) * g2) / 48.f; - // get rid of the overshooted pixels - float min_f = min(image[indx + 1 + u][1], min(image[indx + 1 - u][1], min(image[indx - 1 + u][1], min(image[indx - 1 - u][1], min(image[indx - 1][1], min(image[indx + 1][1], min(image[indx - u][1], image[indx + u][1]))))))); - float max_f = max(image[indx + 1 + u][1], max(image[indx + 1 - u][1], max(image[indx - 1 + u][1], max(image[indx - 1 - u][1], max(image[indx - 1][1], max(image[indx + 1][1], max(image[indx - u][1], image[indx + u][1]))))))); + // get rid of the overshot pixels + float minVal = min(image[indx - 1][1], min(image[indx + 1][1], min(image[indx - u][1], image[indx + u][1]))); + float maxVal = max(image[indx - 1][1], max(image[indx + 1][1], max(image[indx - u][1], image[indx + u][1]))); + + image[indx][1] = LIM(currPix, minVal, maxVal); - image[indx][1] = LIM(image[indx][1], min_f, max_f); } } -// missing colors are interpolated using high quality algorithm by Luis Sanz Rodriguez -void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*chroma)[2]) +// missing colours are interpolated using high quality algorithm by Luis Sanz Rodriguez +void RawImageSource::dcb_color_full(float (*image)[3], int x0, int y0, float (*chroma)[2]) { const int u = CACHESIZE, w = 3 * CACHESIZE; int rowMin, colMin, rowMax, colMax; @@ -3637,10 +3653,15 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c f[1] = 1.f / (float)(1.f + fabs(chroma[indx - u + 1][c] - chroma[indx + u - 1][c]) + fabs(chroma[indx - u + 1][c] - chroma[indx - w + 3][c]) + fabs(chroma[indx + u - 1][c] - chroma[indx - w + 3][c])); f[2] = 1.f / (float)(1.f + fabs(chroma[indx + u - 1][c] - chroma[indx - u + 1][c]) + fabs(chroma[indx + u - 1][c] - chroma[indx + w + 3][c]) + fabs(chroma[indx - u + 1][c] - chroma[indx + w - 3][c])); f[3] = 1.f / (float)(1.f + fabs(chroma[indx + u + 1][c] - chroma[indx - u - 1][c]) + fabs(chroma[indx + u + 1][c] - chroma[indx + w - 3][c]) + fabs(chroma[indx - u - 1][c] - chroma[indx + w + 3][c])); - g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * chroma[indx - w - 1][c] - 0.075f * chroma[indx - u - 3][c]; - g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * chroma[indx - w + 1][c] - 0.075f * chroma[indx - u + 3][c]; - g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * chroma[indx + w - 1][c] - 0.075f * chroma[indx + u - 3][c]; - g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * chroma[indx + w + 1][c] - 0.075f * chroma[indx + u + 3][c]; + g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * (chroma[indx - w - 1][c] + chroma[indx - u - 3][c]); + g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * (chroma[indx - w + 1][c] + chroma[indx - u + 3][c]); + g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * (chroma[indx + w - 1][c] + chroma[indx + u - 3][c]); + g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * (chroma[indx + w + 1][c] + chroma[indx + u + 3][c]); + +// g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * chroma[indx - w - 1][c] - 0.075f * chroma[indx - u - 3][c]; +// g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * chroma[indx - w + 1][c] - 0.075f * chroma[indx - u + 3][c]; +// g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * chroma[indx + w - 1][c] - 0.075f * chroma[indx + u - 3][c]; +// g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * chroma[indx + w + 1][c] - 0.075f * chroma[indx + u + 3][c]; assert(indx >= 0 && indx < u * u && c >= 0 && c < 2); chroma[indx][c] = (f[0] * g[0] + f[1] * g[1] + f[2] * g[2] + f[3] * g[3]) / (f[0] + f[1] + f[2] + f[3]); @@ -3649,15 +3670,20 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c for (int row = rowMin; row < rowMax; row++) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1) / 2; col < colMax; col += 2, indx += 2) for(int d = 0; d <= 1; c = 1 - c, d++) { - f[0] = 1.f / (float)(1.f + fabs(chroma[indx - u][c] - chroma[indx + u][c]) + fabs(chroma[indx - u][c] - chroma[indx - w][c]) + fabs(chroma[indx + u][c] - chroma[indx - w][c])); - f[1] = 1.f / (float)(1.f + fabs(chroma[indx + 1][c] - chroma[indx - 1][c]) + fabs(chroma[indx + 1][c] - chroma[indx + 3][c]) + fabs(chroma[indx - 1][c] - chroma[indx + 3][c])); - f[2] = 1.f / (float)(1.f + fabs(chroma[indx - 1][c] - chroma[indx + 1][c]) + fabs(chroma[indx - 1][c] - chroma[indx - 3][c]) + fabs(chroma[indx + 1][c] - chroma[indx - 3][c])); - f[3] = 1.f / (float)(1.f + fabs(chroma[indx + u][c] - chroma[indx - u][c]) + fabs(chroma[indx + u][c] - chroma[indx + w][c]) + fabs(chroma[indx - u][c] - chroma[indx + w][c])); + f[0] = 1.f / (1.f + fabs(chroma[indx - u][c] - chroma[indx + u][c]) + fabs(chroma[indx - u][c] - chroma[indx - w][c]) + fabs(chroma[indx + u][c] - chroma[indx - w][c])); + f[1] = 1.f / (1.f + fabs(chroma[indx + 1][c] - chroma[indx - 1][c]) + fabs(chroma[indx + 1][c] - chroma[indx + 3][c]) + fabs(chroma[indx - 1][c] - chroma[indx + 3][c])); + f[2] = 1.f / (1.f + fabs(chroma[indx - 1][c] - chroma[indx + 1][c]) + fabs(chroma[indx - 1][c] - chroma[indx - 3][c]) + fabs(chroma[indx + 1][c] - chroma[indx - 3][c])); + f[3] = 1.f / (1.f + fabs(chroma[indx + u][c] - chroma[indx - u][c]) + fabs(chroma[indx + u][c] - chroma[indx + w][c]) + fabs(chroma[indx - u][c] - chroma[indx + w][c])); - g[0] = 0.875f * chroma[indx - u][c] + 0.125f * chroma[indx - w][c]; - g[1] = 0.875f * chroma[indx + 1][c] + 0.125f * chroma[indx + 3][c]; - g[2] = 0.875f * chroma[indx - 1][c] + 0.125f * chroma[indx - 3][c]; - g[3] = 0.875f * chroma[indx + u][c] + 0.125f * chroma[indx + w][c]; + g[0] = intp(0.875f, chroma[indx - u][c], chroma[indx - w][c]); + g[1] = intp(0.875f, chroma[indx + 1][c], chroma[indx + 3][c]); + g[2] = intp(0.875f, chroma[indx - 1][c], chroma[indx - 3][c]); + g[3] = intp(0.875f, chroma[indx + u][c], chroma[indx + w][c]); + +// g[0] = 0.875f * chroma[indx - u][c] + 0.125f * chroma[indx - w][c]; +// g[1] = 0.875f * chroma[indx + 1][c] + 0.125f * chroma[indx + 3][c]; +// g[2] = 0.875f * chroma[indx - 1][c] + 0.125f * chroma[indx - 3][c]; +// g[3] = 0.875f * chroma[indx + u][c] + 0.125f * chroma[indx + w][c]; assert(indx >= 0 && indx < u * u && c >= 0 && c < 2); chroma[indx][c] = (f[0] * g[0] + f[1] * g[1] + f[2] * g[2] + f[3] * g[3]) / (f[0] + f[1] + f[2] + f[3]); @@ -3672,9 +3698,10 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c } } -// DCB demosaicing main routine (sharp version) +// DCB demosaicing main routine void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) { +BENCHFUN double currentProgress = 0.0; if(plistener) { @@ -3686,29 +3713,24 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) int hTiles = H / TILESIZE + (H % TILESIZE ? 1 : 0); int numTiles = wTiles * hTiles; int tilesDone = 0; + constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ... + #ifdef _OPENMP - int nthreads = omp_get_max_threads(); - float (**image)[4] = (float(**)[4]) calloc( nthreads, sizeof( void*) ); - float (**image2)[3] = (float(**)[3]) calloc( nthreads, sizeof( void*) ); - float (**image3)[3] = (float(**)[3]) calloc( nthreads, sizeof( void*) ); - float (**chroma)[2] = (float (**)[2]) calloc( nthreads, sizeof( void*) ); - - for(int i = 0; i < nthreads; i++) { - image[i] = (float(*)[4]) calloc( CACHESIZE * CACHESIZE, sizeof **image); - image2[i] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof **image2); - image3[i] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof **image3); - chroma[i] = (float (*)[2]) calloc( CACHESIZE * CACHESIZE, sizeof **chroma); - } - -#else - float (*image)[4] = (float(*)[4]) calloc( CACHESIZE * CACHESIZE, sizeof * image); - float (*image2)[3] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof * image2); - float (*image3)[3] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof * image3); - float (*chroma)[2] = (float (*)[2]) calloc( CACHESIZE * CACHESIZE, sizeof * chroma); + #pragma omp parallel #endif +{ + // assign working space + char *buffer0 = (char *) malloc(5 * sizeof(float) * CACHESIZE * CACHESIZE + sizeof(uint8_t) * CACHESIZE * CACHESIZE + 3 * cldf * 64 + 63); + // aligned to 64 byte boundary + char *data = (char*)( ( uintptr_t(buffer0) + uintptr_t(63)) / 64 * 64); + + float (*tile)[3] = (float(*)[3]) data; + float (*buffer)[2] = (float(*)[2]) ((char*)tile + sizeof(float) * CACHESIZE * CACHESIZE * 3 + cldf * 64); + float (*chrm)[2] = (float(*)[2]) (buffer); // No overlap in usage of buffer and chrm means we can reuse buffer + uint8_t *map = (uint8_t*) ((char*)buffer + sizeof(float) * CACHESIZE * CACHESIZE * 2 + cldf * 64); #ifdef _OPENMP - #pragma omp parallel for + #pragma omp for schedule(dynamic) nowait #endif for( int iTile = 0; iTile < numTiles; iTile++) { @@ -3717,19 +3739,8 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) int x0 = xTile * TILESIZE; int y0 = yTile * TILESIZE; -#ifdef _OPENMP - int tid = omp_get_thread_num(); - assert(tid < nthreads); - float (*tile)[4] = image[tid]; - float (*buffer)[3] = image2[tid]; - float (*buffer2)[3] = image3[tid]; - float (*chrm)[2] = chroma[tid]; -#else - float (*tile)[4] = image; - float (*buffer)[3] = image2; - float (*buffer2)[3] = image3; - float (*chrm)[2] = chroma; -#endif + memset(tile, 0, CACHESIZE * CACHESIZE * sizeof * tile); + memset(map, 0, CACHESIZE * CACHESIZE * sizeof * map); fill_raw( tile, x0, y0, rawData ); @@ -3737,7 +3748,44 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) fill_border(tile, 6, x0, y0); } + copy_to_buffer(buffer, tile); + dcb_hid(tile, x0, y0); + + for (int i = iterations; i > 0; i--) { + dcb_hid2(tile, x0, y0); + dcb_hid2(tile, x0, y0); + dcb_hid2(tile, x0, y0); + dcb_map(tile, map, x0, y0); + dcb_correction(tile, map, x0, y0); + } + + dcb_color(tile, x0, y0); + dcb_pp(tile, x0, y0); + dcb_map(tile, map, x0, y0); + dcb_correction2(tile, map, x0, y0); + dcb_map(tile, map, x0, y0); + dcb_correction(tile, map, x0, y0); + dcb_color(tile, x0, y0); + dcb_map(tile, map, x0, y0); + dcb_correction(tile, map, x0, y0); + dcb_map(tile, map, x0, y0); + dcb_correction(tile, map, x0, y0); + dcb_map(tile, map, x0, y0); + restore_from_buffer(tile, buffer); + + if (!dcb_enhance) + dcb_color(tile, x0, y0); + else + { + memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm); + dcb_refinement(tile, map, x0, y0); + dcb_color_full(tile, x0, y0, chrm); + } + + /* dcb_hid(tile, buffer, buffer2, x0, y0); + dcb_color(tile, x0, y0); + copy_to_buffer(buffer, tile); for (int i = iterations; i > 0; i--) { @@ -3761,13 +3809,13 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) dcb_correction(tile, x0, y0); dcb_map(tile, x0, y0); restore_from_buffer(tile, buffer); - dcb_color(tile, x0, y0); + dcb_color_full(tile, x0, y0, chrm); - if (dcb_enhance) { + if (dcb_enhance) { dcb_refinement(tile, x0, y0); dcb_color_full(tile, x0, y0, chrm); - } - + } +*/ for(int y = 0; y < TILESIZE && y0 + y < H; y++) { for (int j = 0; j < TILESIZE && x0 + j < W; j++) { red[y0 + y][x0 + j] = tile[(y + TILEBORDER) * CACHESIZE + TILEBORDER + j][0]; @@ -3792,21 +3840,8 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) #endif tilesDone++; } - -#ifdef _OPENMP - - for(int i = 0; i < nthreads; i++) { - free(image[i]); - free(image2[i]); - free(image3[i]); - free(chroma[i]); - } - -#endif - free(image); - free(image2); - free(image3); - free(chroma); + free(buffer0); +} if(plistener) { plistener->setProgress (1.0); diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h index 151edf959..9fafef8bb 100644 --- a/rtengine/rawimagesource.h +++ b/rtengine/rawimagesource.h @@ -236,19 +236,19 @@ protected: void border_interpolate(unsigned int border, float (*image)[4], unsigned int start = 0, unsigned int end = 0); void border_interpolate2(int winw, int winh, int lborders); void dcb_initTileLimits(int &colMin, int &rowMin, int &colMax, int &rowMax, int x0, int y0, int border); - void fill_raw( float (*cache )[4], int x0, int y0, float** rawData); - void fill_border( float (*cache )[4], int border, int x0, int y0); - void copy_to_buffer(float (*image2)[3], float (*image)[4]); - void dcb_hid(float (*image)[4], float (*bufferH)[3], float (*bufferV)[3], int x0, int y0); - void dcb_color(float (*image)[4], int x0, int y0); - void dcb_hid2(float (*image)[4], int x0, int y0); - void dcb_map(float (*image)[4], int x0, int y0); - void dcb_correction(float (*image)[4], int x0, int y0); - void dcb_pp(float (*image)[4], int x0, int y0); - void dcb_correction2(float (*image)[4], int x0, int y0); - void restore_from_buffer(float (*image)[4], float (*image2)[3]); - void dcb_refinement(float (*image)[4], int x0, int y0); - void dcb_color_full(float (*image)[4], int x0, int y0, float (*chroma)[2]); + void fill_raw( float (*cache )[3], int x0, int y0, float** rawData); + void fill_border( float (*cache )[3], int border, int x0, int y0); + void copy_to_buffer(float (*image2)[2], float (*image)[3]); + void dcb_hid(float (*image)[3], int x0, int y0); + void dcb_color(float (*image)[3], int x0, int y0); + void dcb_hid2(float (*image)[3], int x0, int y0); + void dcb_map(float (*image)[3], uint8_t *map, int x0, int y0); + void dcb_correction(float (*image)[3], uint8_t *map, int x0, int y0); + void dcb_pp(float (*image)[3], int x0, int y0); + void dcb_correction2(float (*image)[3], uint8_t *map, int x0, int y0); + void restore_from_buffer(float (*image)[3], float (*image2)[2]); + void dcb_refinement(float (*image)[3], uint8_t *map, int x0, int y0); + void dcb_color_full(float (*image)[3], int x0, int y0, float (*chroma)[2]); void cielab (const float (*rgb)[3], float* l, float* a, float *b, const int width, const int height, const int labWidth, const float xyz_cam[3][3]); void xtransborder_interpolate (int border); void xtrans_interpolate (const int passes, const bool useCieLab); From 5e5ca6eee4627be4fea173383c6d85c7780dd39b Mon Sep 17 00:00:00 2001 From: heckflosse Date: Tue, 28 Feb 2017 20:05:36 +0100 Subject: [PATCH 2/5] disabled timing code in dcb_demosaic --- rtengine/demosaic_algos.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index 0f0e4841f..1ad3a15df 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -38,7 +38,7 @@ #include "sleef.c" #include "opthelper.h" #include "median.h" -#define BENCHMARK +//#define BENCHMARK #include "StopWatch.h" #ifdef _OPENMP #include @@ -3772,7 +3772,7 @@ BENCHFUN dcb_correction(tile, map, x0, y0); dcb_map(tile, map, x0, y0); restore_from_buffer(tile, buffer); - + if (!dcb_enhance) dcb_color(tile, x0, y0); else From d72d931c9e2b090942deb48fc52c9d1021f6604d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Tue, 28 Feb 2017 20:57:19 +0100 Subject: [PATCH 3/5] Variadic template version of `rtengine::(min|max)()` This change allows for an arbitrary number of arguments to `min()` and `max()` by using recursion on variadic template functions. The disassembly of GCC 6.3 was carefully checked for regressions, but nothing was found other than the flipping of arguments (recursion is now `(((a,b),c),d)` and was `(d,(c,(a,b)))` before). I also unified the common type `_Tp` to to the even more common `T`. --- rtengine/rt_math.h | 68 ++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h index a7cbbf301..32db8e92f 100644 --- a/rtengine/rt_math.h +++ b/rtengine/rt_math.h @@ -27,71 +27,57 @@ constexpr float RT_PI_F_2 = RT_PI_2; constexpr float RT_INFINITY_F = std::numeric_limits::infinity(); constexpr float RT_NAN_F = std::numeric_limits::quiet_NaN(); -template -inline _Tp SQR (_Tp x) +template +inline T SQR (T x) { // return std::pow(x,2); Slower than: return x * x; } -template -inline const _Tp& min(const _Tp& a, const _Tp& b) +template +inline const T& min(const T& a, const T& b) { return std::min(a, b); } -template -inline const _Tp& max(const _Tp& a, const _Tp& b) +template +inline const T& min(const T& a, const T& b, const ARGS&... args) +{ + return min(min(a, b), args...); +} + +template +inline const T& max(const T& a, const T& b) { return std::max(a, b); } +template +inline const T& max(const T& a, const T& b, const ARGS&... args) +{ + return max(max(a, b), args...); +} -template -inline const _Tp& LIM(const _Tp& a, const _Tp& b, const _Tp& c) +template +inline const T& LIM(const T& a, const T& b, const T& c) { return std::max(b, std::min(a, c)); } -template -inline _Tp LIM01(const _Tp& a) +template +inline T LIM01(const T& a) { - return std::max(_Tp(0), std::min(a, _Tp(1))); + return std::max(T(0), std::min(a, T(1))); } -template -inline _Tp CLIP(const _Tp& a) +template +inline T CLIP(const T& a) { - return LIM(a, static_cast<_Tp>(0), static_cast<_Tp>(MAXVAL)); + return LIM(a, static_cast(0), static_cast(MAXVAL)); } - -template -inline const _Tp& min(const _Tp& a, const _Tp& b, const _Tp& c) -{ - return std::min(c, std::min(a, b)); -} - -template -inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c) -{ - return std::max(c, std::max(a, b)); -} - -template -inline const _Tp& min(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d) -{ - return std::min(d, std::min(c, std::min(a, b))); -} - -template -inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d) -{ - return std::max(d, std::max(c, std::max(a, b))); -} - -template -inline _Tp intp(_Tp a, _Tp b, _Tp c) +template +inline T intp(T a, T b, T c) { // calculate a * b + (1 - a) * c // following is valid: From e2b8ccd38b416736a7a2b6c46ce12543e729c8e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Tue, 28 Feb 2017 21:16:10 +0100 Subject: [PATCH 4/5] Whitespace correction (#3719) --- rtengine/demosaic_algos.cc | 82 +++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index 1ad3a15df..60abe8998 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -3329,8 +3329,8 @@ void RawImageSource::fill_border( float (*cache )[3], int border, int x0, int y0 // saves red and blue -// change buffer[3] -> buffer[2], possibly to buffer[1] if split -// into two loops, one for R and another for B, could also be smaller because +// change buffer[3] -> buffer[2], possibly to buffer[1] if split +// into two loops, one for R and another for B, could also be smaller because // there is no need for green pixels pass // this would decrease the amount of needed memory // from megapixels*2 records to megapixels*0.5 @@ -3369,8 +3369,8 @@ void RawImageSource::dcb_hid(float (*image)[3], int x0, int y0) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx - u - 1 >= 0 && indx + u + 1 < u * u && c >= 0 && c < 3); - image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]); - } + image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]); + } } // missing colours are interpolated @@ -3385,10 +3385,10 @@ void RawImageSource::dcb_color(float (*image)[3], int x0, int y0) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx >= 0 && indx < u * u && c >= 0 && c < 4); - -//Jacek comment: one multiplication less - image[indx][c] = image[indx][1] + - ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] + +//Jacek comment: one multiplication less + image[indx][c] = image[indx][1] + + ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] - (image[indx + u + 1][1] + image[indx + u - 1][1] + image[indx - u + 1][1] + image[indx - u - 1][1]) ) * 0.25f; /* original @@ -3396,23 +3396,23 @@ void RawImageSource::dcb_color(float (*image)[3], int x0, int y0) - image[indx + u + 1][1] - image[indx + u - 1][1] - image[indx - u + 1][1] - image[indx - u - 1][1] + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f; */ - } + } // red or blue in green pixels for (int row = rowMin; row < rowMax; row++) for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) { assert(indx >= 0 && indx < u * u && c >= 0 && c < 4); -//Jacek comment: two multiplications (in total) less +//Jacek comment: two multiplications (in total) less image[indx][c] = image[indx][1] + (image[indx + 1][c] + image[indx - 1][c] - (image[indx + 1][1] + image[indx - 1][1])) * 0.5f; image[indx][d] = image[indx][1] + (image[indx + u][d] + image[indx - u][d] - (image[indx + u][1] + image[indx - u][1])) * 0.5f; - - + + /* original image[indx][c] = (2.f * image[indx][1] - image[indx + 1][1] - image[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f; image[indx][d] = (2.f * image[indx][1] - image[indx + u][1] - image[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f; */ - } + } } // green correction @@ -3425,14 +3425,14 @@ void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0) for (int row = rowMin; row < rowMax; row++) { for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) { assert(indx - v >= 0 && indx + v < u * u); - -//Jacek comment: one multiplication less + +//Jacek comment: one multiplication less image[indx][1] = image[indx][c] + - (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1] - - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f; + (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1] + - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f; /* original - image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f + + image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f + image[indx][c] - ( image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c]) * 0.25f; */ } @@ -3446,7 +3446,7 @@ void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0) // saved in image[][3] // seems at least 2 persons implemented some code, as this one has different coding style, could be unified -// I don't know if *pix is faster than a loop working on image[] directly +// I don't know if *pix is faster than a loop working on image[] directly void RawImageSource::dcb_map(float (*image)[3], uint8_t *map, int x0, int y0) { const int u = 3 * CACHESIZE; @@ -3565,27 +3565,27 @@ void RawImageSource::dcb_correction2(float (*image)[3], uint8_t *map, int x0, in assert(indx >= 0 && indx < u * u); -// Jacek comment: works now, and has 3 float mults and 9 float adds - image[indx][1] = image[indx][c] + - ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c])) - + current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f; - - +// Jacek comment: works now, and has 3 float mults and 9 float adds + image[indx][1] = image[indx][c] + + ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c])) + + current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f; + + // 4 float mults and 9 float adds - // Jacek comment: not mathematically identical to original -/* image[indx][1] = 16.f * image[indx][c] + + // Jacek comment: not mathematically identical to original +/* image[indx][1] = 16.f * image[indx][c] + ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) - (image[indx + 2][c] + image[indx - 2][c])) + current * ((image[indx - u][1] + image[indx + u][1]) - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f; */ // 7 float mults and 10 float adds - // original code -/* + // original code +/* image[indx][1] = ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) * 0.5f + image[indx][c] - (image[indx + 2][c] + image[indx - 2][c]) * 0.5f) + current * ((image[indx - u][1] + image[indx + u][1]) * 0.5f + image[indx][c] - (image[indx + v][c] + image[indx - v][c]) * 0.5f)) * 0.0625f; */ - } + } } } @@ -3618,7 +3618,7 @@ void RawImageSource::dcb_refinement(float (*image)[3], uint8_t *map, int x0, int h2 = 2.f * image[indx + 1][1] / (1.f + image[indx + 2][c] + currPix); g2 = h0 + h1 + h2; - + // new green value assert(indx >= 0 && indx < u * u); currPix *= (current * g1 + (16.f - current) * g2) / 48.f; @@ -3698,7 +3698,7 @@ void RawImageSource::dcb_color_full(float (*image)[3], int x0, int y0, float (*c } } -// DCB demosaicing main routine +// DCB demosaicing main routine void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance) { BENCHFUN @@ -3750,7 +3750,7 @@ BENCHFUN copy_to_buffer(buffer, tile); dcb_hid(tile, x0, y0); - + for (int i = iterations; i > 0; i--) { dcb_hid2(tile, x0, y0); dcb_hid2(tile, x0, y0); @@ -3773,19 +3773,19 @@ BENCHFUN dcb_map(tile, map, x0, y0); restore_from_buffer(tile, buffer); - if (!dcb_enhance) - dcb_color(tile, x0, y0); - else - { - memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm); + if (!dcb_enhance) + dcb_color(tile, x0, y0); + else + { + memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm); dcb_refinement(tile, map, x0, y0); dcb_color_full(tile, x0, y0, chrm); } /* dcb_hid(tile, buffer, buffer2, x0, y0); - dcb_color(tile, x0, y0); - + dcb_color(tile, x0, y0); + copy_to_buffer(buffer, tile); for (int i = iterations; i > 0; i--) { @@ -3811,7 +3811,7 @@ BENCHFUN restore_from_buffer(tile, buffer); dcb_color_full(tile, x0, y0, chrm); - if (dcb_enhance) { + if (dcb_enhance) { dcb_refinement(tile, x0, y0); dcb_color_full(tile, x0, y0, chrm); } From 6e7712831a5e146aa694165cc488a3b7230c669f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Wed, 1 Mar 2017 18:13:16 +0100 Subject: [PATCH 5/5] Break `min()` and `max()` parameter dependencies Also convert most functions in `rt_math.h` to `constexpr` by implementing `min()` and `max()` natively. `constexpr` indeed has a positive impact on the generated assembly. --- rtengine/rt_math.h | 53 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h index 32db8e92f..e1c01a94f 100644 --- a/rtengine/rt_math.h +++ b/rtengine/rt_math.h @@ -27,57 +27,68 @@ constexpr float RT_PI_F_2 = RT_PI_2; constexpr float RT_INFINITY_F = std::numeric_limits::infinity(); constexpr float RT_NAN_F = std::numeric_limits::quiet_NaN(); -template -inline T SQR (T x) +template +constexpr T SQR(T x) { -// return std::pow(x,2); Slower than: return x * x; } template -inline const T& min(const T& a, const T& b) +constexpr const T& min(const T& a) { - return std::min(a, b); + return a; +} + +template +constexpr const T& min(const T& a, const T& b) +{ + return a < b ? a : b; } template -inline const T& min(const T& a, const T& b, const ARGS&... args) +constexpr const T& min(const T& a, const T& b, const ARGS&... args) { - return min(min(a, b), args...); + return min(min(a, b), min(args...)); } template -inline const T& max(const T& a, const T& b) +constexpr const T& max(const T& a) { - return std::max(a, b); + return a; +} + +template +constexpr const T& max(const T& a, const T& b) +{ + return a < b ? b : a; } template -inline const T& max(const T& a, const T& b, const ARGS&... args) +constexpr const T& max(const T& a, const T& b, const ARGS&... args) { - return max(max(a, b), args...); + return max(max(a, b), max(args...)); } template -inline const T& LIM(const T& a, const T& b, const T& c) +constexpr const T& LIM(const T& a, const T& b, const T& c) { - return std::max(b, std::min(a, c)); + return max(b, min(a, c)); } template -inline T LIM01(const T& a) +constexpr T LIM01(const T& a) { - return std::max(T(0), std::min(a, T(1))); + return max(T(0), min(a, T(1))); } template -inline T CLIP(const T& a) +constexpr T CLIP(const T& a) { return LIM(a, static_cast(0), static_cast(MAXVAL)); } template -inline T intp(T a, T b, T c) +constexpr T intp(T a, T b, T c) { // calculate a * b + (1 - a) * c // following is valid: @@ -101,13 +112,13 @@ inline T norm2(const T& x, const T& y) template< typename T > inline T norminf(const T& x, const T& y) { - return std::max(std::abs(x), std::abs(y)); + return max(std::abs(x), std::abs(y)); } -inline int float2uint16range(float d) // clips input to [0;65535] and rounds +constexpr int float2uint16range(float d) { - d = CLIP(d); // clip to [0;65535] - return d + 0.5f; + // clips input to [0;65535] and rounds + return CLIP(d) + 0.5f; } constexpr std::uint8_t uint16ToUint8Rounded(std::uint16_t i)