From 66382743cd0ab251e361b9615712aaeb9f36f19c Mon Sep 17 00:00:00 2001
From: "U-coolermaster2\\cuniek" <cuniek@kft.umcs.lublin.pl>
Date: Mon, 27 Feb 2017 19:41:23 +0100
Subject: [PATCH 1/5] Improved DCB, less macroblicking on diagonals, much
 faster code

---
 rtengine/demosaic_algos.cc | 403 ++++++++++++++++++++-----------------
 rtengine/rawimagesource.h  |  26 +--
 2 files changed, 232 insertions(+), 197 deletions(-)

diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc
index e641ed777..0f0e4841f 100644
--- a/rtengine/demosaic_algos.cc
+++ b/rtengine/demosaic_algos.cc
@@ -1,3 +1,4 @@
+/*
 /*
  *  This file is part of RawTherapee.
  *
@@ -37,7 +38,7 @@
 #include "sleef.c"
 #include "opthelper.h"
 #include "median.h"
-//#define BENCHMARK
+#define BENCHMARK
 #include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
@@ -3251,7 +3252,7 @@ void RawImageSource::refinement_lassus(int PassCount)
  * the code is open source (BSD licence)
 */
 
-#define TILESIZE 256
+#define TILESIZE 192
 #define TILEBORDER 10
 #define CACHESIZE (TILESIZE+2*TILEBORDER)
 
@@ -3279,7 +3280,7 @@ inline void RawImageSource::dcb_initTileLimits(int &colMin, int &rowMin, int &co
     }
 }
 
-void RawImageSource::fill_raw( float (*cache )[4], int x0, int y0, float** rawData)
+void RawImageSource::fill_raw( float (*cache )[3], int x0, int y0, float** rawData)
 {
     int rowMin, colMin, rowMax, colMax;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 0);
@@ -3290,7 +3291,7 @@ void RawImageSource::fill_raw( float (*cache )[4], int x0, int y0, float** rawDa
         }
 }
 
-void RawImageSource::fill_border( float (*cache )[4], int border, int x0, int y0)
+void RawImageSource::fill_border( float (*cache )[3], int border, int x0, int y0)
 {
     unsigned row, col, y, x, f, c;
     float sum[8];
@@ -3325,93 +3326,55 @@ void RawImageSource::fill_border( float (*cache )[4], int border, int x0, int y0
         }
     }
 }
+
 // saves red and blue
-void RawImageSource::copy_to_buffer( float (*buffer)[3], float (*image)[4])
+
+// change buffer[3] -> buffer[2],  possibly to buffer[1] if split 
+// into two loops, one for R and another for B, could also be smaller because 
+// there is no need for green pixels pass
+// this would decrease the amount of needed memory
+// from megapixels*2 records to megapixels*0.5
+// also don't know if float is needed as data is 1-65536 integer (I believe!!)
+// comment from Ingo: float is needed because rawdata in rt is float
+void RawImageSource::copy_to_buffer( float (*buffer)[2], float (*image)[3])
 {
     for (int indx = 0; indx < CACHESIZE * CACHESIZE; indx++) {
         buffer[indx][0] = image[indx][0]; //R
-        buffer[indx][2] = image[indx][2]; //B
+        buffer[indx][1] = image[indx][2]; //B
     }
 }
 
 // restores red and blue
-void RawImageSource::restore_from_buffer(float (*image)[4], float (*buffer)[3])
+
+// other comments like in copy_to_buffer
+void RawImageSource::restore_from_buffer(float (*image)[3], float (*buffer)[2])
 {
     for (int indx = 0; indx < CACHESIZE * CACHESIZE; indx++) {
         image[indx][0] = buffer[indx][0]; //R
-        image[indx][2] = buffer[indx][2]; //B
+        image[indx][2] = buffer[indx][1]; //B
     }
 }
 
 // First pass green interpolation
-void RawImageSource::dcb_hid(float (*image)[4], float (*bufferH)[3], float (*bufferV)[3], int x0, int y0)
+
+// remove entirely: bufferH and bufferV
+void RawImageSource::dcb_hid(float (*image)[3], int x0, int y0)
 {
-    const int u = CACHESIZE, v = 2 * CACHESIZE;
-    int rowMin, colMin, rowMax, colMax;
+    const int u = CACHESIZE;
+    int rowMin, colMin, rowMax, colMax, c;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2);
 
-    // green pixels
-    for (int row = rowMin; row < rowMax; row++) {
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) {
-            assert(indx - u >= 0 && indx + u < u * u);
-            bufferH[indx][1] = (image[indx - 1][1] + image[indx + 1][1]) * 0.5f;
-            bufferV[indx][1] = (image[indx + u][1] + image[indx - u][1]) * 0.5f;
-        }
-    }
-
-    // red in blue pixel, blue in red pixel
+// simple green bilinear in R and B pixels
     for (int row = rowMin; row < rowMax; row++)
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
+        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx - u - 1 >= 0 && indx + u + 1 < u * u && c >= 0 && c < 3);
 
-            bufferH[indx][c] = ( 4.f * bufferH[indx][1]
-                                 - bufferH[indx + u + 1][1] - bufferH[indx + u - 1][1] - bufferH[indx - u + 1][1] - bufferH[indx - u - 1][1]
-                                 + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f;
-            bufferV[indx][c] = ( 4.f * bufferV[indx][1]
-                                 - bufferV[indx + u + 1][1] - bufferV[indx + u - 1][1] - bufferV[indx - u + 1][1] - bufferV[indx - u - 1][1]
-                                 + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f;
-        }
-
-    // red or blue in green pixels
-    for (int row = rowMin; row < rowMax; row++)
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) {
-            assert(indx - u >= 0 && indx + u < u * u && c >= 0 && c < 3 && d >= 0 && d < 3);
-            bufferH[indx][c] = (image[indx + 1][c] + image[indx - 1][c]) * 0.5f;
-            bufferH[indx][d] = (2.f * bufferH[indx][1] - bufferH[indx + u][1] - bufferH[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f;
-            bufferV[indx][c] = (2.f * bufferV[indx][1] - bufferV[indx + 1][1] - bufferV[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f;
-            bufferV[indx][d] = (image[indx + u][d] + image[indx - u][d]) * 0.5f;
-        }
-
-    // Decide green pixels
-    for (int row = rowMin; row < rowMax; row++)
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col), d = 2 - c; col < colMax; col += 2, indx += 2) {
-            float current =  max(image[indx + v][c], image[indx - v][c], image[indx - 2][c], image[indx + 2][c]) -
-                             min(image[indx + v][c], image[indx - v][c], image[indx - 2][c], image[indx + 2][c]) +
-                             max(image[indx + 1 + u][d], image[indx + 1 - u][d], image[indx - 1 + u][d], image[indx - 1 - u][d]) -
-                             min(image[indx + 1 + u][d], image[indx + 1 - u][d], image[indx - 1 + u][d], image[indx - 1 - u][d]);
-
-            float currentH = max(bufferH[indx + v][d], bufferH[indx - v][d], bufferH[indx - 2][d], bufferH[indx + 2][d]) -
-                             min(bufferH[indx + v][d], bufferH[indx - v][d], bufferH[indx - 2][d], bufferH[indx + 2][d]) +
-                             max(bufferH[indx + 1 + u][c], bufferH[indx + 1 - u][c], bufferH[indx - 1 + u][c], bufferH[indx - 1 - u][c]) -
-                             min(bufferH[indx + 1 + u][c], bufferH[indx + 1 - u][c], bufferH[indx - 1 + u][c], bufferH[indx - 1 - u][c]);
-
-            float currentV = max(bufferV[indx + v][d], bufferV[indx - v][d], bufferV[indx - 2][d], bufferV[indx + 2][d]) -
-                             min(bufferV[indx + v][d], bufferV[indx - v][d], bufferV[indx - 2][d], bufferV[indx + 2][d]) +
-                             max(bufferV[indx + 1 + u][c], bufferV[indx + 1 - u][c], bufferV[indx - 1 + u][c], bufferV[indx - 1 - u][c]) -
-                             min(bufferV[indx + 1 + u][c], bufferV[indx + 1 - u][c], bufferV[indx - 1 + u][c], bufferV[indx - 1 - u][c]);
-
-            assert(indx >= 0 && indx < u * u);
-
-            if (ABS(current - currentH) < ABS(current - currentV)) {
-                image[indx][1] = bufferH[indx][1];
-            } else {
-                image[indx][1] = bufferV[indx][1];
-            }
-        }
+			image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]);
+		}
 }
 
-// missing colors are interpolated
-void RawImageSource::dcb_color(float (*image)[4], int x0, int y0)
+// missing colours are interpolated
+void RawImageSource::dcb_color(float (*image)[3], int x0, int y0)
 {
     const int u = CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
@@ -3421,22 +3384,39 @@ void RawImageSource::dcb_color(float (*image)[4], int x0, int y0)
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx >= 0 && indx < u * u && c >= 0 && c < 4);
+
+			
+//Jacek comment: one multiplication less			
+			image[indx][c] = image[indx][1] + 
+							   ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c]
+                              - (image[indx + u + 1][1] + image[indx + u - 1][1] + image[indx - u + 1][1] + image[indx - u - 1][1]) ) * 0.25f;
+
+/* original
             image[indx][c] = ( 4.f * image[indx][1]
                                - image[indx + u + 1][1] - image[indx + u - 1][1] - image[indx - u + 1][1] - image[indx - u - 1][1]
                                + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f;
-        }
+*/
+	   }
 
     // red or blue in green pixels
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) {
             assert(indx >= 0 && indx < u * u && c >= 0 && c < 4);
+
+//Jacek comment: two multiplications (in total) less			
+            image[indx][c] = image[indx][1] + (image[indx + 1][c] + image[indx - 1][c] - (image[indx + 1][1] + image[indx - 1][1])) * 0.5f;
+            image[indx][d] = image[indx][1] + (image[indx + u][d] + image[indx - u][d] - (image[indx + u][1] + image[indx - u][1])) * 0.5f;
+			
+			
+/* original
             image[indx][c] = (2.f * image[indx][1] - image[indx + 1][1] - image[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f;
             image[indx][d] = (2.f * image[indx][1] - image[indx + u][1] - image[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f;
-        }
+*/
+		}
 }
 
 // green correction
-void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0)
+void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0)
 {
     const int u = CACHESIZE, v = 2 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
@@ -3445,8 +3425,16 @@ void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0)
     for (int row = rowMin; row < rowMax; row++) {
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx - v >= 0 && indx + v < u * u);
-            image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f +
+ 
+//Jacek comment: one multiplication less 
+            image[indx][1] = image[indx][c] +
+							 (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1] 
+						   - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f;
+
+/* original
+			image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f +
                              image[indx][c] - ( image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c]) * 0.25f;
+*/
         }
     }
 }
@@ -3456,9 +3444,12 @@ void RawImageSource::dcb_hid2(float (*image)[4], int x0, int y0)
 // 1 = vertical
 // 0 = horizontal
 // saved in image[][3]
-void RawImageSource::dcb_map(float (*image)[4], int x0, int y0)
+
+// seems at least 2 persons implemented some code, as this one has different coding style, could be unified
+// I don't know if *pix is faster than a loop working on image[] directly 
+void RawImageSource::dcb_map(float (*image)[3], uint8_t *map, int x0, int y0)
 {
-    const int u = 4 * CACHESIZE;
+    const int u = 3 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2);
 
@@ -3468,36 +3459,41 @@ void RawImageSource::dcb_map(float (*image)[4], int x0, int y0)
 
             assert(indx >= 0 && indx < u * u);
 
-            if ( *pix > ( pix[-4] + pix[+4] + pix[-u] + pix[+u]) / 4 ) {
-                image[indx][3] = ((min(pix[-4], pix[+4]) + pix[-4] + pix[+4] ) < (min(pix[-u], pix[+u]) + pix[-u] + pix[+u]));
+            // comparing 4 * a to (b+c+d+e) instead of a to (b+c+d+e)/4 is faster because divisions are slow
+            if ( 4 * (*pix) > ( (pix[-3] + pix[+3]) + (pix[-u] + pix[+u])) ) {
+                map[indx] = ((min(pix[-3], pix[+3]) + (pix[-3] + pix[+3]) ) < (min(pix[-u], pix[+u]) + (pix[-u] + pix[+u])));
             } else {
-                image[indx][3] = ((max(pix[-4], pix[+4]) + pix[-4] + pix[+4] ) > (max(pix[-u], pix[+u]) + pix[-u] + pix[+u]));
+                map[indx] = ((max(pix[-3], pix[+3]) + (pix[-3] + pix[+3]) ) > (max(pix[-u], pix[+u]) + (pix[-u] + pix[+u])));
             }
         }
     }
 }
 
 // interpolated green pixels are corrected using the map
-void RawImageSource::dcb_correction(float (*image)[4], int x0, int y0)
+void RawImageSource::dcb_correction(float (*image)[3], uint8_t *map, int x0, int y0)
 {
     const int u = CACHESIZE, v = 2 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 2);
 
     for (int row = rowMin; row < rowMax; row++) {
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) {
-            float current = 4.f * image[indx][3] +
-                            2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3]) +
-                            image[indx + v][3] + image[indx - v][3] + image[indx + 2][3] + image[indx - 2][3];
+        for (int indx = row * CACHESIZE + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1); indx < row * CACHESIZE + colMax; indx += 2) {
+//        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col; col < colMax; col += 2, indx += 2) {
+            float current = 4 * map[indx] +
+                            2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1]) +
+                            map[indx + v] + map[indx - v] + map[indx + 2] + map[indx - 2];
 
             assert(indx >= 0 && indx < u * u);
-            image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) * 0.5f + current * (image[indx - u][1] + image[indx + u][1]) * 0.5f ) * 0.0625f;
+            image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) + current * (image[indx - u][1] + image[indx + u][1]) ) * 0.03125f;
+//            image[indx][1] = ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1]) * 0.5f + current * (image[indx - u][1] + image[indx + u][1]) * 0.5f ) * 0.0625f;
         }
     }
 }
 
 // R and B smoothing using green contrast, all pixels except 2 pixel wide border
-void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0)
+
+// again code with *pix, is this kind of calculating faster in C, than this what was commented?
+void RawImageSource::dcb_pp(float (*image)[3], int x0, int y0)
 {
     const int u = CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
@@ -3505,10 +3501,10 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0)
 
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin, indx = row * CACHESIZE + col; col < colMax; col++, indx++) {
-            //int r1 = ( image[indx-1][0] + image[indx+1][0] + image[indx-u][0] + image[indx+u][0] + image[indx-u-1][0] + image[indx+u+1][0] + image[indx-u+1][0] + image[indx+u-1][0])/8;
-            //int g1 = ( image[indx-1][1] + image[indx+1][1] + image[indx-u][1] + image[indx+u][1] + image[indx-u-1][1] + image[indx+u+1][1] + image[indx-u+1][1] + image[indx+u-1][1])/8;
-            //int b1 = ( image[indx-1][2] + image[indx+1][2] + image[indx-u][2] + image[indx+u][2] + image[indx-u-1][2] + image[indx+u+1][2] + image[indx-u+1][2] + image[indx+u-1][2])/8;
-            float (*pix)[4] = image + (indx - u - 1);
+//            float r1 = image[indx-1][0] + image[indx+1][0] + image[indx-u][0] + image[indx+u][0] + image[indx-u-1][0] + image[indx+u+1][0] + image[indx-u+1][0] + image[indx+u-1][0];
+//            float g1 = image[indx-1][1] + image[indx+1][1] + image[indx-u][1] + image[indx+u][1] + image[indx-u-1][1] + image[indx+u+1][1] + image[indx-u+1][1] + image[indx+u-1][1];
+//            float b1 = image[indx-1][2] + image[indx+1][2] + image[indx-u][2] + image[indx+u][2] + image[indx-u-1][2] + image[indx+u+1][2] + image[indx-u+1][2] + image[indx+u-1][2];
+            float (*pix)[3] = image + (indx - u - 1);
             float r1 = (*pix)[0];
             float g1 = (*pix)[1];
             float b1 = (*pix)[2];
@@ -3543,8 +3539,8 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0)
             r1 *= 0.125f;
             g1 *= 0.125f;
             b1 *= 0.125f;
-            r1 = r1 + ( image[indx][1] - g1 );
-            b1 = b1 + ( image[indx][1] - g1 );
+            r1 += ( image[indx][1] - g1 );
+            b1 += ( image[indx][1] - g1 );
 
             assert(indx >= 0 && indx < u * u);
             image[indx][0] = r1;
@@ -3554,70 +3550,90 @@ void RawImageSource::dcb_pp(float (*image)[4], int x0, int y0)
 
 // interpolated green pixels are corrected using the map
 // with correction
-void RawImageSource::dcb_correction2(float (*image)[4], int x0, int y0)
+void RawImageSource::dcb_correction2(float (*image)[3], uint8_t *map, int x0, int y0)
 {
     const int u = CACHESIZE, v = 2 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 4);
 
     for (int row = rowMin; row < rowMax; row++) {
-        for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
-            float current = 4.f * image[indx][3] +
-                            2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3]) +
-                            image[indx + v][3] + image[indx - v][3] + image[indx + 2][3] + image[indx - 2][3];
+        for (int indx = row * CACHESIZE + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1)); indx < row * CACHESIZE + colMax; indx += 2) {
+            // map values are uint8_t either 0 or 1. Adding them using integer instructions is perfectly valid and fast. Final result is converted to float then
+            float current = 4 * map[indx] +
+                            2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1]) +
+                            map[indx + v] + map[indx - v] + map[indx + 2] + map[indx - 2];
 
             assert(indx >= 0 && indx < u * u);
+
+// Jacek comment: works now, and has 3 float mults and 9 float adds		
+			image[indx][1] =  image[indx][c] +
+					((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c])) 
+							+ current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f;			
+			
+			
+            // 4 float mults and 9 float adds
+			// Jacek comment: not mathematically identical to original
+/*            image[indx][1] = 16.f * image[indx][c] + 
+                             ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1])
+                                                  - (image[indx + 2][c] + image[indx - 2][c]))
+                              + current * ((image[indx - u][1] + image[indx + u][1]) - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f;
+*/
+            // 7 float mults and 10 float adds
+			// original code
+/*           
             image[indx][1] = ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) * 0.5f
                                                   + image[indx][c] - (image[indx + 2][c] + image[indx - 2][c]) * 0.5f)
                               + current * ((image[indx - u][1] + image[indx + u][1]) * 0.5f + image[indx][c] - (image[indx + v][c] + image[indx - v][c]) * 0.5f)) * 0.0625f;
-        }
+*/
+		}
     }
 }
 
 // image refinement
-void RawImageSource::dcb_refinement(float (*image)[4], int x0, int y0)
+void RawImageSource::dcb_refinement(float (*image)[3], uint8_t *map, int x0, int y0)
 {
     const int u = CACHESIZE, v = 2 * CACHESIZE, w = 3 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
     dcb_initTileLimits(colMin, rowMin, colMax, rowMax, x0, y0, 4);
 
-    float f[5], g1, g2;
+    float f0, f1, f2, g1, h0, h1, h2, g2, current;
 
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
-            float current = 4.f * image[indx][3] +
-                            2.f * (image[indx + u][3] + image[indx - u][3] + image[indx + 1][3] + image[indx - 1][3])
-                            + image[indx + v][3] + image[indx - v][3] + image[indx - 2][3] + image[indx + 2][3];
 
-            f[0] = (float)(image[indx - u][1] + image[indx + u][1]) / (2.f + 2.f * image[indx][c]);
-            f[1] = 2.f * image[indx - u][1] / (2 + image[indx - v][c] + image[indx][c]);
-            f[2] = (float)(image[indx - u][1] + image[indx - w][1]) / (2.f + 2.f * image[indx - v][c]);
-            f[3] = 2.f * image[indx + u][1] / (2 + image[indx + v][c] + image[indx][c]);
-            f[4] = (float)(image[indx + u][1] + image[indx + w][1]) / (2.f + 2.f * image[indx + v][c]);
+            float current = 4 * map[indx] +
+                            2 * (map[indx + u] + map[indx - u] + map[indx + 1] + map[indx - 1])
+                            + map[indx + v] + map[indx - v] + map[indx - 2] + map[indx + 2];
 
-            g1 = (f[0] + f[1] + f[2] + f[3] + f[4] - max(f[1], f[2], f[3], f[4]) - min(f[1], f[2], f[3], f[4])) / 3.f;
+            float currPix = image[indx][c];
 
-            f[0] = (float)(image[indx - 1][1] + image[indx + 1][1]) / (2.f + 2.f * image[indx][c]);
-            f[1] = 2.f * image[indx - 1][1] / (2 + image[indx - 2][c] + image[indx][c]);
-            f[2] = (float)(image[indx - 1][1] + image[indx - 3][1]) / (2.f + 2.f * image[indx - 2][c]);
-            f[3] = 2.f * image[indx + 1][1] / (2 + image[indx + 2][c] + image[indx][c]);
-            f[4] = (float)(image[indx + 1][1] + image[indx + 3][1]) / (2.f + 2.f * image[indx + 2][c]);
+            f0 = (float)(image[indx - u][1] + image[indx + u][1]) / (1.f + 2.f * currPix);
+            f1 = 2.f * image[indx - u][1] / (1.f + image[indx - v][c] + currPix);
+            f2 = 2.f * image[indx + u][1] / (1.f + image[indx + v][c] + currPix);
 
-            g2 = (f[0] + f[1] + f[2] + f[3] + f[4] - max(f[1], f[2], f[3], f[4]) - min(f[1], f[2], f[3], f[4])) / 3.f;
+            g1 = f0 + f1 + f2;
 
+            h0 = (float)(image[indx - 1][1] + image[indx + 1][1]) / (1.f + 2.f * currPix);
+            h1 = 2.f * image[indx - 1][1] / (1.f + image[indx - 2][c] + currPix);
+            h2 = 2.f * image[indx + 1][1] / (1.f + image[indx + 2][c] + currPix);
+
+            g2 = h0 + h1 + h2;
+		
+            // new green value
             assert(indx >= 0 && indx < u * u);
-            image[indx][1] = (2.f + image[indx][c]) * (current * g1 + (16.f - current) * g2) * 0.0625f;
+            currPix *= (current * g1 + (16.f - current) * g2) / 48.f;
 
-            // get rid of the overshooted pixels
-            float min_f = min(image[indx + 1 + u][1], min(image[indx + 1 - u][1], min(image[indx - 1 + u][1], min(image[indx - 1 - u][1], min(image[indx - 1][1], min(image[indx + 1][1], min(image[indx - u][1], image[indx + u][1])))))));
-            float max_f = max(image[indx + 1 + u][1], max(image[indx + 1 - u][1], max(image[indx - 1 + u][1], max(image[indx - 1 - u][1], max(image[indx - 1][1], max(image[indx + 1][1], max(image[indx - u][1], image[indx + u][1])))))));
+            // get rid of the overshot pixels
+            float minVal = min(image[indx - 1][1], min(image[indx + 1][1], min(image[indx - u][1], image[indx + u][1])));
+            float maxVal = max(image[indx - 1][1], max(image[indx + 1][1], max(image[indx - u][1], image[indx + u][1])));
+
+            image[indx][1] =  LIM(currPix, minVal, maxVal);
 
-            image[indx][1] =  LIM(image[indx][1], min_f, max_f);
         }
 }
 
-// missing colors are interpolated using high quality algorithm by Luis Sanz Rodriguez
-void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*chroma)[2])
+// missing colours are interpolated using high quality algorithm by Luis Sanz Rodriguez
+void RawImageSource::dcb_color_full(float (*image)[3], int x0, int y0, float (*chroma)[2])
 {
     const int u = CACHESIZE, w = 3 * CACHESIZE;
     int rowMin, colMin, rowMax, colMax;
@@ -3637,10 +3653,15 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c
             f[1] = 1.f / (float)(1.f + fabs(chroma[indx - u + 1][c] - chroma[indx + u - 1][c]) + fabs(chroma[indx - u + 1][c] - chroma[indx - w + 3][c]) + fabs(chroma[indx + u - 1][c] - chroma[indx - w + 3][c]));
             f[2] = 1.f / (float)(1.f + fabs(chroma[indx + u - 1][c] - chroma[indx - u + 1][c]) + fabs(chroma[indx + u - 1][c] - chroma[indx + w + 3][c]) + fabs(chroma[indx - u + 1][c] - chroma[indx + w - 3][c]));
             f[3] = 1.f / (float)(1.f + fabs(chroma[indx + u + 1][c] - chroma[indx - u - 1][c]) + fabs(chroma[indx + u + 1][c] - chroma[indx + w - 3][c]) + fabs(chroma[indx - u - 1][c] - chroma[indx + w + 3][c]));
-            g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * chroma[indx - w - 1][c] - 0.075f * chroma[indx - u - 3][c];
-            g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * chroma[indx - w + 1][c] - 0.075f * chroma[indx - u + 3][c];
-            g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * chroma[indx + w - 1][c] - 0.075f * chroma[indx + u - 3][c];
-            g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * chroma[indx + w + 1][c] - 0.075f * chroma[indx + u + 3][c];
+            g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * (chroma[indx - w - 1][c] + chroma[indx - u - 3][c]);
+            g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * (chroma[indx - w + 1][c] + chroma[indx - u + 3][c]);
+            g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * (chroma[indx + w - 1][c] + chroma[indx + u - 3][c]);
+            g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * (chroma[indx + w + 1][c] + chroma[indx + u + 3][c]);
+
+//            g[0] = 1.325f * chroma[indx - u - 1][c] - 0.175f * chroma[indx - w - 3][c] - 0.075f * chroma[indx - w - 1][c] - 0.075f * chroma[indx - u - 3][c];
+//            g[1] = 1.325f * chroma[indx - u + 1][c] - 0.175f * chroma[indx - w + 3][c] - 0.075f * chroma[indx - w + 1][c] - 0.075f * chroma[indx - u + 3][c];
+//            g[2] = 1.325f * chroma[indx + u - 1][c] - 0.175f * chroma[indx + w - 3][c] - 0.075f * chroma[indx + w - 1][c] - 0.075f * chroma[indx + u - 3][c];
+//            g[3] = 1.325f * chroma[indx + u + 1][c] - 0.175f * chroma[indx + w + 3][c] - 0.075f * chroma[indx + w + 1][c] - 0.075f * chroma[indx + u + 3][c];
 
             assert(indx >= 0 && indx < u * u && c >= 0 && c < 2);
             chroma[indx][c] = (f[0] * g[0] + f[1] * g[1] + f[2] * g[2] + f[3] * g[3]) / (f[0] + f[1] + f[2] + f[3]);
@@ -3649,15 +3670,20 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1) / 2; col < colMax; col += 2, indx += 2)
             for(int d = 0; d <= 1; c = 1 - c, d++) {
-                f[0] = 1.f / (float)(1.f + fabs(chroma[indx - u][c] - chroma[indx + u][c]) + fabs(chroma[indx - u][c] - chroma[indx - w][c]) + fabs(chroma[indx + u][c] - chroma[indx - w][c]));
-                f[1] = 1.f / (float)(1.f + fabs(chroma[indx + 1][c] - chroma[indx - 1][c]) + fabs(chroma[indx + 1][c] - chroma[indx + 3][c]) + fabs(chroma[indx - 1][c] - chroma[indx + 3][c]));
-                f[2] = 1.f / (float)(1.f + fabs(chroma[indx - 1][c] - chroma[indx + 1][c]) + fabs(chroma[indx - 1][c] - chroma[indx - 3][c]) + fabs(chroma[indx + 1][c] - chroma[indx - 3][c]));
-                f[3] = 1.f / (float)(1.f + fabs(chroma[indx + u][c] - chroma[indx - u][c]) + fabs(chroma[indx + u][c] - chroma[indx + w][c]) + fabs(chroma[indx - u][c] - chroma[indx + w][c]));
+                f[0] = 1.f / (1.f + fabs(chroma[indx - u][c] - chroma[indx + u][c]) + fabs(chroma[indx - u][c] - chroma[indx - w][c]) + fabs(chroma[indx + u][c] - chroma[indx - w][c]));
+                f[1] = 1.f / (1.f + fabs(chroma[indx + 1][c] - chroma[indx - 1][c]) + fabs(chroma[indx + 1][c] - chroma[indx + 3][c]) + fabs(chroma[indx - 1][c] - chroma[indx + 3][c]));
+                f[2] = 1.f / (1.f + fabs(chroma[indx - 1][c] - chroma[indx + 1][c]) + fabs(chroma[indx - 1][c] - chroma[indx - 3][c]) + fabs(chroma[indx + 1][c] - chroma[indx - 3][c]));
+                f[3] = 1.f / (1.f + fabs(chroma[indx + u][c] - chroma[indx - u][c]) + fabs(chroma[indx + u][c] - chroma[indx + w][c]) + fabs(chroma[indx - u][c] - chroma[indx + w][c]));
 
-                g[0] = 0.875f * chroma[indx - u][c] + 0.125f * chroma[indx - w][c];
-                g[1] = 0.875f * chroma[indx + 1][c] + 0.125f * chroma[indx + 3][c];
-                g[2] = 0.875f * chroma[indx - 1][c] + 0.125f * chroma[indx - 3][c];
-                g[3] = 0.875f * chroma[indx + u][c] + 0.125f * chroma[indx + w][c];
+                g[0] = intp(0.875f, chroma[indx - u][c], chroma[indx - w][c]);
+                g[1] = intp(0.875f, chroma[indx + 1][c], chroma[indx + 3][c]);
+                g[2] = intp(0.875f, chroma[indx - 1][c], chroma[indx - 3][c]);
+                g[3] = intp(0.875f, chroma[indx + u][c], chroma[indx + w][c]);
+
+//                g[0] = 0.875f * chroma[indx - u][c] + 0.125f * chroma[indx - w][c];
+//                g[1] = 0.875f * chroma[indx + 1][c] + 0.125f * chroma[indx + 3][c];
+//                g[2] = 0.875f * chroma[indx - 1][c] + 0.125f * chroma[indx - 3][c];
+//                g[3] = 0.875f * chroma[indx + u][c] + 0.125f * chroma[indx + w][c];
 
                 assert(indx >= 0 && indx < u * u && c >= 0 && c < 2);
                 chroma[indx][c] = (f[0] * g[0] + f[1] * g[1] + f[2] * g[2] + f[3] * g[3]) / (f[0] + f[1] + f[2] + f[3]);
@@ -3672,9 +3698,10 @@ void RawImageSource::dcb_color_full(float (*image)[4], int x0, int y0, float (*c
         }
 }
 
-// DCB demosaicing main routine (sharp version)
+// DCB demosaicing main routine 
 void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
 {
+BENCHFUN
     double currentProgress = 0.0;
 
     if(plistener) {
@@ -3686,29 +3713,24 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
     int hTiles = H / TILESIZE + (H % TILESIZE ? 1 : 0);
     int numTiles = wTiles * hTiles;
     int tilesDone = 0;
+    constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
+
 #ifdef _OPENMP
-    int nthreads = omp_get_max_threads();
-    float (**image)[4]  =  (float(**)[4]) calloc( nthreads, sizeof( void*) );
-    float (**image2)[3] =   (float(**)[3]) calloc( nthreads, sizeof( void*) );
-    float (**image3)[3] =   (float(**)[3]) calloc( nthreads, sizeof( void*) );
-    float  (**chroma)[2] =  (float (**)[2]) calloc( nthreads, sizeof( void*) );
-
-    for(int i = 0; i < nthreads; i++) {
-        image[i] = (float(*)[4]) calloc( CACHESIZE * CACHESIZE, sizeof **image);
-        image2[i] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof **image2);
-        image3[i] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof **image3);
-        chroma[i] = (float (*)[2]) calloc( CACHESIZE * CACHESIZE, sizeof **chroma);
-    }
-
-#else
-    float (*image)[4]  = (float(*)[4]) calloc( CACHESIZE * CACHESIZE, sizeof * image);
-    float (*image2)[3] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof * image2);
-    float (*image3)[3] = (float(*)[3]) calloc( CACHESIZE * CACHESIZE, sizeof * image3);
-    float  (*chroma)[2] = (float (*)[2]) calloc( CACHESIZE * CACHESIZE, sizeof * chroma);
+    #pragma omp parallel
 #endif
+{
+    // assign working space
+    char *buffer0 = (char *) malloc(5 * sizeof(float) * CACHESIZE * CACHESIZE + sizeof(uint8_t) * CACHESIZE * CACHESIZE + 3 * cldf * 64 + 63);
+    // aligned to 64 byte boundary
+    char *data = (char*)( ( uintptr_t(buffer0) + uintptr_t(63)) / 64 * 64);
+
+    float (*tile)[3]   = (float(*)[3]) data;
+    float (*buffer)[2] = (float(*)[2]) ((char*)tile + sizeof(float) * CACHESIZE * CACHESIZE * 3 + cldf * 64);
+    float (*chrm)[2]   = (float(*)[2]) (buffer); // No overlap in usage of buffer and chrm means we can reuse buffer
+    uint8_t *map       = (uint8_t*) ((char*)buffer + sizeof(float) * CACHESIZE * CACHESIZE * 2 + cldf * 64);
 
 #ifdef _OPENMP
-    #pragma omp parallel for
+    #pragma omp for schedule(dynamic) nowait
 #endif
 
     for( int iTile = 0; iTile < numTiles; iTile++) {
@@ -3717,19 +3739,8 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
         int x0 = xTile * TILESIZE;
         int y0 = yTile * TILESIZE;
 
-#ifdef _OPENMP
-        int tid = omp_get_thread_num();
-        assert(tid < nthreads);
-        float (*tile)[4]   = image[tid];
-        float (*buffer)[3] = image2[tid];
-        float (*buffer2)[3] = image3[tid];
-        float  (*chrm)[2]   = chroma[tid];
-#else
-        float (*tile)[4]   = image;
-        float (*buffer)[3] = image2;
-        float (*buffer2)[3] = image3;
-        float  (*chrm)[2]   = chroma;
-#endif
+        memset(tile, 0, CACHESIZE * CACHESIZE * sizeof * tile);
+        memset(map, 0, CACHESIZE * CACHESIZE * sizeof * map);
 
         fill_raw( tile, x0, y0, rawData );
 
@@ -3737,7 +3748,44 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
             fill_border(tile, 6, x0, y0);
         }
 
+        copy_to_buffer(buffer, tile);
+        dcb_hid(tile, x0, y0);
+	  
+        for (int i = iterations; i > 0; i--) {
+            dcb_hid2(tile, x0, y0);
+            dcb_hid2(tile, x0, y0);
+            dcb_hid2(tile, x0, y0);
+            dcb_map(tile, map, x0, y0);
+            dcb_correction(tile, map, x0, y0);
+        }
+
+        dcb_color(tile, x0, y0);
+        dcb_pp(tile, x0, y0);
+        dcb_map(tile, map, x0, y0);
+        dcb_correction2(tile, map, x0, y0);
+        dcb_map(tile, map, x0, y0);
+        dcb_correction(tile, map, x0, y0);
+        dcb_color(tile, x0, y0);
+        dcb_map(tile, map, x0, y0);
+        dcb_correction(tile, map, x0, y0);
+        dcb_map(tile, map, x0, y0);
+        dcb_correction(tile, map, x0, y0);
+        dcb_map(tile, map, x0, y0);
+        restore_from_buffer(tile, buffer);
+        
+		if (!dcb_enhance) 
+			dcb_color(tile, x0, y0);
+		else 
+		{
+			memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm);
+            dcb_refinement(tile, map, x0, y0);
+            dcb_color_full(tile, x0, y0, chrm);
+        }
+
+ /*
         dcb_hid(tile, buffer, buffer2, x0, y0);
+		dcb_color(tile, x0, y0);
+		
         copy_to_buffer(buffer, tile);
 
         for (int i = iterations; i > 0; i--) {
@@ -3761,13 +3809,13 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
         dcb_correction(tile, x0, y0);
         dcb_map(tile, x0, y0);
         restore_from_buffer(tile, buffer);
-        dcb_color(tile, x0, y0);
+        dcb_color_full(tile, x0, y0, chrm);
 
-        if (dcb_enhance) {
+		if (dcb_enhance) {
             dcb_refinement(tile, x0, y0);
             dcb_color_full(tile, x0, y0, chrm);
-        }
-
+         }
+*/
         for(int y = 0; y < TILESIZE && y0 + y < H; y++) {
             for (int j = 0; j < TILESIZE && x0 + j < W; j++) {
                 red[y0 + y][x0 + j]   = tile[(y + TILEBORDER) * CACHESIZE + TILEBORDER + j][0];
@@ -3792,21 +3840,8 @@ void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
 #endif
         tilesDone++;
     }
-
-#ifdef _OPENMP
-
-    for(int i = 0; i < nthreads; i++) {
-        free(image[i]);
-        free(image2[i]);
-        free(image3[i]);
-        free(chroma[i]);
-    }
-
-#endif
-    free(image);
-    free(image2);
-    free(image3);
-    free(chroma);
+    free(buffer0);
+}
 
     if(plistener) {
         plistener->setProgress (1.0);
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 151edf959..9fafef8bb 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -236,19 +236,19 @@ protected:
     void border_interpolate(unsigned int border, float (*image)[4], unsigned int start = 0, unsigned int end = 0);
     void border_interpolate2(int winw, int winh, int lborders);
     void dcb_initTileLimits(int &colMin, int &rowMin, int &colMax, int &rowMax, int x0, int y0, int border);
-    void fill_raw( float (*cache )[4], int x0, int y0, float** rawData);
-    void fill_border( float (*cache )[4], int border, int x0, int y0);
-    void copy_to_buffer(float (*image2)[3], float (*image)[4]);
-    void dcb_hid(float (*image)[4], float (*bufferH)[3], float (*bufferV)[3], int x0, int y0);
-    void dcb_color(float (*image)[4], int x0, int y0);
-    void dcb_hid2(float (*image)[4], int x0, int y0);
-    void dcb_map(float (*image)[4], int x0, int y0);
-    void dcb_correction(float (*image)[4], int x0, int y0);
-    void dcb_pp(float (*image)[4], int x0, int y0);
-    void dcb_correction2(float (*image)[4], int x0, int y0);
-    void restore_from_buffer(float (*image)[4], float (*image2)[3]);
-    void dcb_refinement(float (*image)[4], int x0, int y0);
-    void dcb_color_full(float (*image)[4], int x0, int y0, float (*chroma)[2]);
+    void fill_raw( float (*cache )[3], int x0, int y0, float** rawData);
+    void fill_border( float (*cache )[3], int border, int x0, int y0);
+    void copy_to_buffer(float (*image2)[2], float (*image)[3]);
+    void dcb_hid(float (*image)[3], int x0, int y0);
+    void dcb_color(float (*image)[3], int x0, int y0);
+    void dcb_hid2(float (*image)[3], int x0, int y0);
+    void dcb_map(float (*image)[3], uint8_t *map, int x0, int y0);
+    void dcb_correction(float (*image)[3], uint8_t *map, int x0, int y0);
+    void dcb_pp(float (*image)[3], int x0, int y0);
+    void dcb_correction2(float (*image)[3], uint8_t *map, int x0, int y0);
+    void restore_from_buffer(float (*image)[3], float (*image2)[2]);
+    void dcb_refinement(float (*image)[3], uint8_t *map, int x0, int y0);
+    void dcb_color_full(float (*image)[3], int x0, int y0, float (*chroma)[2]);
     void cielab (const float (*rgb)[3], float* l, float* a, float *b, const int width, const int height, const int labWidth, const float xyz_cam[3][3]);
     void xtransborder_interpolate (int border);
     void xtrans_interpolate (const int passes, const bool useCieLab);

From 5e5ca6eee4627be4fea173383c6d85c7780dd39b Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 28 Feb 2017 20:05:36 +0100
Subject: [PATCH 2/5] disabled timing code in dcb_demosaic

---
 rtengine/demosaic_algos.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc
index 0f0e4841f..1ad3a15df 100644
--- a/rtengine/demosaic_algos.cc
+++ b/rtengine/demosaic_algos.cc
@@ -38,7 +38,7 @@
 #include "sleef.c"
 #include "opthelper.h"
 #include "median.h"
-#define BENCHMARK
+//#define BENCHMARK
 #include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
@@ -3772,7 +3772,7 @@ BENCHFUN
         dcb_correction(tile, map, x0, y0);
         dcb_map(tile, map, x0, y0);
         restore_from_buffer(tile, buffer);
-        
+
 		if (!dcb_enhance) 
 			dcb_color(tile, x0, y0);
 		else 

From d72d931c9e2b090942deb48fc52c9d1021f6604d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Tue, 28 Feb 2017 20:57:19 +0100
Subject: [PATCH 3/5] Variadic template version of `rtengine::(min|max)()`

This change allows for an arbitrary number of arguments to `min()` and
`max()` by using recursion on variadic template functions. The
disassembly of GCC 6.3 was carefully checked for regressions, but
nothing was found other than the flipping of arguments (recursion is
now `(((a,b),c),d)` and was `(d,(c,(a,b)))` before).

I also unified the common type `_Tp` to to the even more common `T`.
---
 rtengine/rt_math.h | 68 ++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 41 deletions(-)

diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index a7cbbf301..32db8e92f 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -27,71 +27,57 @@ constexpr float RT_PI_F_2 = RT_PI_2;
 constexpr float RT_INFINITY_F = std::numeric_limits<float>::infinity();
 constexpr float RT_NAN_F = std::numeric_limits<float>::quiet_NaN();
 
-template <typename _Tp>
-inline _Tp SQR (_Tp x)
+template <typename T>
+inline T SQR (T x)
 {
 //      return std::pow(x,2); Slower than:
     return x * x;
 }
 
-template<typename _Tp>
-inline const _Tp& min(const _Tp& a, const _Tp& b)
+template<typename T>
+inline const T& min(const T& a, const T& b)
 {
     return std::min(a, b);
 }
 
-template<typename _Tp>
-inline const _Tp& max(const _Tp& a, const _Tp& b)
+template<typename T, typename... ARGS>
+inline const T& min(const T& a, const T& b, const ARGS&... args)
+{
+    return min(min(a, b), args...);
+}
+
+template<typename T>
+inline const T& max(const T& a, const T& b)
 {
     return std::max(a, b);
 }
 
+template<typename T, typename... ARGS>
+inline const T& max(const T& a, const T& b, const ARGS&... args)
+{
+    return max(max(a, b), args...);
+}
 
-template<typename _Tp>
-inline const _Tp& LIM(const _Tp& a, const _Tp& b, const _Tp& c)
+template<typename T>
+inline const T& LIM(const T& a, const T& b, const T& c)
 {
     return std::max(b, std::min(a, c));
 }
 
-template<typename _Tp>
-inline _Tp LIM01(const _Tp& a)
+template<typename T>
+inline T LIM01(const T& a)
 {
-    return std::max(_Tp(0), std::min(a, _Tp(1)));
+    return std::max(T(0), std::min(a, T(1)));
 }
 
-template<typename _Tp>
-inline _Tp CLIP(const _Tp& a)
+template<typename T>
+inline T CLIP(const T& a)
 {
-    return LIM(a, static_cast<_Tp>(0), static_cast<_Tp>(MAXVAL));
+    return LIM(a, static_cast<T>(0), static_cast<T>(MAXVAL));
 }
 
-
-template<typename _Tp>
-inline const _Tp& min(const _Tp& a, const _Tp& b, const _Tp& c)
-{
-    return std::min(c, std::min(a, b));
-}
-
-template<typename _Tp>
-inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c)
-{
-    return std::max(c, std::max(a, b));
-}
-
-template<typename _Tp>
-inline const _Tp& min(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d)
-{
-    return std::min(d, std::min(c, std::min(a, b)));
-}
-
-template<typename _Tp>
-inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d)
-{
-    return std::max(d, std::max(c, std::max(a, b)));
-}
-
-template<typename _Tp>
-inline _Tp intp(_Tp a, _Tp b, _Tp c)
+template<typename T>
+inline T intp(T a, T b, T c)
 {
     // calculate a * b + (1 - a) * c
     // following is valid:

From e2b8ccd38b416736a7a2b6c46ce12543e729c8e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Tue, 28 Feb 2017 21:16:10 +0100
Subject: [PATCH 4/5] Whitespace correction (#3719)

---
 rtengine/demosaic_algos.cc | 82 +++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc
index 1ad3a15df..60abe8998 100644
--- a/rtengine/demosaic_algos.cc
+++ b/rtengine/demosaic_algos.cc
@@ -3329,8 +3329,8 @@ void RawImageSource::fill_border( float (*cache )[3], int border, int x0, int y0
 
 // saves red and blue
 
-// change buffer[3] -> buffer[2],  possibly to buffer[1] if split 
-// into two loops, one for R and another for B, could also be smaller because 
+// change buffer[3] -> buffer[2],  possibly to buffer[1] if split
+// into two loops, one for R and another for B, could also be smaller because
 // there is no need for green pixels pass
 // this would decrease the amount of needed memory
 // from megapixels*2 records to megapixels*0.5
@@ -3369,8 +3369,8 @@ void RawImageSource::dcb_hid(float (*image)[3], int x0, int y0)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx - u - 1 >= 0 && indx + u + 1 < u * u && c >= 0 && c < 3);
 
-			image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]);
-		}
+            image[indx][1] = 0.25*(image[indx-1][1]+image[indx+1][1]+image[indx-u][1]+image[indx+u][1]);
+        }
 }
 
 // missing colours are interpolated
@@ -3385,10 +3385,10 @@ void RawImageSource::dcb_color(float (*image)[3], int x0, int y0)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = 2 - FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx >= 0 && indx < u * u && c >= 0 && c < 4);
 
-			
-//Jacek comment: one multiplication less			
-			image[indx][c] = image[indx][1] + 
-							   ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c]
+
+//Jacek comment: one multiplication less
+            image[indx][c] = image[indx][1] +
+                               ( image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c]
                               - (image[indx + u + 1][1] + image[indx + u - 1][1] + image[indx - u + 1][1] + image[indx - u - 1][1]) ) * 0.25f;
 
 /* original
@@ -3396,23 +3396,23 @@ void RawImageSource::dcb_color(float (*image)[3], int x0, int y0)
                                - image[indx + u + 1][1] - image[indx + u - 1][1] - image[indx - u + 1][1] - image[indx - u - 1][1]
                                + image[indx + u + 1][c] + image[indx + u - 1][c] + image[indx - u + 1][c] + image[indx - u - 1][c] ) * 0.25f;
 */
-	   }
+       }
 
     // red or blue in green pixels
     for (int row = rowMin; row < rowMax; row++)
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin + 1) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col + 1), d = 2 - c; col < colMax; col += 2, indx += 2) {
             assert(indx >= 0 && indx < u * u && c >= 0 && c < 4);
 
-//Jacek comment: two multiplications (in total) less			
+//Jacek comment: two multiplications (in total) less
             image[indx][c] = image[indx][1] + (image[indx + 1][c] + image[indx - 1][c] - (image[indx + 1][1] + image[indx - 1][1])) * 0.5f;
             image[indx][d] = image[indx][1] + (image[indx + u][d] + image[indx - u][d] - (image[indx + u][1] + image[indx - u][1])) * 0.5f;
-			
-			
+
+
 /* original
             image[indx][c] = (2.f * image[indx][1] - image[indx + 1][1] - image[indx - 1][1] + image[indx + 1][c] + image[indx - 1][c]) * 0.5f;
             image[indx][d] = (2.f * image[indx][1] - image[indx + u][1] - image[indx - u][1] + image[indx + u][d] + image[indx - u][d]) * 0.5f;
 */
-		}
+        }
 }
 
 // green correction
@@ -3425,14 +3425,14 @@ void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0)
     for (int row = rowMin; row < rowMax; row++) {
         for (int col = colMin + (FC(y0 - TILEBORDER + row, x0 - TILEBORDER + colMin) & 1), indx = row * CACHESIZE + col, c = FC(y0 - TILEBORDER + row, x0 - TILEBORDER + col); col < colMax; col += 2, indx += 2) {
             assert(indx - v >= 0 && indx + v < u * u);
- 
-//Jacek comment: one multiplication less 
+
+//Jacek comment: one multiplication less
             image[indx][1] = image[indx][c] +
-							 (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1] 
-						   - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f;
+                             (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]
+                           - (image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c])) * 0.25f;
 
 /* original
-			image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f +
+            image[indx][1] = (image[indx + v][1] + image[indx - v][1] + image[indx - 2][1] + image[indx + 2][1]) * 0.25f +
                              image[indx][c] - ( image[indx + v][c] + image[indx - v][c] + image[indx - 2][c] + image[indx + 2][c]) * 0.25f;
 */
         }
@@ -3446,7 +3446,7 @@ void RawImageSource::dcb_hid2(float (*image)[3], int x0, int y0)
 // saved in image[][3]
 
 // seems at least 2 persons implemented some code, as this one has different coding style, could be unified
-// I don't know if *pix is faster than a loop working on image[] directly 
+// I don't know if *pix is faster than a loop working on image[] directly
 void RawImageSource::dcb_map(float (*image)[3], uint8_t *map, int x0, int y0)
 {
     const int u = 3 * CACHESIZE;
@@ -3565,27 +3565,27 @@ void RawImageSource::dcb_correction2(float (*image)[3], uint8_t *map, int x0, in
 
             assert(indx >= 0 && indx < u * u);
 
-// Jacek comment: works now, and has 3 float mults and 9 float adds		
-			image[indx][1] =  image[indx][c] +
-					((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c])) 
-							+ current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f;			
-			
-			
+// Jacek comment: works now, and has 3 float mults and 9 float adds
+            image[indx][1] =  image[indx][c] +
+                    ((16.f - current) * (image[indx - 1][1] + image[indx + 1][1] - (image[indx + 2][c] + image[indx - 2][c]))
+                            + current * (image[indx - u][1] + image[indx + u][1] - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f;
+
+
             // 4 float mults and 9 float adds
-			// Jacek comment: not mathematically identical to original
-/*            image[indx][1] = 16.f * image[indx][c] + 
+            // Jacek comment: not mathematically identical to original
+/*            image[indx][1] = 16.f * image[indx][c] +
                              ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1])
                                                   - (image[indx + 2][c] + image[indx - 2][c]))
                               + current * ((image[indx - u][1] + image[indx + u][1]) - (image[indx + v][c] + image[indx - v][c]))) * 0.03125f;
 */
             // 7 float mults and 10 float adds
-			// original code
-/*           
+            // original code
+/*
             image[indx][1] = ((16.f - current) * ((image[indx - 1][1] + image[indx + 1][1]) * 0.5f
                                                   + image[indx][c] - (image[indx + 2][c] + image[indx - 2][c]) * 0.5f)
                               + current * ((image[indx - u][1] + image[indx + u][1]) * 0.5f + image[indx][c] - (image[indx + v][c] + image[indx - v][c]) * 0.5f)) * 0.0625f;
 */
-		}
+        }
     }
 }
 
@@ -3618,7 +3618,7 @@ void RawImageSource::dcb_refinement(float (*image)[3], uint8_t *map, int x0, int
             h2 = 2.f * image[indx + 1][1] / (1.f + image[indx + 2][c] + currPix);
 
             g2 = h0 + h1 + h2;
-		
+
             // new green value
             assert(indx >= 0 && indx < u * u);
             currPix *= (current * g1 + (16.f - current) * g2) / 48.f;
@@ -3698,7 +3698,7 @@ void RawImageSource::dcb_color_full(float (*image)[3], int x0, int y0, float (*c
         }
 }
 
-// DCB demosaicing main routine 
+// DCB demosaicing main routine
 void RawImageSource::dcb_demosaic(int iterations, bool dcb_enhance)
 {
 BENCHFUN
@@ -3750,7 +3750,7 @@ BENCHFUN
 
         copy_to_buffer(buffer, tile);
         dcb_hid(tile, x0, y0);
-	  
+
         for (int i = iterations; i > 0; i--) {
             dcb_hid2(tile, x0, y0);
             dcb_hid2(tile, x0, y0);
@@ -3773,19 +3773,19 @@ BENCHFUN
         dcb_map(tile, map, x0, y0);
         restore_from_buffer(tile, buffer);
 
-		if (!dcb_enhance) 
-			dcb_color(tile, x0, y0);
-		else 
-		{
-			memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm);
+        if (!dcb_enhance)
+            dcb_color(tile, x0, y0);
+        else
+        {
+            memset(chrm, 0, CACHESIZE * CACHESIZE * sizeof * chrm);
             dcb_refinement(tile, map, x0, y0);
             dcb_color_full(tile, x0, y0, chrm);
         }
 
  /*
         dcb_hid(tile, buffer, buffer2, x0, y0);
-		dcb_color(tile, x0, y0);
-		
+        dcb_color(tile, x0, y0);
+
         copy_to_buffer(buffer, tile);
 
         for (int i = iterations; i > 0; i--) {
@@ -3811,7 +3811,7 @@ BENCHFUN
         restore_from_buffer(tile, buffer);
         dcb_color_full(tile, x0, y0, chrm);
 
-		if (dcb_enhance) {
+        if (dcb_enhance) {
             dcb_refinement(tile, x0, y0);
             dcb_color_full(tile, x0, y0, chrm);
          }

From 6e7712831a5e146aa694165cc488a3b7230c669f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Wed, 1 Mar 2017 18:13:16 +0100
Subject: [PATCH 5/5] Break `min()` and `max()` parameter dependencies

Also convert most functions in `rt_math.h` to `constexpr` by
implementing `min()` and `max()` natively. `constexpr` indeed has a
positive impact on the generated assembly.
---
 rtengine/rt_math.h | 53 ++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index 32db8e92f..e1c01a94f 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -27,57 +27,68 @@ constexpr float RT_PI_F_2 = RT_PI_2;
 constexpr float RT_INFINITY_F = std::numeric_limits<float>::infinity();
 constexpr float RT_NAN_F = std::numeric_limits<float>::quiet_NaN();
 
-template <typename T>
-inline T SQR (T x)
+template<typename T>
+constexpr T SQR(T x)
 {
-//      return std::pow(x,2); Slower than:
     return x * x;
 }
 
 template<typename T>
-inline const T& min(const T& a, const T& b)
+constexpr const T& min(const T& a)
 {
-    return std::min(a, b);
+    return a;
+}
+
+template<typename T>
+constexpr const T& min(const T& a, const T& b)
+{
+    return a < b ? a : b;
 }
 
 template<typename T, typename... ARGS>
-inline const T& min(const T& a, const T& b, const ARGS&... args)
+constexpr const T& min(const T& a, const T& b, const ARGS&... args)
 {
-    return min(min(a, b), args...);
+    return min(min(a, b), min(args...));
 }
 
 template<typename T>
-inline const T& max(const T& a, const T& b)
+constexpr const T& max(const T& a)
 {
-    return std::max(a, b);
+    return a;
+}
+
+template<typename T>
+constexpr const T& max(const T& a, const T& b)
+{
+    return a < b ? b : a;
 }
 
 template<typename T, typename... ARGS>
-inline const T& max(const T& a, const T& b, const ARGS&... args)
+constexpr const T& max(const T& a, const T& b, const ARGS&... args)
 {
-    return max(max(a, b), args...);
+    return max(max(a, b), max(args...));
 }
 
 template<typename T>
-inline const T& LIM(const T& a, const T& b, const T& c)
+constexpr const T& LIM(const T& a, const T& b, const T& c)
 {
-    return std::max(b, std::min(a, c));
+    return max(b, min(a, c));
 }
 
 template<typename T>
-inline T LIM01(const T& a)
+constexpr T LIM01(const T& a)
 {
-    return std::max(T(0), std::min(a, T(1)));
+    return max(T(0), min(a, T(1)));
 }
 
 template<typename T>
-inline T CLIP(const T& a)
+constexpr T CLIP(const T& a)
 {
     return LIM(a, static_cast<T>(0), static_cast<T>(MAXVAL));
 }
 
 template<typename T>
-inline T intp(T a, T b, T c)
+constexpr T intp(T a, T b, T c)
 {
     // calculate a * b + (1 - a) * c
     // following is valid:
@@ -101,13 +112,13 @@ inline T norm2(const T& x, const T& y)
 template< typename T >
 inline T norminf(const T& x, const T& y)
 {
-    return std::max(std::abs(x), std::abs(y));
+    return max(std::abs(x), std::abs(y));
 }
 
-inline int float2uint16range(float d) // clips input to [0;65535] and rounds
+constexpr int float2uint16range(float d)
 {
-    d = CLIP(d); // clip to [0;65535]
-    return d + 0.5f;
+    // clips input to [0;65535] and rounds
+    return CLIP(d) + 0.5f;
 }
 
 constexpr std::uint8_t uint16ToUint8Rounded(std::uint16_t i)