From 72ee9918587bb69efc3de935768d21ab61d5a498 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Dec 2018 00:39:41 +0100
Subject: [PATCH 1/5] Improve detection of flat regions for calculation of dual
 demosaic contrast threshold

---
 rtengine/rt_algo.cc | 159 +++++++++++++++++++++++++++++++-------------
 1 file changed, 113 insertions(+), 46 deletions(-)

diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index af2083d8e..bd45819d9 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -51,6 +51,49 @@ vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) {
     return onev / (onev + xexpf(c16v - c16v * valv / thresholdv));
 }
 #endif
+
+float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) {
+
+#ifdef __SSE2__
+    vfloat avgv = ZEROV;
+    for (size_t y = tileY; y < tileY + tilesize; ++y) {
+        for (size_t x = tileX; x < tileX + tilesize; x += 4) {
+            avgv += LVFU(data[y][x]);
+        }
+    }
+    const float avg = vhadd(avgv);
+#else
+    float avg = 0.f;
+    for (size_t y = tileY; y < tileY + tilesize; ++y) {
+        for (size_t x = tileX; x < tileX + tilesize; ++x) {
+            avg += data[y][x];
+        }
+    }
+#endif
+    return avg / rtengine::SQR(tilesize);
+}
+
+float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
+
+#ifdef __SSE2__
+    vfloat varv = ZEROV;
+    const vfloat avgv = F2V(avg);
+    for (size_t y = tileY; y < tileY + tilesize; ++y) {
+        for (size_t x = tileX; x < tileX + tilesize; x +=4) {
+            varv += SQRV(LVFU(data[y][x]) - avgv);
+        }
+    }
+    const float var = vhadd(varv);
+#else
+    float var = 0.f;
+    for (size_t y = tileY; y < tileY + tilesize; ++y) {
+        for (size_t x = tileX;; x < tileX + tilesize; ++x) {
+            var += rtengine::SQR(data[y][x] - avg);
+        }
+    }
+#endif
+    return var / (rtengine::SQR(tilesize) * avg);
+}
 }
 
 namespace rtengine
@@ -202,6 +245,8 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
         }
     } else {
         if (autoContrast) {
+            constexpr float minLuminance = 2000.f;
+            constexpr float maxLuminance = 20000.f;
             for (int pass = 0; pass < 2; ++pass) {
                 const int tilesize = 80 / (pass + 1);
                 const int skip = pass == 0 ? tilesize : tilesize / 4;
@@ -216,47 +261,14 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                     const int tileY = i * skip;
                     for (int j = 0; j < numTilesW; ++j) {
                         const int tileX = j * skip;
-#ifdef __SSE2__
-                        vfloat avgv = ZEROV;
-                        for (int y = tileY; y < tileY + tilesize; ++y) {
-                            for (int x = tileX; x < tileX + tilesize; x += 4) {
-                                avgv += LVFU(luminance[y][x]);
-                            }
-                        }
-                        float avg = vhadd(avgv);
-#else
-                        float avg = 0.f;
-                        for (int y = tileY; y < tileY + tilesize; ++y) {
-                            for (int x = tileX; x < tileX + tilesize; ++x) {
-                                avg += luminance[y][x];
-                            }
-                        }
-#endif
-                        avg /= SQR(tilesize);
-                        if (avg < 2000.f || avg > 20000.f) {
+                        const float avg = tileAverage(luminance, tileY, tileX, tilesize);
+                        if (avg < minLuminance || avg > maxLuminance) {
                             // too dark or too bright => skip the tile
                             variances[i][j] = RT_INFINITY_F;
                             continue;
+                        } else {
+                            variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
                         }
-#ifdef __SSE2__
-                        vfloat varv = ZEROV;
-                        avgv = F2V(avg);
-                        for (int y = tileY; y < tileY + tilesize; ++y) {
-                            for (int x = tileX; x < tileX + tilesize; x +=4) {
-                                varv += SQRV(LVFU(luminance[y][x]) - avgv);
-                            }
-                        }
-                        float var = vhadd(varv);
-#else
-                        float var = 0.f;
-                        for (int y = tileY; y < tileY + tilesize; ++y) {
-                            for (int x = tileX; x < tileX + tilesize; ++x) {
-                                var += SQR(luminance[y][x] - avg);
-                            }
-                        }
-#endif
-                        var /= (SQR(tilesize) * avg);
-                        variances[i][j] = var;
                     }
                 }
 
@@ -276,17 +288,72 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                 const int minX = skip * minJ;
 
                 if (minvar <= 1.f || pass == 1) {
-                    // a variance <= 1 means we already found a flat region and can skip second pass
-                    // in second pass we allow a variance of 2
-                    JaggedArray<float> Lum(tilesize, tilesize);
-                    JaggedArray<float> Blend(tilesize, tilesize);
-                    for (int i = 0; i < tilesize; ++i) {
-                        for (int j = 0; j < tilesize; ++j) {
-                            Lum[i][j] = luminance[i + minY][j + minX];
+                    if (pass == 0) {
+                        // a variance <= 1 means we already found a flat region and can skip second pass
+                        JaggedArray<float> Lum(tilesize, tilesize);
+                        JaggedArray<float> Blend(tilesize, tilesize);
+                        for (int i = 0; i < tilesize; ++i) {
+                            for (int j = 0; j < tilesize; ++j) {
+                                Lum[i][j] = luminance[i + minY][j + minX];
+                            }
+                        }
+                        contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f;
+                        break;
+                    } else {
+                        // in second pass we allow a variance of 4
+                        // we additionally scan the tiles +-skip pixels around the best tile from pass 2
+                        // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate
+                        // fortunately the scan is quite fast, so we use only one core and don't parallelize
+                        const int topLeftYStart = std::max(minY - skip, 0);
+                        const int topLeftXStart = std::max(minX - skip, 0);
+                        const int topLeftYEnd = std::min(minY + skip, H - tilesize);
+                        const int topLeftXEnd = std::min(minX + skip, W - tilesize);
+                        const int numTilesH = topLeftYEnd - topLeftYStart + 1;
+                        const int numTilesW = topLeftXEnd - topLeftXStart + 1;
+
+                        std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
+                        for (int i = 0; i < numTilesH; ++i) {
+                            const int tileY = topLeftYStart + i;
+                            for (int j = 0; j < numTilesW; ++j) {
+                                const int tileX = topLeftXStart + j;
+                                const float avg = tileAverage(luminance, tileY, tileX, tilesize);
+
+                                if (avg < minLuminance || avg > maxLuminance) {
+                                    // too dark or too bright => skip the tile
+                                    variances[i][j] = RT_INFINITY_F;
+                                    continue;
+                                } else {
+                                    variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
+                                }
+                            }
+                        }
+
+                        float minvar = RT_INFINITY_F;
+                        int minI = 0, minJ = 0;
+                        for (int i = 0; i < numTilesH; ++i) {
+                            for (int j = 0; j < numTilesW; ++j) {
+                                if (variances[i][j] < minvar) {
+                                    minvar = variances[i][j];
+                                    minI = i;
+                                    minJ = j;
+                                }
+                            }
+                        }
+                        if (minvar <= 4.f) {
+                            JaggedArray<float> Lum(tilesize, tilesize);
+                            JaggedArray<float> Blend(tilesize, tilesize);
+                            const int minY = topLeftYStart + minI;
+                            const int minX = topLeftXStart + minJ;
+                            for (int i = 0; i < tilesize; ++i) {
+                                for (int j = 0; j < tilesize; ++j) {
+                                    Lum[i][j] = luminance[i + minY][j + minX];
+                                }
+                            }
+                            contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f;
+                        } else {
+                            contrastThreshold = 0.f;
                         }
                     }
-                    contrastThreshold = (pass == 0 || minvar <= 4.f) ? calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f : 0.f;
-                    break;
                 }
             }
         }

From d9d8005706cb400cb7afdde3f89eb291ead130e7 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Dec 2018 16:22:24 +0100
Subject: [PATCH 2/5] Some changes as suggested by @Floessie, #5070

---
 rtengine/rt_algo.cc | 472 +++++++++++++++++++++-----------------------
 rtengine/rt_algo.h  |   1 -
 2 files changed, 225 insertions(+), 248 deletions(-)

diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index bd45819d9..a2ca3cac4 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <vector>
 #ifdef _OPENMP
@@ -31,7 +32,6 @@
 #include "rt_algo.h"
 #include "rt_math.h"
 #include "sleef.c"
-#include "jaggedarray.h"
 
 namespace {
 float calcBlendFactor(float val, float threshold) {
@@ -54,46 +54,110 @@ vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) {
 
 float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) {
 
+    float avg = 0.f;
 #ifdef __SSE2__
     vfloat avgv = ZEROV;
-    for (size_t y = tileY; y < tileY + tilesize; ++y) {
-        for (size_t x = tileX; x < tileX + tilesize; x += 4) {
+#endif
+    for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
+        std::size_t x = tileX;
+#ifdef __SSE2__
+        for (; x < tileX + tilesize - 3; x += 4) {
             avgv += LVFU(data[y][x]);
         }
-    }
-    const float avg = vhadd(avgv);
-#else
-    float avg = 0.f;
-    for (size_t y = tileY; y < tileY + tilesize; ++y) {
-        for (size_t x = tileX; x < tileX + tilesize; ++x) {
+#endif
+        for (; x < tileX + tilesize; ++x) {
             avg += data[y][x];
         }
     }
+#ifdef __SSE2__
+    avg += vhadd(avgv);
 #endif
     return avg / rtengine::SQR(tilesize);
 }
 
 float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
 
+    float var = 0.f;
 #ifdef __SSE2__
     vfloat varv = ZEROV;
     const vfloat avgv = F2V(avg);
-    for (size_t y = tileY; y < tileY + tilesize; ++y) {
-        for (size_t x = tileX; x < tileX + tilesize; x +=4) {
+#endif
+    for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
+        std::size_t x = tileX;
+#ifdef __SSE2__
+        for (; x < tileX + tilesize - 3; x += 4) {
             varv += SQRV(LVFU(data[y][x]) - avgv);
         }
-    }
-    const float var = vhadd(varv);
-#else
-    float var = 0.f;
-    for (size_t y = tileY; y < tileY + tilesize; ++y) {
-        for (size_t x = tileX;; x < tileX + tilesize; ++x) {
+#endif
+        for (; x < tileX + tilesize; ++x) {
             var += rtengine::SQR(data[y][x] - avg);
         }
     }
+#ifdef __SSE2__
+    var += vhadd(varv);
 #endif
     return var / (rtengine::SQR(tilesize) * avg);
 }
+
+float calcContrastThreshold(float** luminance, int tileY, int tileX, int tilesize) {
+
+    constexpr float scale = 0.0625f / 327.68f;
+    std::vector<std::vector<float>> blend(tilesize - 4, std::vector<float>(tilesize - 4));
+
+#ifdef __SSE2__
+    const vfloat scalev = F2V(scale);
+#endif
+
+    for(int j = tileY + 2; j < tileY + tilesize - 2; ++j) {
+        int i = tileX + 2;
+#ifdef __SSE2__
+        for(; i < tileX + tilesize - 5; i += 4) {
+            vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
+                                      SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
+            STVFU(blend[j - tileY - 2][i - tileX - 2], contrastv);
+        }
+#endif
+        for(; i < tileX + tilesize - 2; ++i) {
+
+            float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + 
+                                   rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
+
+            blend[j - tileY - 2][i - tileX - 2] = contrast;
+        }
+    }
+
+    const float limit = rtengine::SQR(tilesize - 4) / 100.f;
+
+    int c;
+    for (c = 1; c < 100; ++c) {
+        const float contrastThreshold = c / 100.f;
+        float sum = 0.f;
+#ifdef __SSE2__
+        const vfloat contrastThresholdv = F2V(contrastThreshold);
+        vfloat sumv = ZEROV;
+#endif
+
+        for(int j = 0; j < tilesize - 4; ++j) {
+            int i = 0;
+#ifdef __SSE2__
+            for(; i < tilesize - 7; i += 4) {
+                sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv);
+            }
+#endif
+            for(; i < tilesize - 4; ++i) {
+                sum += calcBlendFactor(blend[j][i], contrastThreshold);
+            }
+        }
+#ifdef __SSE2__
+        sum += vhadd(sumv);
+#endif
+        if (sum <= limit) {
+            break;
+        }
+    }
+
+    return c / 100.f;
+}
 }
 
 namespace rtengine
@@ -237,254 +301,168 @@ void findMinMaxPercentile(const float* data, size_t size, float minPrct, float&
 
 void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) {
 
-    if(contrastThreshold == 0.f && !autoContrast) {
+    if (autoContrast) {
+        constexpr float minLuminance = 2000.f;
+        constexpr float maxLuminance = 20000.f;
+        for (int pass = 0; pass < 2; ++pass) {
+            const int tilesize = 80 / (pass + 1);
+            const int skip = pass == 0 ? tilesize : tilesize / 4;
+            const int numTilesW = W / skip - 3 * pass;
+            const int numTilesH = H / skip - 3 * pass;
+            std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
+
+#ifdef _OPENMP
+            #pragma omp parallel for schedule(dynamic)
+#endif
+            for (int i = 0; i < numTilesH; ++i) {
+                const int tileY = i * skip;
+                for (int j = 0; j < numTilesW; ++j) {
+                    const int tileX = j * skip;
+                    const float avg = tileAverage(luminance, tileY, tileX, tilesize);
+                    if (avg < minLuminance || avg > maxLuminance) {
+                        // too dark or too bright => skip the tile
+                        variances[i][j] = RT_INFINITY_F;
+                        continue;
+                    } else {
+                        variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
+                    }
+                }
+            }
+
+            float minvar = RT_INFINITY_F;
+            int minI = 0, minJ = 0;
+            for (int i = 0; i < numTilesH; ++i) {
+                for (int j = 0; j < numTilesW; ++j) {
+                    if (variances[i][j] < minvar) {
+                        minvar = variances[i][j];
+                        minI = i;
+                        minJ = j;
+                    }
+                }
+            }
+
+            const int minY = skip * minI;
+            const int minX = skip * minJ;
+
+            if (minvar <= 1.f || pass == 1) {
+                if (pass == 0) {
+                    // a variance <= 1 means we already found a flat region and can skip second pass
+                    contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize);
+                    break;
+                } else {
+                    // in second pass we allow a variance of 4
+                    // we additionally scan the tiles +-skip pixels around the best tile from pass 2
+                    // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate
+                    // fortunately the scan is quite fast, so we use only one core and don't parallelize
+                    const int topLeftYStart = std::max(minY - skip, 0);
+                    const int topLeftXStart = std::max(minX - skip, 0);
+                    const int topLeftYEnd = std::min(minY + skip, H - tilesize);
+                    const int topLeftXEnd = std::min(minX + skip, W - tilesize);
+                    const int numTilesH = topLeftYEnd - topLeftYStart + 1;
+                    const int numTilesW = topLeftXEnd - topLeftXStart + 1;
+
+                    std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
+                    for (int i = 0; i < numTilesH; ++i) {
+                        const int tileY = topLeftYStart + i;
+                        for (int j = 0; j < numTilesW; ++j) {
+                            const int tileX = topLeftXStart + j;
+                            const float avg = tileAverage(luminance, tileY, tileX, tilesize);
+
+                            if (avg < minLuminance || avg > maxLuminance) {
+                                // too dark or too bright => skip the tile
+                                variances[i][j] = RT_INFINITY_F;
+                                continue;
+                            } else {
+                                variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
+                            }
+                        }
+                    }
+
+                    float minvar = RT_INFINITY_F;
+                    int minI = 0, minJ = 0;
+                    for (int i = 0; i < numTilesH; ++i) {
+                        for (int j = 0; j < numTilesW; ++j) {
+                            if (variances[i][j] < minvar) {
+                                minvar = variances[i][j];
+                                minI = i;
+                                minJ = j;
+                            }
+                        }
+                    }
+                    contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
+                }
+            }
+        }
+    }
+
+    if(contrastThreshold == 0.f) {
         for(int j = 0; j < H; ++j) {
             for(int i = 0; i < W; ++i) {
                 blend[j][i] = amount;
             }
         }
     } else {
-        if (autoContrast) {
-            constexpr float minLuminance = 2000.f;
-            constexpr float maxLuminance = 20000.f;
-            for (int pass = 0; pass < 2; ++pass) {
-                const int tilesize = 80 / (pass + 1);
-                const int skip = pass == 0 ? tilesize : tilesize / 4;
-                const int numTilesW = W / skip - 3 * pass;
-                const int numTilesH = H / skip - 3 * pass;
-                std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
-
+        constexpr float scale = 0.0625f / 327.68f;
 #ifdef _OPENMP
-                #pragma omp parallel for schedule(dynamic)
+        #pragma omp parallel
+#endif
+        {
+#ifdef __SSE2__
+            const vfloat contrastThresholdv = F2V(contrastThreshold);
+            const vfloat scalev = F2V(scale);
+            const vfloat amountv = F2V(amount);
 #endif
-                for (int i = 0; i < numTilesH; ++i) {
-                    const int tileY = i * skip;
-                    for (int j = 0; j < numTilesW; ++j) {
-                        const int tileX = j * skip;
-                        const float avg = tileAverage(luminance, tileY, tileX, tilesize);
-                        if (avg < minLuminance || avg > maxLuminance) {
-                            // too dark or too bright => skip the tile
-                            variances[i][j] = RT_INFINITY_F;
-                            continue;
-                        } else {
-                            variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
-                        }
-                    }
-                }
-
-                float minvar = RT_INFINITY_F;
-                int minI = 0, minJ = 0;
-                for (int i = 0; i < numTilesH; ++i) {
-                    for (int j = 0; j < numTilesW; ++j) {
-                        if (variances[i][j] < minvar) {
-                            minvar = variances[i][j];
-                            minI = i;
-                            minJ = j;
-                        }
-                    }
-                }
-
-                const int minY = skip * minI;
-                const int minX = skip * minJ;
-
-                if (minvar <= 1.f || pass == 1) {
-                    if (pass == 0) {
-                        // a variance <= 1 means we already found a flat region and can skip second pass
-                        JaggedArray<float> Lum(tilesize, tilesize);
-                        JaggedArray<float> Blend(tilesize, tilesize);
-                        for (int i = 0; i < tilesize; ++i) {
-                            for (int j = 0; j < tilesize; ++j) {
-                                Lum[i][j] = luminance[i + minY][j + minX];
-                            }
-                        }
-                        contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f;
-                        break;
-                    } else {
-                        // in second pass we allow a variance of 4
-                        // we additionally scan the tiles +-skip pixels around the best tile from pass 2
-                        // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate
-                        // fortunately the scan is quite fast, so we use only one core and don't parallelize
-                        const int topLeftYStart = std::max(minY - skip, 0);
-                        const int topLeftXStart = std::max(minX - skip, 0);
-                        const int topLeftYEnd = std::min(minY + skip, H - tilesize);
-                        const int topLeftXEnd = std::min(minX + skip, W - tilesize);
-                        const int numTilesH = topLeftYEnd - topLeftYStart + 1;
-                        const int numTilesW = topLeftXEnd - topLeftXStart + 1;
-
-                        std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
-                        for (int i = 0; i < numTilesH; ++i) {
-                            const int tileY = topLeftYStart + i;
-                            for (int j = 0; j < numTilesW; ++j) {
-                                const int tileX = topLeftXStart + j;
-                                const float avg = tileAverage(luminance, tileY, tileX, tilesize);
-
-                                if (avg < minLuminance || avg > maxLuminance) {
-                                    // too dark or too bright => skip the tile
-                                    variances[i][j] = RT_INFINITY_F;
-                                    continue;
-                                } else {
-                                    variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
-                                }
-                            }
-                        }
-
-                        float minvar = RT_INFINITY_F;
-                        int minI = 0, minJ = 0;
-                        for (int i = 0; i < numTilesH; ++i) {
-                            for (int j = 0; j < numTilesW; ++j) {
-                                if (variances[i][j] < minvar) {
-                                    minvar = variances[i][j];
-                                    minI = i;
-                                    minJ = j;
-                                }
-                            }
-                        }
-                        if (minvar <= 4.f) {
-                            JaggedArray<float> Lum(tilesize, tilesize);
-                            JaggedArray<float> Blend(tilesize, tilesize);
-                            const int minY = topLeftYStart + minI;
-                            const int minX = topLeftXStart + minJ;
-                            for (int i = 0; i < tilesize; ++i) {
-                                for (int j = 0; j < tilesize; ++j) {
-                                    Lum[i][j] = luminance[i + minY][j + minX];
-                                }
-                            }
-                            contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f;
-                        } else {
-                            contrastThreshold = 0.f;
-                        }
-                    }
-                }
-            }
-        }
-
-        if(contrastThreshold == 0.f) {
-            for(int j = 0; j < H; ++j) {
-                for(int i = 0; i < W; ++i) {
-                    blend[j][i] = amount;
-                }
-            }
-        } else {
-            constexpr float scale = 0.0625f / 327.68f;
 #ifdef _OPENMP
-            #pragma omp parallel
+            #pragma omp for schedule(dynamic,16)
+#endif
+
+            for(int j = 2; j < H - 2; ++j) {
+                int i = 2;
+#ifdef __SSE2__
+                for(; i < W - 5; i += 4) {
+                    vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
+                                              SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
+
+                    STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv));
+                }
+#endif
+                for(; i < W - 2; ++i) {
+
+                    float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + 
+                                           rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
+
+                    blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold);
+                }
+            }
+
+#ifdef _OPENMP
+            #pragma omp single
 #endif
             {
-#ifdef __SSE2__
-                const vfloat contrastThresholdv = F2V(contrastThreshold);
-                const vfloat scalev = F2V(scale);
-                const vfloat amountv = F2V(amount);
-#endif
-#ifdef _OPENMP
-                #pragma omp for schedule(dynamic,16)
-#endif
-
-                for(int j = 2; j < H - 2; ++j) {
-                    int i = 2;
-#ifdef __SSE2__
-                    for(; i < W - 5; i += 4) {
-                        vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
-                                                  SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
-
-                        STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv));
-                    }
-#endif
-                    for(; i < W - 2; ++i) {
-
-                        float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + 
-                                               rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
-
-                        blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold);
+                // upper border
+                for(int j = 0; j < 2; ++j) {
+                    for(int i = 2; i < W - 2; ++i) {
+                        blend[j][i] = blend[2][i];
                     }
                 }
-
-#ifdef _OPENMP
-                #pragma omp single
-#endif
-                {
-                    // upper border
-                    for(int j = 0; j < 2; ++j) {
-                        for(int i = 2; i < W - 2; ++i) {
-                            blend[j][i] = blend[2][i];
-                        }
-                    }
-                    // lower border
-                    for(int j = H - 2; j < H; ++j) {
-                        for(int i = 2; i < W - 2; ++i) {
-                            blend[j][i] = blend[H-3][i];
-                        }
-                    }
-                    for(int j = 0; j < H; ++j) {
-                        // left border
-                        blend[j][0] = blend[j][1] = blend[j][2];
-                        // right border
-                        blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3];
+                // lower border
+                for(int j = H - 2; j < H; ++j) {
+                    for(int i = 2; i < W - 2; ++i) {
+                        blend[j][i] = blend[H-3][i];
                     }
                 }
-
-                // blur blend mask to smooth transitions
-                gaussianBlur(blend, blend, W, H, 2.0);
+                for(int j = 0; j < H; ++j) {
+                    // left border
+                    blend[j][0] = blend[j][1] = blend[j][2];
+                    // right border
+                    blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3];
+                }
             }
+
+            // blur blend mask to smooth transitions
+            gaussianBlur(blend, blend, W, H, 2.0);
         }
     }
 }
 
-int calcContrastThreshold(float** luminance, float **blend, int W, int H) {
-
-    constexpr float scale = 0.0625f / 327.68f;
-
-#ifdef __SSE2__
-    const vfloat scalev = F2V(scale);
-#endif
-
-    for(int j = 2; j < H - 2; ++j) {
-        int i = 2;
-#ifdef __SSE2__
-        for(; i < W - 5; i += 4) {
-            vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
-                                      SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
-            STVFU(blend[j -2 ][i - 2], contrastv);
-        }
-#endif
-        for(; i < W - 2; ++i) {
-
-            float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + 
-                                   rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
-
-            blend[j -2][i- 2] = contrast;
-        }
-    }
-
-    const float limit = (W - 4) * (H - 4) / 100.f;
-
-    int c;
-    for (c = 1; c < 100; ++c) {
-        const float contrastThreshold = c / 100.f;
-        float sum = 0.f;
-#ifdef __SSE2__
-        const vfloat contrastThresholdv = F2V(contrastThreshold);
-        vfloat sumv = ZEROV;
-#endif
-
-        for(int j = 0; j < H - 4; ++j) {
-            int i = 0;
-#ifdef __SSE2__
-            for(; i < W - 7; i += 4) {
-                sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv);
-            }
-#endif
-            for(; i < W - 4; ++i) {
-                sum += calcBlendFactor(blend[j][i], contrastThreshold);
-            }
-        }
-#ifdef __SSE2__
-        sum += vhadd(sumv);
-#endif
-        if (sum <= limit) {
-            break;
-        }
-    }
-
-    return c;
-}
 }
diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h
index 0207e6f57..a8e2e3e23 100644
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -25,5 +25,4 @@ namespace rtengine
 {
 void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true);
 void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false);
-int calcContrastThreshold(float** luminance, float **blend, int W, int H);
 }

From 3f606c776a06a42f6851bec4142d4a7da6e9d09d Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Dec 2018 22:07:56 +0100
Subject: [PATCH 3/5] exclude tiles with too low variance from calculation of
 dual demosaic contrast threshold

---
 rtengine/rt_algo.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index a2ca3cac4..a304b654f 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -304,6 +304,7 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
     if (autoContrast) {
         constexpr float minLuminance = 2000.f;
         constexpr float maxLuminance = 20000.f;
+        constexpr float minTileVariance = 0.5f;
         for (int pass = 0; pass < 2; ++pass) {
             const int tilesize = 80 / (pass + 1);
             const int skip = pass == 0 ? tilesize : tilesize / 4;
@@ -325,6 +326,8 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                         continue;
                     } else {
                         variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
+                        // exclude tiles with a variance less than minTileVariance
+                        variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
                     }
                 }
             }
@@ -341,10 +344,9 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                 }
             }
 
-            const int minY = skip * minI;
-            const int minX = skip * minJ;
-
             if (minvar <= 1.f || pass == 1) {
+                const int minY = skip * minI;
+                const int minX = skip * minJ;
                 if (pass == 0) {
                     // a variance <= 1 means we already found a flat region and can skip second pass
                     contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize);
@@ -374,6 +376,8 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                                 continue;
                             } else {
                                 variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
+                            // exclude tiles with a variance less than minTileVariance
+                            variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
                             }
                         }
                     }

From 1d010029538833bd21af3b3d9ae1d19211ec9474 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 8 Dec 2018 17:42:31 +0100
Subject: [PATCH 4/5] Added console output, #5070

---
 rtengine/rt_algo.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index a304b654f..60042cba4 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -32,7 +32,7 @@
 #include "rt_algo.h"
 #include "rt_math.h"
 #include "sleef.c"
-
+#include <iostream>
 namespace {
 float calcBlendFactor(float val, float threshold) {
     // sigmoid function
@@ -343,7 +343,7 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                     }
                 }
             }
-
+            std::cout << pass + 1 << " : " << minvar << std::endl;
             if (minvar <= 1.f || pass == 1) {
                 const int minY = skip * minI;
                 const int minX = skip * minJ;
@@ -393,6 +393,7 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                             }
                         }
                     }
+                    std::cout << 3 << " : " << minvar << std::endl;
                     contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
                 }
             }

From 872279ae97a34a9290d42f080007e7c3c34f0cff Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 9 Dec 2018 22:39:00 +0100
Subject: [PATCH 5/5] Remove some console output before merge

---
 rtengine/rt_algo.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index 60042cba4..1011ae7b7 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -32,7 +32,7 @@
 #include "rt_algo.h"
 #include "rt_math.h"
 #include "sleef.c"
-#include <iostream>
+
 namespace {
 float calcBlendFactor(float val, float threshold) {
     // sigmoid function
@@ -343,7 +343,7 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                     }
                 }
             }
-            std::cout << pass + 1 << " : " << minvar << std::endl;
+
             if (minvar <= 1.f || pass == 1) {
                 const int minY = skip * minI;
                 const int minX = skip * minJ;
@@ -393,7 +393,7 @@ void buildBlendMask(float** luminance, float **blend, int W, int H, float &contr
                             }
                         }
                     }
-                    std::cout << 3 << " : " << minvar << std::endl;
+
                     contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
                 }
             }