Merge pull request #6064 from Beep6581/rcd_new

Rcd improvements
2021-01-19 10:59:18 +01:00
parent 26a97468d9 d607028871
commit 049a6143f0
1 changed files with 82 additions and 98 deletions
--- a/rtengine/rcd_demosaic.cc
+++ b/rtengine/rcd_demosaic.cc
@@ -21,7 +21,6 @@
 #include "rawimagesource.h"
 #include "rt_math.h"
 #include "../rtgui/multilangmgr.h"
-#include "opthelper.h"
 #include "StopWatch.h"

 using namespace std;
@@ -36,7 +35,7 @@ unsigned fc(const unsigned int cfa[2][2], int r, int c) {
 namespace rtengine
 {

-/**
+/*
 * RATIO CORRECTED DEMOSAICING
 * Luis Sanz Rodriguez (luis.sanz.rodriguez(at)gmail(dot)com)
 *
@@ -45,7 +44,12 @@ namespace rtengine
 * Original code from https://github.com/LuisSR/RCD-Demosaicing
 * Licensed under the GNU GPL version 3
 */
+
 // Tiled version by Ingo Weyrich (heckflosse67@gmx.de)
+// Luis Sanz Rodriguez significantly optimised the v 2.3 code and simplified the directional
+// coefficients in an exact, shorter and more performant formula.
+// In cooperation with Hanno Schwalm (hanno@schwalm-bremen.de) and Luis Sanz Rodriguez this has been tuned for performance.
+
 void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
 {
    // Test for RGB cfa
@@ -61,7 +65,6 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
    }

    std::unique_ptr<StopWatch> stop;
-
    if (measure) {
        std::cout << "Demosaicing " << W << "x" << H << " image using rcd with " << chunkSize << " tiles per thread" << std::endl;
        stop.reset(new StopWatch("rcd demosaic"));
@@ -75,9 +78,10 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
    }
    
    const unsigned int cfarray[2][2] = {{FC(0,0), FC(0,1)}, {FC(1,0), FC(1,1)}};
-    constexpr int rcdBorder = 9;
-    constexpr int tileSize = 214;
-    constexpr int tileSizeN = tileSize - 2 * rcdBorder;
+    constexpr int tileBorder = 9; // avoid tile-overlap errors
+    constexpr int rcdBorder = 6;  // for the outermost tiles we can have a smaller outer border
+    constexpr int tileSize = 194;
+    constexpr int tileSizeN = tileSize - 2 * tileBorder;
    const int numTh = H / (tileSizeN) + ((H % (tileSizeN)) ? 1 : 0);
    const int numTw = W / (tileSizeN) + ((W % (tileSizeN)) ? 1 : 0);
    constexpr int w1 = tileSize, w2 = 2 * tileSize, w3 = 3 * tileSize, w4 = 4 * tileSize;
@@ -96,6 +100,8 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
    float *const VH_Dir = (float*) calloc(tileSize * tileSize, sizeof *VH_Dir);
    float *const PQ_Dir = (float*) calloc(tileSize * tileSize / 2, sizeof *PQ_Dir);
    float *const lpf = PQ_Dir; // reuse buffer, they don't overlap in usage
+    float *const P_CDiff_Hpf = (float*) calloc(tileSize * tileSize / 2, sizeof *P_CDiff_Hpf);
+    float *const Q_CDiff_Hpf = (float*) calloc(tileSize * tileSize / 2, sizeof *Q_CDiff_Hpf);

 #ifdef _OPENMP
    #pragma omp for schedule(dynamic, chunkSize) collapse(2) nowait
@@ -104,12 +110,12 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
        for (int tc = 0; tc < numTw; ++tc) {
            const int rowStart = tr * tileSizeN;
            const int rowEnd = std::min(rowStart + tileSize, H);
-            if (rowStart + rcdBorder == rowEnd - rcdBorder) {
+            if (rowStart + tileBorder == rowEnd - tileBorder) {
                continue;
            }
            const int colStart = tc * tileSizeN;
            const int colEnd = std::min(colStart + tileSize, W);
-            if (colStart + rcdBorder == colEnd - rcdBorder) {
+            if (colStart + tileBorder == colEnd - tileBorder) {
                continue;
            }

@@ -117,93 +123,59 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
            const int tilecols = std::min(colEnd - colStart, tileSize);

            for (int row = rowStart; row < rowEnd; row++) {
-                int indx = (row - rowStart) * tileSize;
                const int c0 = fc(cfarray, row, colStart);
                const int c1 = fc(cfarray, row, colStart + 1);
-                int col = colStart;
-                for (; col < colEnd - 1; col+=2, indx+=2) {
-                    cfa[indx] = rgb[c0][indx] = LIM01(rawData[row][col] / scale);
-                    cfa[indx + 1] = rgb[c1][indx + 1] = LIM01(rawData[row][col + 1] / scale);
-                }
-                if (col < colEnd) {
-                    cfa[indx] = rgb[c0][indx] = LIM01(rawData[row][col] / scale);
+                for (int col = colStart, indx = (row - rowStart) * tileSize; col < colEnd; ++col, ++indx) {
+                    cfa[indx] = rgb[c0][indx] = rgb[c1][indx] = LIM01(rawData[row][col] / scale);
                }
            }

-            /**
-            * STEP 1: Find cardinal and diagonal interpolation directions
-            */
+            // Step 1: Find cardinal and diagonal interpolation directions
+            float bufferV[3][tileSize - 8];

-            for (int row = 4; row < tileRows - 4; row++) {
-                for (int col = 4, indx = row * tileSize + col; col < tilecols - 4; col++, indx++) {
-                    const float cfai = cfa[indx];
-                    //Calculate h/v local discrimination
-                    float V_Stat = std::max(epssq, -18.f * cfai * (cfa[indx - w1] + cfa[indx + w1] + 2.f * (cfa[indx - w2] + cfa[indx + w2]) - cfa[indx - w3] - cfa[indx + w3]) - 2.f * cfai * (cfa[indx - w4] + cfa[indx + w4] - 19.f * cfai) - cfa[indx - w1] * (70.f * cfa[indx + w1] + 12.f * cfa[indx - w2] - 24.f * cfa[indx + w2] + 38.f * cfa[indx - w3] - 16.f * cfa[indx + w3] - 12.f * cfa[indx - w4] + 6.f * cfa[indx + w4] - 46.f * cfa[indx - w1]) + cfa[indx + w1] * (24.f * cfa[indx - w2] - 12.f * cfa[indx + w2] + 16.f * cfa[indx - w3] - 38.f * cfa[indx + w3] - 6.f * cfa[indx - w4] + 12.f * cfa[indx + w4] + 46.f * cfa[indx + w1]) + cfa[indx - w2] * (14.f * cfa[indx + w2] - 12.f * cfa[indx + w3] - 2.f * cfa[indx - w4] + 2.f * cfa[indx + w4] + 11.f * cfa[indx - w2]) + cfa[indx + w2] * (-12.f * cfa[indx - w3] + 2.f * (cfa[indx - w4] - cfa[indx + w4]) + 11.f * cfa[indx + w2]) + cfa[indx - w3] * (2.f * cfa[indx + w3] - 6.f * cfa[indx - w4] + 10.f * cfa[indx - w3]) + cfa[indx + w3] * (-6.f * cfa[indx + w4] + 10.f * cfa[indx + w3]) + cfa[indx - w4] * cfa[indx - w4] + cfa[indx + w4] * cfa[indx + w4]);
-                    float H_Stat = std::max(epssq, -18.f * cfai * (cfa[indx -  1] + cfa[indx +  1] + 2.f * (cfa[indx -  2] + cfa[indx +  2]) - cfa[indx -  3] - cfa[indx +  3]) - 2.f * cfai * (cfa[indx -  4] + cfa[indx +  4] - 19.f * cfai) - cfa[indx -  1] * (70.f * cfa[indx +  1] + 12.f * cfa[indx -  2] - 24.f * cfa[indx +  2] + 38.f * cfa[indx -  3] - 16.f * cfa[indx +  3] - 12.f * cfa[indx -  4] + 6.f * cfa[indx +  4] - 46.f * cfa[indx -  1]) + cfa[indx +  1] * (24.f * cfa[indx -  2] - 12.f * cfa[indx +  2] + 16.f * cfa[indx -  3] - 38.f * cfa[indx +  3] - 6.f * cfa[indx -  4] + 12.f * cfa[indx +  4] + 46.f * cfa[indx +  1]) + cfa[indx -  2] * (14.f * cfa[indx +  2] - 12.f * cfa[indx +  3] - 2.f * cfa[indx -  4] + 2.f * cfa[indx +  4] + 11.f * cfa[indx -  2]) + cfa[indx +  2] * (-12.f * cfa[indx -  3] + 2.f * (cfa[indx -  4] - cfa[indx +  4]) + 11.f * cfa[indx +  2]) + cfa[indx -  3] * (2.f * cfa[indx +  3] - 6.f * cfa[indx -  4] + 10.f * cfa[indx -  3]) + cfa[indx +  3] * (-6.f * cfa[indx +  4] + 10.f * cfa[indx +  3]) + cfa[indx -  4] * cfa[indx -  4] + cfa[indx +  4] * cfa[indx +  4]);
+            // Step 1.1: Calculate the square of the vertical and horizontal color difference high pass filter
+            for (int row = 3; row < std::min(tileRows - 3, 5); ++row) {
+                for (int col = 4, indx = row * tileSize + col; col < tilecols - 4; ++col, ++indx) {
+                    bufferV[row - 3][col - 4] = SQR((cfa[indx - w3] - cfa[indx - w1] - cfa[indx + w1] + cfa[indx + w3]) - 3.f * (cfa[indx - w2] + cfa[indx + w2])  + 6.f * cfa[indx]);
+                }
+            }
+
+            // Step 1.2: Obtain the vertical and horizontal directional discrimination strength
+            float bufferH[tileSize - 6] ALIGNED16;
+            float* V0 = bufferV[0];
+            float* V1 = bufferV[1];
+            float* V2 = bufferV[2];
+            for (int row = 4; row < tileRows - 4; ++row) {
+                for (int col = 3, indx = row * tileSize + col; col < tilecols - 3; ++col, ++indx) {
+                    bufferH[col - 3] = SQR((cfa[indx -  3] - cfa[indx -  1] - cfa[indx +  1] + cfa[indx +  3]) - 3.f * (cfa[indx -  2] + cfa[indx +  2]) + 6.f * cfa[indx]);
+                }
+                for (int col = 4, indx = (row + 1) * tileSize + col; col < tilecols - 4; ++col, ++indx) {
+                    V2[col - 4] = SQR((cfa[indx - w3] - cfa[indx - w1] - cfa[indx + w1] + cfa[indx + w3]) - 3.f * (cfa[indx - w2] + cfa[indx + w2])  + 6.f * cfa[indx]);
+                }
+                for (int col = 4, indx = row * tileSize + col; col < tilecols - 4; ++col, ++indx) {
+
+                    float V_Stat = std::max(epssq, V0[col - 4] + V1[col - 4] + V2[col - 4]);
+                    float H_Stat = std::max(epssq, bufferH[col -  4] + bufferH[col - 3] + bufferH[col -  2]);

                    VH_Dir[indx] = V_Stat / (V_Stat + H_Stat);
                }
+                // rotate pointers from row0, row1, row2 to row1, row2, row0
+                std::swap(V0, V2);
+                std::swap(V0, V1);
            }

-            /**
-            * STEP 2: Calculate the low pass filter
-            */
-            // Step 2.1: Low pass filter incorporating green, red and blue local samples from the raw data
-
-            for (int row = 2; row < tileRows - 2; row++) {
+            // Step 2: Low pass filter incorporating green, red and blue local samples from the raw data
+            for (int row = 2; row < tileRows - 2; ++row) {
                for (int col = 2 + (fc(cfarray, row, 0) & 1), indx = row * tileSize + col, lpindx = indx / 2; col < tilecols - 2; col += 2, indx += 2, ++lpindx) {
-                    lpf[lpindx] = 0.25f * cfa[indx] +
-                                  0.125f * (cfa[indx - w1] + cfa[indx + w1] + cfa[indx - 1] + cfa[indx + 1]) +
-                                  0.0625f * (cfa[indx - w1 - 1] + cfa[indx - w1 + 1] + cfa[indx + w1 - 1] + cfa[indx + w1 + 1]);
+                    lpf[lpindx] = cfa[indx] +
+                                  0.5f * (cfa[indx - w1] + cfa[indx + w1] + cfa[indx - 1] + cfa[indx + 1]) +
+                                  0.25f * (cfa[indx - w1 - 1] + cfa[indx - w1 + 1] + cfa[indx + w1 - 1] + cfa[indx + w1 + 1]);
                }
            }

-            /**
-            * STEP 3: Populate the green channel
-            */
-            // Step 3.1: Populate the green channel at blue and red CFA positions
-            for (int row = 4; row < tileRows - 4; row++) {
-                int col = 4 + (fc(cfarray, row, 0) & 1);
-                int indx = row * tileSize + col;
-                int lpindx = indx / 2;
-#ifdef __SSE2__
-                const vfloat zd5v = F2V(0.5f);
-                const vfloat zd25v = F2V(0.25f);
-                const vfloat epsv = F2V(eps);
-                for (; col < tilecols - 7; col += 8, indx += 8, lpindx += 4) {
-                   // Cardinal gradients
-                    const vfloat cfai = LC2VFU(cfa[indx]);
-                    const vfloat N_Grad = epsv + (vabsf(LC2VFU(cfa[indx - w1]) - LC2VFU(cfa[indx + w1])) + vabsf(cfai - LC2VFU(cfa[indx - w2]))) + (vabsf(LC2VFU(cfa[indx - w1]) - LC2VFU(cfa[indx - w3])) + vabsf(LC2VFU(cfa[indx - w2]) - LC2VFU(cfa[indx - w4])));
-                    const vfloat S_Grad = epsv + (vabsf(LC2VFU(cfa[indx - w1]) - LC2VFU(cfa[indx + w1])) + vabsf(cfai - LC2VFU(cfa[indx + w2]))) + (vabsf(LC2VFU(cfa[indx + w1]) - LC2VFU(cfa[indx + w3])) + vabsf(LC2VFU(cfa[indx + w2]) - LC2VFU(cfa[indx + w4])));
-                    const vfloat W_Grad = epsv + (vabsf(LC2VFU(cfa[indx -  1]) - LC2VFU(cfa[indx +  1])) + vabsf(cfai - LC2VFU(cfa[indx -  2]))) + (vabsf(LC2VFU(cfa[indx -  1]) - LC2VFU(cfa[indx -  3])) + vabsf(LC2VFU(cfa[indx -  2]) - LC2VFU(cfa[indx -  4])));
-                    const vfloat E_Grad = epsv + (vabsf(LC2VFU(cfa[indx -  1]) - LC2VFU(cfa[indx +  1])) + vabsf(cfai - LC2VFU(cfa[indx +  2]))) + (vabsf(LC2VFU(cfa[indx +  1]) - LC2VFU(cfa[indx +  3])) + vabsf(LC2VFU(cfa[indx +  2]) - LC2VFU(cfa[indx +  4])));
-
-                    // Cardinal pixel estimations
-                    const vfloat lpfi = LVFU(lpf[lpindx]);
-                    const vfloat N_Est = LC2VFU(cfa[indx - w1]) + (LC2VFU(cfa[indx - w1]) * (lpfi - LVFU(lpf[lpindx - w1])) / (epsv + lpfi + LVFU(lpf[lpindx - w1])));
-                    const vfloat S_Est = LC2VFU(cfa[indx + w1]) + (LC2VFU(cfa[indx + w1]) * (lpfi - LVFU(lpf[lpindx + w1])) / (epsv + lpfi + LVFU(lpf[lpindx + w1])));
-                    const vfloat W_Est = LC2VFU(cfa[indx -  1]) + (LC2VFU(cfa[indx -  1]) * (lpfi - LVFU(lpf[lpindx -  1])) / (epsv + lpfi + LVFU(lpf[lpindx -  1])));
-                    const vfloat E_Est = LC2VFU(cfa[indx +  1]) + (LC2VFU(cfa[indx +  1]) * (lpfi - LVFU(lpf[lpindx +  1])) / (epsv + lpfi + LVFU(lpf[lpindx +  1])));
-
-                    // Vertical and horizontal estimations
-                    const vfloat V_Est = (S_Grad * N_Est + N_Grad * S_Est) / (N_Grad + S_Grad);
-                    const vfloat H_Est = (W_Grad * E_Est + E_Grad * W_Est) / (E_Grad + W_Grad);
-
-                    // G@B and G@R interpolation
-                    // Refined vertical and horizontal local discrimination
-                    const vfloat VH_Central_Value = LC2VFU(VH_Dir[indx]);
-                    const vfloat VH_Neighbourhood_Value = zd25v * ((LC2VFU(VH_Dir[indx - w1 - 1]) + LC2VFU(VH_Dir[indx - w1 + 1])) + (LC2VFU(VH_Dir[indx + w1 - 1]) + LC2VFU(VH_Dir[indx + w1 + 1])));
-
-#if defined(__clang__)
-                    const vfloat VH_Disc = vself(vmaskf_lt(vabsf(zd5v - VH_Central_Value), vabsf(zd5v - VH_Neighbourhood_Value)), VH_Neighbourhood_Value, VH_Central_Value);
-#else
-                    const vfloat VH_Disc = vabsf(zd5v - VH_Central_Value) < vabsf(zd5v - VH_Neighbourhood_Value) ? VH_Neighbourhood_Value : VH_Central_Value;
-#endif
-                    const vfloat result = vintpf(VH_Disc, H_Est, V_Est);
-                    STC2VFU(rgb[1][indx], result);
-               }
-#endif
-               for (; col < tilecols - 4; col += 2, indx += 2, ++lpindx) {
+            // Step 3: Populate the green channel at blue and red CFA positions
+            for (int row = 4; row < tileRows - 4; ++row) {
+                for (int col = 4 + (fc(cfarray, row, 0) & 1), indx = row * tileSize + col, lpindx = indx / 2; col < tilecols - 4; col += 2, indx += 2, ++lpindx) {
                    // Cardinal gradients
                    const float cfai = cfa[indx];
                    const float N_Grad = eps + (std::fabs(cfa[indx - w1] - cfa[indx + w1]) + std::fabs(cfai - cfa[indx - w2])) + (std::fabs(cfa[indx - w1] - cfa[indx - w3]) + std::fabs(cfa[indx - w2] - cfa[indx - w4]));
@@ -213,10 +185,10 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)

                    // Cardinal pixel estimations
                    const float lpfi = lpf[lpindx];
-                    const float N_Est = cfa[indx - w1] * (1.f + (lpfi - lpf[lpindx - w1]) / (eps + lpfi + lpf[lpindx - w1]));
-                    const float S_Est = cfa[indx + w1] * (1.f + (lpfi - lpf[lpindx + w1]) / (eps + lpfi + lpf[lpindx + w1]));
-                    const float W_Est = cfa[indx -  1] * (1.f + (lpfi - lpf[lpindx -  1]) / (eps + lpfi + lpf[lpindx -  1]));
-                    const float E_Est = cfa[indx +  1] * (1.f + (lpfi - lpf[lpindx +  1]) / (eps + lpfi + lpf[lpindx +  1]));
+                    const float N_Est = cfa[indx - w1] * (lpfi + lpfi) / (eps + lpfi + lpf[lpindx - w1]);
+                    const float S_Est = cfa[indx + w1] * (lpfi + lpfi) / (eps + lpfi + lpf[lpindx + w1]);
+                    const float W_Est = cfa[indx -  1] * (lpfi + lpfi) / (eps + lpfi + lpf[lpindx -  1]);
+                    const float E_Est = cfa[indx +  1] * (lpfi + lpfi) / (eps + lpfi + lpf[lpindx +  1]);

                    // Vertical and horizontal estimations
                    const float V_Est = (S_Grad * N_Est + N_Grad * S_Est) / (N_Grad + S_Grad);
@@ -236,21 +208,26 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
            * STEP 4: Populate the red and blue channels
            */

-            // Step 4.1: Calculate P/Q diagonal local discrimination
-            for (int row = rcdBorder - 4; row < tileRows - rcdBorder + 4; row++) {
-                for (int col = rcdBorder - 4 + (fc(cfarray, row, rcdBorder) & 1), indx = row * tileSize + col, pqindx = indx / 2; col < tilecols - rcdBorder + 4; col += 2, indx += 2, ++pqindx) {
-                    const float cfai = cfa[indx];
+            // Step 4.0: Calculate the square of the P/Q diagonals color difference high pass filter
+            for (int row = 3; row < tileRows - 3; ++row) {
+                for (int col = 3, indx = row * tileSize + col, indx2 = indx / 2; col < tilecols - 3; col+=2, indx+=2, indx2++ ) {
+                    P_CDiff_Hpf[indx2] = SQR((cfa[indx - w3 - 3] - cfa[indx - w1 - 1] - cfa[indx + w1 + 1] + cfa[indx + w3 + 3]) - 3.f * (cfa[indx - w2 - 2] + cfa[indx + w2 + 2]) + 6.f * cfa[indx]);
+                    Q_CDiff_Hpf[indx2] = SQR((cfa[indx - w3 + 3] - cfa[indx - w1 + 1] - cfa[indx + w1 - 1] + cfa[indx + w3 - 3]) - 3.f * (cfa[indx - w2 + 2] + cfa[indx + w2 - 2]) + 6.f * cfa[indx]);
+                }
+            }

-                    float P_Stat = std::max(epssq, - 18.f * cfai * (cfa[indx - w1 - 1] + cfa[indx + w1 + 1] + 2.f * (cfa[indx - w2 - 2] + cfa[indx + w2 + 2]) - cfa[indx - w3 - 3] - cfa[indx + w3 + 3]) - 2.f * cfai * (cfa[indx - w4 - 4] + cfa[indx + w4 + 4] - 19.f * cfai) - cfa[indx - w1 - 1] * (70.f * cfa[indx + w1 + 1] - 12.f * cfa[indx - w2 - 2] + 24.f * cfa[indx + w2 + 2] - 38.f * cfa[indx - w3 - 3] + 16.f * cfa[indx + w3 + 3] + 12.f * cfa[indx - w4 - 4] - 6.f * cfa[indx + w4 + 4] + 46.f * cfa[indx - w1 - 1]) + cfa[indx + w1 + 1] * (24.f * cfa[indx - w2 - 2] - 12.f * cfa[indx + w2 + 2] + 16.f * cfa[indx - w3 - 3] - 38.f * cfa[indx + w3 + 3] - 6.f * cfa[indx - w4 - 4] + 12.f * cfa[indx + w4 + 4] + 46.f * cfa[indx + w1 + 1]) + cfa[indx - w2 - 2] * (14.f * cfa[indx + w2 + 2] - 12.f * cfa[indx + w3 + 3] - 2.f * (cfa[indx - w4 - 4] - cfa[indx + w4 + 4]) + 11.f * cfa[indx - w2 - 2]) - cfa[indx + w2 + 2] * (12.f * cfa[indx - w3 - 3] + 2.f * (cfa[indx - w4 - 4] - cfa[indx + w4 + 4]) + 11.f * cfa[indx + w2 + 2]) + cfa[indx - w3 - 3] * (2.f * cfa[indx + w3 + 3] - 6.f * cfa[indx - w4 - 4] + 10.f * cfa[indx - w3 - 3]) - cfa[indx + w3 + 3] * (6.f * cfa[indx + w4 + 4] + 10.f * cfa[indx + w3 + 3]) + cfa[indx - w4 - 4] * cfa[indx - w4 - 4] + cfa[indx + w4 + 4] * cfa[indx + w4 + 4]);
-                    float Q_Stat = std::max(epssq, - 18.f * cfai * (cfa[indx + w1 - 1] + cfa[indx - w1 + 1] + 2.f * (cfa[indx + w2 - 2] + cfa[indx - w2 + 2]) - cfa[indx + w3 - 3] - cfa[indx - w3 + 3]) - 2.f * cfai * (cfa[indx + w4 - 4] + cfa[indx - w4 + 4] - 19.f * cfai) - cfa[indx + w1 - 1] * (70.f * cfa[indx - w1 + 1] - 12.f * cfa[indx + w2 - 2] + 24.f * cfa[indx - w2 + 2] - 38.f * cfa[indx + w3 - 3] + 16.f * cfa[indx - w3 + 3] + 12.f * cfa[indx + w4 - 4] - 6.f * cfa[indx - w4 + 4] + 46.f * cfa[indx + w1 - 1]) + cfa[indx - w1 + 1] * (24.f * cfa[indx + w2 - 2] - 12.f * cfa[indx - w2 + 2] + 16.f * cfa[indx + w3 - 3] - 38.f * cfa[indx - w3 + 3] - 6.f * cfa[indx + w4 - 4] + 12.f * cfa[indx - w4 + 4] + 46.f * cfa[indx - w1 + 1]) + cfa[indx + w2 - 2] * (14.f * cfa[indx - w2 + 2] - 12.f * cfa[indx - w3 + 3] - 2.f * (cfa[indx + w4 - 4] - cfa[indx - w4 + 4]) + 11.f * cfa[indx + w2 - 2]) - cfa[indx - w2 + 2] * (12.f * cfa[indx + w3 - 3] + 2.f * (cfa[indx + w4 - 4] - cfa[indx - w4 + 4]) + 11.f * cfa[indx - w2 + 2]) + cfa[indx + w3 - 3] * (2.f * cfa[indx - w3 + 3] - 6.f * cfa[indx + w4 - 4] + 10.f * cfa[indx + w3 - 3]) - cfa[indx - w3 + 3] * (6.f * cfa[indx - w4 + 4] + 10.f * cfa[indx - w3 + 3]) + cfa[indx + w4 - 4] * cfa[indx + w4 - 4] + cfa[indx - w4 + 4] * cfa[indx - w4 + 4]);
-
-                    PQ_Dir[pqindx] = P_Stat / (P_Stat + Q_Stat);
+            // Step 4.1: Obtain the P/Q diagonals directional discrimination strength
+            for (int row = 4; row < tileRows - 4; ++row) {
+                for (int col = 4 + (fc(cfarray, row, 0) & 1), indx = row * tileSize + col, indx2 = indx / 2, indx3 = (indx - w1 - 1) / 2, indx4 = (indx + w1 - 1) / 2; col < tilecols - 4; col += 2, indx += 2, indx2++, indx3++, indx4++ ) {
+                    float P_Stat = std::max(epssq, P_CDiff_Hpf[indx3] + P_CDiff_Hpf[indx2] + P_CDiff_Hpf[indx4 + 1]);
+                    float Q_Stat = std::max(epssq, Q_CDiff_Hpf[indx3 + 1] + Q_CDiff_Hpf[indx2] + Q_CDiff_Hpf[indx4]);
+                    PQ_Dir[indx2] = P_Stat / (P_Stat + Q_Stat);
                }
            }

            // Step 4.2: Populate the red and blue channels at blue and red CFA positions
-            for (int row = rcdBorder - 3; row < tileRows - rcdBorder + 3; row++) {
-                for (int col = rcdBorder - 3 + (fc(cfarray, row, rcdBorder - 1) & 1), indx = row * tileSize + col, c = 2 - fc(cfarray, row, col), pqindx = indx / 2, pqindx2 = (indx - w1 - 1) / 2, pqindx3 = (indx + w1 - 1) / 2; col < tilecols - rcdBorder + 3; col += 2, indx += 2, ++pqindx, ++pqindx2, ++pqindx3) {
+            for (int row = 4; row < tileRows - 4; ++row) {
+                for (int col = 4 + (fc(cfarray, row, 0) & 1), indx = row * tileSize + col, c = 2 - fc(cfarray, row, col), pqindx = indx / 2, pqindx2 = (indx - w1 - 1) / 2, pqindx3 = (indx + w1 - 1) / 2; col < tilecols - 4; col += 2, indx += 2, ++pqindx, ++pqindx2, ++pqindx3) {

                    // Refined P/Q diagonal local discrimination
                    float PQ_Central_Value   = PQ_Dir[pqindx];
@@ -280,8 +257,8 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
            }

            // Step 4.3: Populate the red and blue channels at green CFA positions
-            for (int row = rcdBorder; row < tileRows - rcdBorder; row++) {
-                for (int col = rcdBorder + (fc(cfarray, row, rcdBorder - 1) & 1), indx = row * tileSize + col; col < tilecols - rcdBorder; col += 2, indx += 2) {
+            for (int row = 4; row < tileRows - 4; ++row) {
+                for (int col = 4 + (fc(cfarray, row, 1) & 1), indx = row * tileSize + col; col < tilecols - 4; col += 2, indx += 2) {

                    // Refined vertical and horizontal local discrimination
                    float VH_Central_Value = VH_Dir[indx];
@@ -323,8 +300,13 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
                }
            }

-            for (int row = rowStart + rcdBorder; row < rowEnd - rcdBorder; ++row) {
-                for (int col = colStart + rcdBorder; col < colEnd - rcdBorder; ++col) {
+            // For the outermost tiles in all directions we can use a smaller border margin
+            const int firstVertical = rowStart + ((tr == 0) ? rcdBorder : tileBorder);
+            const int lastVertical = rowEnd - ((tr == numTh - 1) ? rcdBorder : tileBorder);
+            const int firstHorizontal = colStart + ((tc == 0) ? rcdBorder : tileBorder);
+            const int lastHorizontal =  colEnd - ((tc == numTw - 1) ? rcdBorder : tileBorder);
+            for (int row = firstVertical; row < lastVertical; ++row) {
+                for (int col = firstHorizontal; col < lastHorizontal; ++col) {
                    int idx = (row - rowStart) * tileSize + col - colStart ;
                    red[row][col] = std::max(0.f, rgb[0][idx] * scale);
                    green[row][col] = std::max(0.f, rgb[1][idx] * scale);
@@ -352,6 +334,8 @@ void RawImageSource::rcd_demosaic(size_t chunkSize, bool measure)
    free(rgb);
    free(VH_Dir);
    free(PQ_Dir);
+    free(P_CDiff_Hpf);
+    free(Q_CDiff_Hpf);
 }

    border_interpolate(W, H, rcdBorder, rawData, red, green, blue);