From 6a975200078abb25bf0cb6da3e097b5a7f0d112f Mon Sep 17 00:00:00 2001
From: TooWaBoo <TooWaBoo@users.noreply.github.com>
Date: Sat, 19 May 2018 11:46:07 +0200
Subject: [PATCH 1/6] Update Deutsch locale revised

---
 rtdata/languages/Deutsch | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/rtdata/languages/Deutsch b/rtdata/languages/Deutsch
index 4fdd0e449..bbe76a991 100644
--- a/rtdata/languages/Deutsch
+++ b/rtdata/languages/Deutsch
@@ -56,6 +56,7 @@
 #55 06.04.2018 Erweiterung (TooWaBoo) RT 5.4
 #56 27.04.2018 Erweiterung (TooWaBoo) RT 5.4
 #57 17.05.2018 Erweiterung (TooWaBoo) RT 5.4
+#58 19.05.2018 Erweiterung (TooWaBoo) RT 5.4
 
 ABOUT_TAB_BUILD;Version
 ABOUT_TAB_CREDITS;Danksagungen
@@ -1339,7 +1340,7 @@ TP_BWMIX_MET;Methode
 TP_BWMIX_MET_CHANMIX;Kanalmixer
 TP_BWMIX_MET_DESAT;Entsättigung
 TP_BWMIX_MET_LUMEQUAL;Luminanz
-TP_BWMIX_RGBLABEL;R: %1%%   G: %2%%   B: %3%%  Gesamt: %4%%
+TP_BWMIX_RGBLABEL;R: %1%% G: %2%% B: %3%% Gesamt: %4%%
 TP_BWMIX_RGBLABEL_HINT;RGB-Faktoren\n\nGesamt: Summe aller RGB-Werte.\n- immer 100% im Modus Relativ\n- höher (heller), oder niedriger (dunkler) 100% im Modus Absolut
 TP_BWMIX_RGB_TOOLTIP;Mischen Sie die Kanäle. Verwenden Sie die Vorgaben zur Orientierung.\nNegative Werte können zu Artefakten führen.
 TP_BWMIX_SETTING;Voreinstellung
@@ -2278,11 +2279,11 @@ ZOOMPANEL_ZOOMOUT;Herauszoomen\nTaste: <b>-</b>
 ! Untranslated keys follow; remove the ! prefix after an entry is translated.
 !!!!!!!!!!!!!!!!!!!!!!!!!
 
-!ADJUSTER_RESET_TO_DEFAULT;<b>Click</b> - reset to default value.\n<b>Ctrl</b>+<b>click</b> - reset to initial value.
-!GENERAL_RESET;Reset
-!HISTORY_MSG_235;B&amp;W - CM - Auto
-!HISTORY_MSG_237;B&amp;W - CM
-!HISTORY_MSG_273;CT - Color Balance SMH
-!HISTORY_MSG_392;W - Residual - Color Balance
-!TP_BWMIX_MIXC;Channel Mixer
-!TP_BWMIX_NEUTRAL;Reset
+ADJUSTER_RESET_TO_DEFAULT;<b>Klick</b> - Auf Standardwert zurücksetzen.\n<b>Strg</b> + <b>Klick</b> - Auf Initialwert zurücksetzen.
+GENERAL_RESET;Zurücksetzen
+HISTORY_MSG_235;(Schwarz/Weiß)\nAuto-Kanalmixer
+HISTORY_MSG_237;(Schwarz/Weiß) - Mixer
+HISTORY_MSG_273;(Farbanpassungen)\nFarbausgleich\nRegler zurücksetzen
+HISTORY_MSG_392;(Wavelet) - Restbild\nFarbausgleich
+TP_BWMIX_MIXC;Kanalmixer
+TP_BWMIX_NEUTRAL;Zurücksetzen

From 2828e2933dc5bf7e7bd0c1f09593488dded2b886 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 19 May 2018 12:24:29 +0200
Subject: [PATCH 2/6] pixelshift: use auto-calculated ca-correction parameters
 from first frame for all frames

---
 rtengine/CA_correct_RT.cc  | 73 ++++++++++++++++++++++++++++++--------
 rtengine/rawimagesource.cc |  9 +++--
 rtengine/rawimagesource.h  |  2 +-
 3 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index cc527a891..3712eea22 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -27,7 +27,7 @@
 #include "rawimagesource.h"
 #include "rt_math.h"
 #include "median.h"
-
+#include "StopWatch.h"
 namespace {
 
 bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
@@ -111,7 +111,7 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 using namespace std;
 using namespace rtengine;
 
-void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData)
+void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
 {
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
@@ -135,11 +135,13 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
     // local variables
     const int width = W + (W & 1), height = H;
-    //temporary array to store simple interpolation of G
-    float *Gtmp = (float (*)) malloc ((height * width) / 2 * sizeof * Gtmp);
 
-    // temporary array to avoid race conflicts, only every second pixel needs to be saved here
-    float *RawDataTmp = (float*) malloc( (height * width) * sizeof(float) / 2);
+    //temporary array to store simple interpolation of G
+    if (!buffer) {
+        buffer = (float (*)) malloc ((height * width) / 2 * sizeof (float) + (height * width) * sizeof(float) / 2);
+    }
+    float *Gtmp = buffer;
+    float *RawDataTmp = buffer + (height * width) / 2;
 
     float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
 
@@ -159,7 +161,17 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);
 
     double fitparams[2][2][16];
-
+    const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
+    if(autoCA && fitParamsSet) {
+        int index = 0;
+        for(int c = 0; c < 2; ++c) {
+            for(int d = 0; d < 2; ++d) {
+                for(int e = 0; e < 16; ++e) {
+                    fitparams[c][d][e] = fitParamsTransfer[index++];
+                }
+            }
+        }
+    }
     //order of 2d polynomial fit (polyord), and numpar=polyord^2
     int polyord = 4, numpar = 16;
 
@@ -186,8 +198,8 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
         // assign working space
         constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
-        char *buffer = (char *) malloc(buffersize);
-        char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
+        char *bufferThr = (char *) malloc(buffersize);
+        char *data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
 
         // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
 
@@ -214,12 +226,12 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         float *gshift  = rbhpfv; // there is no overlap in buffer usage => share
 
 
-        if (autoCA) {
+        if (autoCA && !fitParamsSet) {
             // Main algorithm: Tile loop calculating correction parameters per tile
             #pragma omp for collapse(2) schedule(dynamic) nowait
             for (int top = -border ; top < height; top += ts - border2)
                 for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(buffer, 0, buffersize);
+                    memset(bufferThr, 0, buffersize);
                     const int vblock = ((top + border) / (ts - border2)) + 1;
                     const int hblock = ((left + border) / (ts - border2)) + 1;
                     const int bottom = min(top + ts, height + border);
@@ -741,7 +753,6 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                                     processpasstwo = false;
                                 }
                             }
-
                 }
 
                 //fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
@@ -756,7 +767,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
             for (int top = -border; top < height; top += ts - border2)
                 for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(buffer, 0, buffersize);
+                    memset(bufferThr, 0, buffersize);
                     float lblockshifts[2][2];
                     const int vblock = ((top + border) / (ts - border2)) + 1;
                     const int hblock = ((left + border) / (ts - border2)) + 1;
@@ -929,6 +940,25 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         lblockshifts[1][1] = 2 * hfrac * cablue;
                     } else {
                         //CA auto correction; use CA diagnostic pass to set shift parameters
+                        if (fitParamsIn) {
+                            for (int rr = 3; rr < rr1 - 3; rr++) {
+                                for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
+                                    int c = FC(rr, cc);
+
+                                    if (c != 1) {
+                                        //compute directional weights using image gradients
+                                        float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
+                                        float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
+                                        float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
+                                        float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
+
+                                        //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                        rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
+                                    }
+                                }
+                            }
+                        }
+
                         lblockshifts[0][0] = lblockshifts[0][1] = 0;
                         lblockshifts[1][0] = lblockshifts[1][1] = 0;
                         double powVblock = 1.0;
@@ -1153,12 +1183,25 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         }
 
         // clean up
+        free(bufferThr);
+    }
+
+    if(autoCA && fitParamsTransfer && fitParamsOut) {
+        int index = 0;
+        for(int c = 0; c < 2; ++c) {
+            for(int d = 0; d < 2; ++d) {
+                for(int e = 0; e < 16; ++e) {
+                    fitParamsTransfer[index++] = fitparams[c][d][e];
+                }
+            }
+        }
+    }
+
+    if(freeBuffer) {
         free(buffer);
     }
 
-    free(Gtmp);
     free(blockwt);
-    free(RawDataTmp);
 
     if(plistener) {
         plistener->setProgress(1.0);
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 6afde5b67..2020302f5 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2015,9 +2015,14 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
             plistener->setProgress (0.0);
         }
         if(numFrames == 4) {
-            for(int i=0; i<4; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i]);
+            StopWatch Stop1("ps ca correction");
+            double fitParams[64];
+            float *buffer = nullptr;
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, false, buffer, false);
+            for(int i = 1; i < 3; ++i) {
+                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, false, false, buffer, false);
             }
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, false, false, buffer);
         } else {
             CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, rawData);
         }
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 04b7396bc..98b8a0602 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -245,7 +245,7 @@ protected:
     inline  void interpolate_row_rb     (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
     inline  void interpolate_row_rb_mul_pp (float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
 
-    void CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData);
+    void CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float *buffer = nullptr, bool freeBuffer = true);
     void ddct8x8s(int isgn, float a[8][8]);
     void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation
 

From 68fabf0be5c7066c9391663b0dc2f53f441b69e3 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 19 May 2018 13:47:44 +0200
Subject: [PATCH 3/6] Fix for last commit: set fitParamsOut for frame 0 and
 fitParamsIn for frames 1-3 so the auto-CA fit is actually reused

---
 rtengine/rawimagesource.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 2020302f5..38fccda85 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2018,11 +2018,11 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
             StopWatch Stop1("ps ca correction");
             double fitParams[64];
             float *buffer = nullptr;
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, false, buffer, false);
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, buffer, false);
             for(int i = 1; i < 3; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, false, false, buffer, false);
+                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, false, false, buffer);
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer);
         } else {
             CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, rawData);
         }

From ac78dd311e5ee97d0cc6d1cc26e29fe1a1d64648 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 20 May 2018 13:59:50 +0200
Subject: [PATCH 4/6] Speedup for ca-correction of pixelshift files, also fixed
 a memory leak

---
 rtengine/CA_correct_RT.cc  | 161 +++++++++++++++++++------------------
 rtengine/rawimagesource.cc |   3 +-
 rtengine/rawimagesource.h  |   2 +-
 3 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 3712eea22..b1f696f5f 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -111,7 +111,7 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 using namespace std;
 using namespace rtengine;
 
-void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
+float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
 {
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
@@ -124,7 +124,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         for(int j = 0; j < 2; j++)
             if(FC(i, j) == 3) {
                 printf("CA correction supports only RGB Colour filter arrays\n");
-                return;
+                return buffer;
             }
 
     volatile double progress = 0.0;
@@ -135,19 +135,6 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
     // local variables
     const int width = W + (W & 1), height = H;
-
-    //temporary array to store simple interpolation of G
-    if (!buffer) {
-        buffer = (float (*)) malloc ((height * width) / 2 * sizeof (float) + (height * width) * sizeof(float) / 2);
-    }
-    float *Gtmp = buffer;
-    float *RawDataTmp = buffer + (height * width) / 2;
-
-    float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
-
-    // Because we can't break parallel processing, we need a switch do handle the errors
-    bool processpasstwo = true;
-
     constexpr int border = 8;
     constexpr int border2 = 16;
 
@@ -156,13 +143,27 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1);
     const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1);
 
+    //temporary array to store simple interpolation of G
+    if (!buffer) {
+        buffer = static_cast<float*>(malloc ((height * width + vblsz * hblsz * (2 * 2 + 1)) * sizeof(float)));
+    }
+    float *Gtmp = buffer;
+    float *RawDataTmp = buffer + (height * width) / 2;
+
     //block CA shift values and weight assigned to block
-    float* const blockwt = static_cast<float*>(calloc(vblsz * hblsz * (2 * 2 + 1), sizeof(float)));
+    float *const blockwt = buffer + (height * width);
+    memset(blockwt, 0, vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
     float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);
 
+    float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
+
+    // Because we can't break parallel processing, we need a switch do handle the errors
+    bool processpasstwo = true;
+
     double fitparams[2][2][16];
     const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
     if(autoCA && fitParamsSet) {
+        // use stored parameters
         int index = 0;
         for(int c = 0; c < 2; ++c) {
             for(int d = 0; d < 2; ++d) {
@@ -186,22 +187,18 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
         int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
 
-        //local quadratic fit to shift data within a tile
-        float   coeff[2][3][2];
-        //measured CA shift parameters for a tile
-        float   CAshift[2][2];
         //polynomial fit coefficients
         //residual CA shift amount within a plaquette
         float   shifthfrac[3], shiftvfrac[3];
-        //per thread data for evaluation of block CA shift variance
-        float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
 
         // assign working space
         constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
-        char *bufferThr = (char *) malloc(buffersize);
-        char *data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+        constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
+        char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);
 
-        // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
+        char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+
+        // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache
 
         //rgb data in a tile
         float* rgb[3];
@@ -209,25 +206,29 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
         rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
 
-        //high pass filter for R/B in vertical direction
-        float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
-        //high pass filter for R/B in horizontal direction
-        float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
-        //low pass filter for R/B in horizontal direction
-        float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
-        //low pass filter for R/B in vertical direction
-        float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
-        //low pass filter for colour differences in horizontal direction
-        float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
-        //low pass filter for colour differences in vertical direction
-        float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
-        float *grbdiff = rbhpfh; // there is no overlap in buffer usage => share
-        //green interpolated to optical sample points for R/B
-        float *gshift  = rbhpfv; // there is no overlap in buffer usage => share
-
-
         if (autoCA && !fitParamsSet) {
+            //high pass filter for R/B in vertical direction
+            float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
+            //high pass filter for R/B in horizontal direction
+            float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
+            //low pass filter for R/B in horizontal direction
+            float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
+            //low pass filter for R/B in vertical direction
+            float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
+            //low pass filter for colour differences in horizontal direction
+            float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
+            //low pass filter for colour differences in vertical direction
+            float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
             // Main algorithm: Tile loop calculating correction parameters per tile
+
+            //local quadratic fit to shift data within a tile
+            float coeff[2][3][2];
+            //measured CA shift parameters for a tile
+            float CAshift[2][2];
+
+            //per thread data for evaluation of block CA shift variance
+            float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
+
             #pragma omp for collapse(2) schedule(dynamic) nowait
             for (int top = -border ; top < height; top += ts - border2)
                 for (int left = -border; left < width - (W & 1); left += ts - border2) {
@@ -763,11 +764,14 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
 
         // Main algorithm: Tile loop
         if(processpasstwo) {
+            float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
+            //green interpolated to optical sample points for R/B
+            float *gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
             #pragma omp for schedule(dynamic) collapse(2) nowait
 
             for (int top = -border; top < height; top += ts - border2)
                 for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(bufferThr, 0, buffersize);
+                    memset(bufferThr, 0, buffersizePassTwo);
                     float lblockshifts[2][2];
                     const int vblock = ((top + border) / (ts - border2)) + 1;
                     const int hblock = ((left + border) / (ts - border2)) + 1;
@@ -913,25 +917,42 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                     //end of border fill
                     // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-                    if (!autoCA) {
+                    if (!autoCA || fitParamsIn) {
+#ifdef __SSE2__
+                        const vfloat onev = F2V(1.f);
+                        const vfloat epsv = F2V(eps);
+#endif
+
                         //manual CA correction; use red/blue slider values to set CA shift parameters
-                        for (int rr = 3; rr < rr1 - 3; rr++)
-                            for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
-                                int c = FC(rr, cc);
-
-                                if (c != 1) {
-                                    //compute directional weights using image gradients
-                                    float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                    float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                    float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
-                                    float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
-
-                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                                    rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
-                                }
+                        for (int rr = 3; rr < rr1 - 3; rr++) {
+                            int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
+#ifdef __SSE2__
+                            for (; cc < cc1 - 10; cc += 8, indx += 8) {
+                                //compute directional weights using image gradients
+                                vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
+                                vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
+                                vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
+                                vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
+                                vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
+                                vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
 
+                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
                             }
+#endif
+                            for (; cc < cc1 - 3; cc += 2, indx += 2) {
+                                //compute directional weights using image gradients
+                                float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
+                                float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
+                                float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
+                                float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
 
+                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
+                            }
+                        }
+                    }
+                    if (!autoCA) {
                         float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
                         float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
                         lblockshifts[0][0] = 2 * vfrac * cared;
@@ -940,32 +961,12 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                         lblockshifts[1][1] = 2 * hfrac * cablue;
                     } else {
                         //CA auto correction; use CA diagnostic pass to set shift parameters
-                        if (fitParamsIn) {
-                            for (int rr = 3; rr < rr1 - 3; rr++) {
-                                for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
-                                    int c = FC(rr, cc);
-
-                                    if (c != 1) {
-                                        //compute directional weights using image gradients
-                                        float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                        float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                        float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
-                                        float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
-
-                                        //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                                        rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
-                                    }
-                                }
-                            }
-                        }
-
                         lblockshifts[0][0] = lblockshifts[0][1] = 0;
                         lblockshifts[1][0] = lblockshifts[1][1] = 0;
                         double powVblock = 1.0;
                         for (int i = 0; i < polyord; i++) {
                             double powHblock = powVblock;
                             for (int j = 0; j < polyord; j++) {
-                                //printf("i= %d j= %d polycoeff= %f \n",i,j,fitparams[0][0][polyord*i+j]);
                                 lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
                                 lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
                                 lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
@@ -1187,6 +1188,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
     }
 
     if(autoCA && fitParamsTransfer && fitParamsOut) {
+        // store calculated parameters
         int index = 0;
         for(int c = 0; c < 2; ++c) {
             for(int d = 0; d < 2; ++d) {
@@ -1201,9 +1203,8 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
         free(buffer);
     }
 
-    free(blockwt);
-
     if(plistener) {
         plistener->setProgress(1.0);
     }
+    return buffer;
 }
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 38fccda85..ffcbbf188 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2017,8 +2017,7 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         if(numFrames == 4) {
             StopWatch Stop1("ps ca correction");
             double fitParams[64];
-            float *buffer = nullptr;
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, buffer, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {
                 CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 98b8a0602..66800b204 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -245,7 +245,7 @@ protected:
     inline  void interpolate_row_rb     (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
     inline  void interpolate_row_rb_mul_pp (float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
 
-    void CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float *buffer = nullptr, bool freeBuffer = true);
+    float* CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float * buffer = nullptr, bool freeBuffer = true);
     void ddct8x8s(int isgn, float a[8][8]);
     void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation
 

From 2d3148b9622d4fc0f7a6f387a1c428ce290e91c7 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 20 May 2018 23:42:42 +0200
Subject: [PATCH 5/6] Fixes some things @Floessie suggested in code review (null the freed buffer before returning it, pass all CA_correct_RT arguments explicitly)

---
 rtengine/CA_correct_RT.cc  | 1 +
 rtengine/rawimagesource.cc | 4 ++--
 rtengine/rawimagesource.h  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index b1f696f5f..a2f34584f 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1201,6 +1201,7 @@ float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, cons
 
     if(freeBuffer) {
         free(buffer);
+        buffer = nullptr;
     }
 
     if(plistener) {
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index ffcbbf188..755420817 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2021,9 +2021,9 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
             for(int i = 1; i < 3; ++i) {
                 CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer);
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer, true);
         } else {
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, rawData);
+            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, rawData, nullptr, false, false, nullptr, true);
         }
     }
 
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 66800b204..0512af790 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -245,7 +245,7 @@ protected:
     inline  void interpolate_row_rb     (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
     inline  void interpolate_row_rb_mul_pp (float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
 
-    float* CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float * buffer = nullptr, bool freeBuffer = true);
+    float* CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float * buffer, bool freeBuffer);
     void ddct8x8s(int isgn, float a[8][8]);
     void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation
 

From 7d49895769734d7a9885620c36b14803805260eb Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 22 May 2018 14:38:32 +0200
Subject: [PATCH 6/6] Removed Stopwatches

---
 rtengine/pixelshift.cc     | 2 +-
 rtengine/rawimagesource.cc | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/rtengine/pixelshift.cc b/rtengine/pixelshift.cc
index 5aac37d20..488538e8a 100644
--- a/rtengine/pixelshift.cc
+++ b/rtengine/pixelshift.cc
@@ -26,7 +26,7 @@
 #include "procparams.h"
 #include "gauss.h"
 #include "median.h"
-#define BENCHMARK
+//#define BENCHMARK
 #include "StopWatch.h"
 namespace
 {
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 755420817..6f119a2ad 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -39,7 +39,6 @@
 #include <omp.h>
 #endif
 #include "opthelper.h"
-#include "StopWatch.h"
 #define clipretinex( val, minv, maxv )    (( val = (val < minv ? minv : val ) ) > maxv ? maxv : val )
 #undef CLIPD
 #define CLIPD(a) ((a)>0.0f?((a)<1.0f?(a):1.0f):0.0f)
@@ -2015,7 +2014,6 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
             plistener->setProgress (0.0);
         }
         if(numFrames == 4) {
-            StopWatch Stop1("ps ca correction");
             double fitParams[64];
             float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {