From 9ecc7e687659b86941c817a1f46c09b39dfbffa0 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 5 Sep 2018 13:30:15 +0200
Subject: [PATCH 01/22] iterative raw auto ca correction, #4774

---
 rtdata/languages/default   |    2 +
 rtengine/CA_correct_RT.cc  | 1903 ++++++++++++++++++------------------
 rtengine/procparams.cc     |    4 +
 rtengine/procparams.h      |    1 +
 rtengine/rawimagesource.cc |    8 +-
 rtengine/rawimagesource.h  |    2 +-
 rtgui/paramsedited.cc      |    8 +-
 rtgui/paramsedited.h       |    1 +
 rtgui/partialpastedlg.cc   |    1 +
 rtgui/rawcacorrection.cc   |   27 +-
 rtgui/rawcacorrection.h    |    3 +
 11 files changed, 1002 insertions(+), 958 deletions(-)
diff --git a/rtdata/languages/default b/rtdata/languages/default
index b1fafc48e..05498985f 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -748,6 +748,7 @@ HISTORY_MSG_PREPROCESS_LINEDENOISE_DIRECTION;Line noise filter direction
 HISTORY_MSG_PREPROCESS_PDAFLINESFILTER;PDAF lines filter
 HISTORY_MSG_PRSHARPEN_CONTRAST;PRS - Contrast threshold
 HISTORY_MSG_RAW_BORDER;Raw border
+HISTORY_MSG_RAWCACORR_AUTOIT;Raw CA Correction - Iterations
 HISTORY_MSG_RESIZE_ALLOWUPSCALING;Resize - Allow upscaling
 HISTORY_MSG_SHARPENING_CONTRAST;Sharpening - Contrast threshold
 HISTORY_MSG_SOFTLIGHT_ENABLED;Soft light
@@ -1780,6 +1781,7 @@ TP_PREPROCESS_PDAFLINESFILTER_TOOLTIP;Tries to suppress stripe noise caused by o
 TP_PRSHARPENING_LABEL;Post-Resize Sharpening
 TP_PRSHARPENING_TOOLTIP;Sharpens the image after resizing. Only works when the "Lanczos" resizing method is used. It is impossible to preview the effects of this tool. See RawPedia for usage instructions.
 TP_RAWCACORR_AUTO;Auto-correction
+TP_RAWCACORR_AUTOIT;Iterations
 TP_RAWCACORR_CABLUE;Blue
 TP_RAWCACORR_CARED;Red
 TP_RAWCACORR_CASTR;Strength
diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index a2f34584f..35e36ce05 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -111,7 +111,7 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 using namespace std;
 using namespace rtengine;
 
-float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
+float* RawImageSource::CA_correct_RT(const bool autoCA, const size_t autoIterations, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
 {
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
@@ -155,1038 +155,1039 @@ float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, cons
     memset(blockwt, 0, vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
     float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);
 
-    float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
-
     // Because we can't break parallel processing, we need a switch do handle the errors
     bool processpasstwo = true;
-
     double fitparams[2][2][16];
-    const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
-    if(autoCA && fitParamsSet) {
-        // use stored parameters
-        int index = 0;
-        for(int c = 0; c < 2; ++c) {
-            for(int d = 0; d < 2; ++d) {
-                for(int e = 0; e < 16; ++e) {
-                    fitparams[c][d][e] = fitParamsTransfer[index++];
+
+    const size_t iterations = autoCA ? std::max(autoIterations, static_cast<size_t>(1)) : 1;
+    for (size_t it = 0; it < iterations && processpasstwo; ++it) {
+        float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
+        const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
+        if(autoCA && fitParamsSet && iterations < 2) {
+            // use stored parameters
+            int index = 0;
+            for(int c = 0; c < 2; ++c) {
+                for(int d = 0; d < 2; ++d) {
+                    for(int e = 0; e < 16; ++e) {
+                        fitparams[c][d][e] = fitParamsTransfer[index++];
+                    }
                 }
             }
         }
-    }
-    //order of 2d polynomial fit (polyord), and numpar=polyord^2
-    int polyord = 4, numpar = 16;
+        //order of 2d polynomial fit (polyord), and numpar=polyord^2
+        int polyord = 4, numpar = 16;
 
-    constexpr float eps = 1e-5f, eps2 = 1e-10f; //tolerance to avoid dividing by zero
+        constexpr float eps = 1e-5f, eps2 = 1e-10f; //tolerance to avoid dividing by zero
 
-    #pragma omp parallel
-    {
-        int progresscounter = 0;
+        #pragma omp parallel
+        {
+            int progresscounter = 0;
 
-        //direction of the CA shift in a tile
-        int GRBdir[2][3];
+            //direction of the CA shift in a tile
+            int GRBdir[2][3];
 
-        int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
+            int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
 
-        //polynomial fit coefficients
-        //residual CA shift amount within a plaquette
-        float   shifthfrac[3], shiftvfrac[3];
+            //polynomial fit coefficients
+            //residual CA shift amount within a plaquette
+            float   shifthfrac[3], shiftvfrac[3];
 
-        // assign working space
-        constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
-        constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
-        char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);
+            // assign working space
+            constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
+            constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
+            char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);
 
-        char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+            char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
 
-        // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache
+            // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache
 
-        //rgb data in a tile
-        float* rgb[3];
-        rgb[0]         = (float (*)) data;
-        rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
-        rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
+            //rgb data in a tile
+            float* rgb[3];
+            rgb[0]         = (float (*)) data;
+            rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
+            rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
 
-        if (autoCA && !fitParamsSet) {
-            //high pass filter for R/B in vertical direction
-            float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
-            //high pass filter for R/B in horizontal direction
-            float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
-            //low pass filter for R/B in horizontal direction
-            float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
-            //low pass filter for R/B in vertical direction
-            float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
-            //low pass filter for colour differences in horizontal direction
-            float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
-            //low pass filter for colour differences in vertical direction
-            float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
-            // Main algorithm: Tile loop calculating correction parameters per tile
+            if (autoCA && !fitParamsSet) {
+                //high pass filter for R/B in vertical direction
+                float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
+                //high pass filter for R/B in horizontal direction
+                float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
+                //low pass filter for R/B in horizontal direction
+                float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
+                //low pass filter for R/B in vertical direction
+                float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
+                //low pass filter for colour differences in horizontal direction
+                float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
+                //low pass filter for colour differences in vertical direction
+                float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
+                // Main algorithm: Tile loop calculating correction parameters per tile
 
-            //local quadratic fit to shift data within a tile
-            float coeff[2][3][2];
-            //measured CA shift parameters for a tile
-            float CAshift[2][2];
+                //local quadratic fit to shift data within a tile
+                float coeff[2][3][2];
+                //measured CA shift parameters for a tile
+                float CAshift[2][2];
 
-            //per thread data for evaluation of block CA shift variance
-            float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
+                //per thread data for evaluation of block CA shift variance
+                float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
 
-            #pragma omp for collapse(2) schedule(dynamic) nowait
-            for (int top = -border ; top < height; top += ts - border2)
-                for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(bufferThr, 0, buffersize);
-                    const int vblock = ((top + border) / (ts - border2)) + 1;
-                    const int hblock = ((left + border) / (ts - border2)) + 1;
-                    const int bottom = min(top + ts, height + border);
-                    const int right  = min(left + ts, width - (W & 1) + border);
-                    const int rr1 = bottom - top;
-                    const int cc1 = right - left;
-                    const int rrmin = top < 0 ? border : 0;
-                    const int rrmax = bottom > height ? height - top : rr1;
-                    const int ccmin = left < 0 ? border : 0;
-                    const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;
+                #pragma omp for collapse(2) schedule(dynamic) nowait
+                for (int top = -border ; top < height; top += ts - border2)
+                    for (int left = -border; left < width - (W & 1); left += ts - border2) {
+                        memset(bufferThr, 0, buffersize);
+                        const int vblock = ((top + border) / (ts - border2)) + 1;
+                        const int hblock = ((left + border) / (ts - border2)) + 1;
+                        const int bottom = min(top + ts, height + border);
+                        const int right  = min(left + ts, width - (W & 1) + border);
+                        const int rr1 = bottom - top;
+                        const int cc1 = right - left;
+                        const int rrmin = top < 0 ? border : 0;
+                        const int rrmax = bottom > height ? height - top : rr1;
+                        const int ccmin = left < 0 ? border : 0;
+                        const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;
 
-                    // rgb from input CFA data
-                    // rgb values should be floating point numbers between 0 and 1
-                    // after white balance multipliers are applied
+                        // rgb from input CFA data
+                        // rgb values should be floating point numbers between 0 and 1
+                        // after white balance multipliers are applied
 
-#ifdef __SSE2__
-                    vfloat c65535v = F2V(65535.f);
-#endif
+    #ifdef __SSE2__
+                        vfloat c65535v = F2V(65535.f);
+    #endif
 
-                    for (int rr = rrmin; rr < rrmax; rr++) {
-                        int row = rr + top;
-                        int cc = ccmin;
-                        int col = cc + left;
-#ifdef __SSE2__
-                        int c0 = FC(rr, cc);
-                        if(c0 == 1) {
-                            rgb[c0][rr * ts + cc] = rawData[row][col] / 65535.f;
-                            cc++;
-                            col++;
-                            c0 = FC(rr, cc);
-                        }
-                        int indx1 = rr * ts + cc;
-                        for (; cc < ccmax - 7; cc+=8, col+=8, indx1 += 8) {
-                            vfloat val1 = LVFU(rawData[row][col]) / c65535v;
-                            vfloat val2 = LVFU(rawData[row][col + 4]) / c65535v;
-                            vfloat nonGreenv = _mm_shuffle_ps(val1,val2,_MM_SHUFFLE( 2,0,2,0 ));
-                            STVFU(rgb[c0][indx1 >> 1], nonGreenv);
-                            STVFU(rgb[1][indx1], val1);
-                            STVFU(rgb[1][indx1 + 4], val2);
-                        }
-#endif
-                        for (; cc < ccmax; cc++, col++) {
-                            int c = FC(rr, cc);
+                        for (int rr = rrmin; rr < rrmax; rr++) {
+                            int row = rr + top;
+                            int cc = ccmin;
+                            int col = cc + left;
+    #ifdef __SSE2__
+                            int c0 = FC(rr, cc);
+                            if(c0 == 1) {
+                                rgb[c0][rr * ts + cc] = rawData[row][col] / 65535.f;
+                                cc++;
+                                col++;
+                                c0 = FC(rr, cc);
+                            }
                             int indx1 = rr * ts + cc;
-                            rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
-                        }
-                    }
-
-                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                    //fill borders
-                    if (rrmin > 0) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = ccmin; cc < ccmax; cc++) {
+                            for (; cc < ccmax - 7; cc+=8, col+=8, indx1 += 8) {
+                                vfloat val1 = LVFU(rawData[row][col]) / c65535v;
+                                vfloat val2 = LVFU(rawData[row][col + 4]) / c65535v;
+                                vfloat nonGreenv = _mm_shuffle_ps(val1,val2,_MM_SHUFFLE( 2,0,2,0 ));
+                                STVFU(rgb[c0][indx1 >> 1], nonGreenv);
+                                STVFU(rgb[1][indx1], val1);
+                                STVFU(rgb[1][indx1 + 4], val2);
+                            }
+    #endif
+                            for (; cc < ccmax; cc++, col++) {
                                 int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
-                            }
-                    }
-
-                    if (rrmax < rr1) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = ccmin; cc < ccmax; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][left + cc] / 65535.f;
-                            }
-                    }
-
-                    if (ccmin > 0) {
-                        for (int rr = rrmin; rr < rrmax; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
-                            }
-                    }
-
-                    if (ccmax < cc1) {
-                        for (int rr = rrmin; rr < rrmax; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(top + rr)][(width - cc - 2)] / 65535.f;
-                            }
-                    }
-
-                    //also, fill the image corners
-                    if (rrmin > 0 && ccmin > 0) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rawData[border2 - rr][border2 - cc] / 65535.f;
-                            }
-                    }
-
-                    if (rrmax < rr1 && ccmax < cc1) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(width - cc - 2)] / 65535.f;
-                            }
-                    }
-
-                    if (rrmin > 0 && ccmax < cc1) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(border2 - rr)][(width - cc - 2)] / 65535.f;
-                            }
-                    }
-
-                    if (rrmax < rr1 && ccmin > 0) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(border2 - cc)] / 65535.f;
-                            }
-                    }
-
-                    //end of border fill
-                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                    //end of initialization
-
-
-#ifdef __SSE2__
-                    vfloat onev = F2V(1.f);
-                    vfloat epsv = F2V(eps);
-#endif
-                    for (int rr = 3; rr < rr1 - 3; rr++) {
-                        int row = rr + top;
-                        int cc = 3 + (FC(rr,3) & 1);
-                        int indx = rr * ts + cc;
-                        int c = FC(rr,cc);
-#ifdef __SSE2__
-                        for (; cc < cc1 - 9; cc+=8, indx+=8) {
-                            //compute directional weights using image gradients
-                            vfloat rgb1mv1v = LC2VFU(rgb[1][indx - v1]);
-                            vfloat rgb1pv1v = LC2VFU(rgb[1][indx + v1]);
-                            vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
-                            vfloat temp1v = epsv + vabsf(rgb1mv1v - rgb1pv1v);
-                            vfloat wtuv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx - v2) >> 1])) + vabsf(rgb1mv1v - LC2VFU(rgb[1][indx - v3])));
-                            vfloat wtdv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx + v2) >> 1])) + vabsf(rgb1pv1v - LC2VFU(rgb[1][indx + v3])));
-                            vfloat rgb1m1v = LC2VFU(rgb[1][indx - 1]);
-                            vfloat rgb1p1v = LC2VFU(rgb[1][indx + 1]);
-                            vfloat temp2v = epsv + vabsf(rgb1m1v - rgb1p1v);
-                            vfloat wtlv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(rgb1m1v - LC2VFU(rgb[1][indx - 3])));
-                            vfloat wtrv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(rgb1p1v - LC2VFU(rgb[1][indx + 3])));
-
-                            //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                            STC2VFU(rgb[1][indx], (wtuv * rgb1mv1v + wtdv * rgb1pv1v + wtlv * rgb1m1v + wtrv * rgb1p1v) / (wtuv + wtdv + wtlv + wtrv));
-                        }
-
-#endif
-                        for (; cc < cc1 - 3; cc+=2, indx+=2) {
-                            //compute directional weights using image gradients
-                            float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
-                            float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + v2) >> 1]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
-                            float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - 2) >> 1]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
-                            float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + 2) >> 1]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
-
-                            //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                            rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
-                        }
-
-                        if (row > -1 && row < height) {
-                            int offset = (FC(row,max(left + 3, 0)) & 1);
-                            int col = max(left + 3, 0) + offset;
-                            int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
-#ifdef __SSE2__
-                            for(; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
-                                STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
-                            }
-#endif
-                            for(; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
-                                Gtmp[(row * width + col) >> 1] = rgb[1][indx];
+                                int indx1 = rr * ts + cc;
+                                rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
                             }
                         }
 
-                    }
-                    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-#ifdef __SSE2__
-                    vfloat zd25v = F2V(0.25f);
-#endif
-                    for (int rr = 4; rr < rr1 - 4; rr++) {
-                        int cc = 4 + (FC(rr, 2) & 1);
-                        int indx = rr * ts + cc;
-                        int c = FC(rr, cc);
-#ifdef __SSE2__
-                        for (; cc < cc1 - 10; cc += 8, indx += 8) {
-                            vfloat rgb1v = LC2VFU(rgb[1][indx]);
-                            vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
-                            vfloat rgb1mv4 = LC2VFU(rgb[1][indx - v4]);
-                            vfloat rgb1pv4 = LC2VFU(rgb[1][indx + v4]);
-                            vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1pv4 - LVFU(rgb[c][(indx + v4) >> 1]))) +
-                                                  vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1v + rgbcv) -
-                                                  vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1pv4 + LVFU(rgb[c][(indx + v4) >> 1])));
-                            STVFU(rbhpfv[indx >> 1], temp1v);
-                            vfloat rgb1m4 = LC2VFU(rgb[1][indx - 4]);
-                            vfloat rgb1p4 = LC2VFU(rgb[1][indx + 4]);
-                            vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1p4 - LVFU(rgb[c][(indx + 4) >> 1]))) +
-                                                  vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1v + rgbcv) -
-                                                  vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1p4 + LVFU(rgb[c][(indx + 4) >> 1])));
-                            STVFU(rbhpfh[indx >> 1], temp2v);
-
-                            //low and high pass 1D filters of G in vertical/horizontal directions
-                            rgb1v = vmul2f(rgb1v);
-                            vfloat glpfvv = (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
-                            vfloat glpfhv = (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
-                            rgbcv = vmul2f(rgbcv);
-                            STVFU(rblpfv[indx >> 1], zd25v * vabsf(glpfvv - (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
-                            STVFU(rblpfh[indx >> 1], zd25v * vabsf(glpfhv - (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
-                            STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
-                            STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
-                        }
-
-#endif
-                        for (; cc < cc1 - 4; cc += 2, indx += 2) {
-                            rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
-                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
-                                                      fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])));
-                            rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])) +
-                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
-                                                      fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])));
-
-                            //low and high pass 1D filters of G in vertical/horizontal directions
-                            float glpfv = (2.f * rgb[1][indx] + rgb[1][indx + v2] + rgb[1][indx - v2]);
-                            float glpfh = (2.f * rgb[1][indx] + rgb[1][indx + 2] + rgb[1][indx - 2]);
-                            rblpfv[indx >> 1] = 0.25f * fabsf(glpfv - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
-                            rblpfh[indx >> 1] = 0.25f * fabsf(glpfh - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
-                            grblpfv[indx >> 1] = 0.25f * (glpfv + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
-                            grblpfh[indx >> 1] = 0.25f * (glpfh + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
-                        }
-                    }
-
-                    for (int dir = 0; dir < 2; dir++) {
-                        for (int k = 0; k < 3; k++) {
-                            for (int c = 0; c < 2; c++) {
-                                coeff[dir][k][c] = 0;
-                            }
-                        }
-                    }
-
-#ifdef __SSE2__
-                    vfloat zd3v = F2V(0.3f);
-                    vfloat zd1v = F2V(0.1f);
-                    vfloat zd5v = F2V(0.5f);
-#endif
-
-                    // along line segments, find the point along each segment that minimizes the colour variance
-                    // averaged over the tile; evaluate for up/down and left/right away from R/B grid point
-                    for (int rr = 8; rr < rr1 - 8; rr++) {
-                        int cc = 8 + (FC(rr, 2) & 1);
-                        int indx = rr * ts + cc;
-                        int c = FC(rr, cc);
-#ifdef __SSE2__
-                        vfloat coeff00v = ZEROV;
-                        vfloat coeff01v = ZEROV;
-                        vfloat coeff02v = ZEROV;
-                        vfloat coeff10v = ZEROV;
-                        vfloat coeff11v = ZEROV;
-                        vfloat coeff12v = ZEROV;
-                        for (; cc < cc1 - 14; cc += 8, indx += 8) {
-
-                            //in linear interpolation, colour differences are a quadratic function of interpolation position;
-                            //solve for the interpolation position that minimizes colour difference variance over the tile
-
-                            //vertical
-                            vfloat temp1 = zd3v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts - 1]));
-                            vfloat temp2 = zd3v * (LC2VFU(rgb[1][indx - ts + 1]) - LC2VFU(rgb[1][indx + ts - 1]));
-                            vfloat gdiffvv = (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + (temp1 - temp2);
-                            vfloat deltgrbv = LVFU(rgb[c][indx >> 1]) - LC2VFU(rgb[1][indx]);
-
-                            vfloat gradwtvv = (LVFU(rbhpfv[indx >> 1]) + zd5v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1]))) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
-
-                            coeff00v += gradwtvv * deltgrbv * deltgrbv;
-                            coeff01v += gradwtvv * gdiffvv * deltgrbv;
-                            coeff02v += gradwtvv * gdiffvv * gdiffvv;
-
-                            //horizontal
-                            vfloat gdiffhv = (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + (temp1 + temp2);
-
-                            vfloat gradwthv = (LVFU(rbhpfh[indx >> 1]) + zd5v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1]))) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
-
-                            coeff10v += gradwthv * deltgrbv * deltgrbv;
-                            coeff11v += gradwthv * gdiffhv * deltgrbv;
-                            coeff12v += gradwthv * gdiffhv * gdiffhv;
-                        }
-
-                        coeff[0][0][c>>1] += vhadd(coeff00v);
-                        coeff[0][1][c>>1] += vhadd(coeff01v);
-                        coeff[0][2][c>>1] += vhadd(coeff02v);
-                        coeff[1][0][c>>1] += vhadd(coeff10v);
-                        coeff[1][1][c>>1] += vhadd(coeff11v);
-                        coeff[1][2][c>>1] += vhadd(coeff12v);
-
-#endif
-                        for (; cc < cc1 - 8; cc += 2, indx += 2) {
-
-                            //in linear interpolation, colour differences are a quadratic function of interpolation position;
-                            //solve for the interpolation position that minimizes colour difference variance over the tile
-
-                            //vertical
-                            float gdiff = (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.3f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
-                            float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);
-
-                            float gradwt = (rbhpfv[indx >> 1] + 0.5f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1]) ) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);
-
-                            coeff[0][0][c>>1] += gradwt * deltgrb * deltgrb;
-                            coeff[0][1][c>>1] += gradwt * gdiff * deltgrb;
-                            coeff[0][2][c>>1] += gradwt * gdiff * gdiff;
-
-                            //horizontal
-                            gdiff = (rgb[1][indx + 1] - rgb[1][indx - 1]) + 0.3f * (rgb[1][indx + 1 + ts] - rgb[1][indx - 1 + ts] + rgb[1][indx + 1 - ts] - rgb[1][indx - 1 - ts]);
-
-                            gradwt = (rbhpfh[indx >> 1] + 0.5f * (rbhpfh[(indx >> 1) + v1] + rbhpfh[(indx >> 1) - v1]) ) * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) / (eps + 0.1f * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) + rblpfh[(indx >> 1) - 1] + rblpfh[(indx >> 1) + 1]);
-
-                            coeff[1][0][c>>1] += gradwt * deltgrb * deltgrb;
-                            coeff[1][1][c>>1] += gradwt * gdiff * deltgrb;
-                            coeff[1][2][c>>1] += gradwt * gdiff * gdiff;
-
-                            //  In Mathematica,
-                            //  f[x_]=Expand[Total[Flatten[
-                            //  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
-                            //  extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
-                        }
-                    }
-
-                    for (int dir = 0; dir < 2; dir++) {
-                        for (int k = 0; k < 3; k++) {
-                            for (int c = 0; c < 2; c++) {
-                                coeff[dir][k][c] *= 0.25f;
-                                if(k == 1) {
-                                    coeff[dir][k][c] *= 0.3125f;
-                                } else if(k == 2) {
-                                    coeff[dir][k][c] *= SQR(0.3125f);
+                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                        //fill borders
+                        if (rrmin > 0) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = ccmin; cc < ccmax; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                 }
-                            }
                         }
-                    }
 
-                    for (int c = 0; c < 2; c++) {
-                        for (int dir = 0; dir < 2; dir++) { // vert/hor
-
-                            // CAshift[dir][c] are the locations
-                            // that minimize colour difference variances;
-                            // This is the approximate _optical_ location of the R/B pixels
-                            if (coeff[dir][2][c] > eps2) {
-                                CAshift[dir][c] = coeff[dir][1][c] / coeff[dir][2][c];
-                                blockwt[vblock * hblsz + hblock] = coeff[dir][2][c] / (eps + coeff[dir][0][c]) ;
-                            } else {
-                                CAshift[dir][c] = 17.0;
-                                blockwt[vblock * hblsz + hblock] = 0;
-                            }
-
-                            //data structure = CAshift[vert/hor][colour]
-                            //dir : 0=vert, 1=hor
-
-                            //offset gives NW corner of square containing the min; dir : 0=vert, 1=hor
-                            if (fabsf(CAshift[dir][c]) < 2.0f) {
-                                blockavethr[dir][c] += CAshift[dir][c];
-                                blocksqavethr[dir][c] += SQR(CAshift[dir][c]);
-                                blockdenomthr[dir][c] += 1;
-                            }
-                            //evaluate the shifts to the location that minimizes CA within the tile
-                            blockshifts[vblock * hblsz + hblock][c][dir] = CAshift[dir][c]; //vert/hor CA shift for R/B
-
-                        }//vert/hor
-                    }//colour
-
-                    if(plistener) {
-                        progresscounter++;
-
-                        if(progresscounter % 8 == 0)
-                            #pragma omp critical (cadetectpass1)
-                        {
-                            progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
-
-                            if (progress > 1.0) {
-                                progress = 1.0;
-                            }
-
-                            plistener->setProgress(progress);
+                        if (rrmax < rr1) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = ccmin; cc < ccmax; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][left + cc] / 65535.f;
+                                }
                         }
-                    }
 
-                }
-
-            //end of diagnostic pass
-            #pragma omp critical (cadetectpass2)
-            {
-                for (int dir = 0; dir < 2; dir++)
-                    for (int c = 0; c < 2; c++) {
-                        blockdenom[dir][c] += blockdenomthr[dir][c];
-                        blocksqave[dir][c] += blocksqavethr[dir][c];
-                        blockave[dir][c]   += blockavethr[dir][c];
-                    }
-            }
-            #pragma omp barrier
-
-            #pragma omp single
-            {
-                for (int dir = 0; dir < 2; dir++)
-                    for (int c = 0; c < 2; c++) {
-                        if (blockdenom[dir][c]) {
-                            blockvar[dir][c] = blocksqave[dir][c] / blockdenom[dir][c] - SQR(blockave[dir][c] / blockdenom[dir][c]);
-                        } else {
-                            processpasstwo = false;
-                            printf ("blockdenom vanishes \n");
-                            break;
+                        if (ccmin > 0) {
+                            for (int rr = rrmin; rr < rrmax; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
+                                }
                         }
-                    }
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-                //now prepare for CA correction pass
-                //first, fill border blocks of blockshift array
-                if(processpasstwo) {
-                    for (int vblock = 1; vblock < vblsz - 1; vblock++) { //left and right sides
-                        for (int c = 0; c < 2; c++) {
-                            for (int i = 0; i < 2; i++) {
-                                blockshifts[vblock * hblsz][c][i] = blockshifts[(vblock) * hblsz + 2][c][i];
-                                blockshifts[vblock * hblsz + hblsz - 1][c][i] = blockshifts[(vblock) * hblsz + hblsz - 3][c][i];
-                            }
+                        if (ccmax < cc1) {
+                            for (int rr = rrmin; rr < rrmax; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(top + rr)][(width - cc - 2)] / 65535.f;
+                                }
                         }
-                    }
 
-                    for (int hblock = 0; hblock < hblsz; hblock++) { //top and bottom sides
-                        for (int c = 0; c < 2; c++) {
-                            for (int i = 0; i < 2; i++) {
-                                blockshifts[hblock][c][i] = blockshifts[2 * hblsz + hblock][c][i];
-                                blockshifts[(vblsz - 1)*hblsz + hblock][c][i] = blockshifts[(vblsz - 3) * hblsz + hblock][c][i];
-                            }
+                        //also, fill the image corners
+                        if (rrmin > 0 && ccmin > 0) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rawData[border2 - rr][border2 - cc] / 65535.f;
+                                }
                         }
-                    }
 
-                    //end of filling border pixels of blockshift array
-
-                    //initialize fit arrays
-                    double polymat[2][2][256], shiftmat[2][2][16];
-
-                    for (int i = 0; i < 256; i++) {
-                        polymat[0][0][i] = polymat[0][1][i] = polymat[1][0][i] = polymat[1][1][i] = 0;
-                    }
-
-                    for (int i = 0; i < 16; i++) {
-                        shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[1][0][i] = shiftmat[1][1][i] = 0;
-                    }
-
-                    int numblox[2] = {0, 0};
-
-                    for (int vblock = 1; vblock < vblsz - 1; vblock++)
-                        for (int hblock = 1; hblock < hblsz - 1; hblock++) {
-                            // block 3x3 median of blockshifts for robustness
-                            for (int c = 0; c < 2; c ++) {
-                                float bstemp[2];
-                                for (int dir = 0; dir < 2; dir++) {
-                                    //temporary storage for median filter
-                                    const std::array<float, 9> p = {
-                                        blockshifts[(vblock - 1) * hblsz + hblock - 1][c][dir],
-                                        blockshifts[(vblock - 1) * hblsz + hblock][c][dir],
-                                        blockshifts[(vblock - 1) * hblsz + hblock + 1][c][dir],
-                                        blockshifts[(vblock) * hblsz + hblock - 1][c][dir],
-                                        blockshifts[(vblock) * hblsz + hblock][c][dir],
-                                        blockshifts[(vblock) * hblsz + hblock + 1][c][dir],
-                                        blockshifts[(vblock + 1) * hblsz + hblock - 1][c][dir],
-                                        blockshifts[(vblock + 1) * hblsz + hblock][c][dir],
-                                        blockshifts[(vblock + 1) * hblsz + hblock + 1][c][dir]
-                                    };
-                                    bstemp[dir] = median(p);
+                        if (rrmax < rr1 && ccmax < cc1) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(width - cc - 2)] / 65535.f;
                                 }
-
-                                //now prepare coefficient matrix; use only data points within caautostrength/2 std devs of zero
-                                if (SQR(bstemp[0]) > caautostrength * blockvar[0][c] || SQR(bstemp[1]) > caautostrength * blockvar[1][c]) {
-                                    continue;
-                                }
-
-                                numblox[c]++;
-
-                                for (int dir = 0; dir < 2; dir++) {
-                                    double powVblockInit = 1.0;
-                                    for (int i = 0; i < polyord; i++) {
-                                        double powHblockInit = 1.0;
-                                        for (int j = 0; j < polyord; j++) {
-                                            double powVblock = powVblockInit;
-                                            for (int m = 0; m < polyord; m++) {
-                                                double powHblock = powHblockInit;
-                                                for (int n = 0; n < polyord; n++) {
-                                                    polymat[c][dir][numpar * (polyord * i + j) + (polyord * m + n)] += powVblock * powHblock * blockwt[vblock * hblsz + hblock];
-                                                    powHblock *= hblock;
-                                                }
-                                                powVblock *= vblock;
-                                            }
-                                            shiftmat[c][dir][(polyord * i + j)] += powVblockInit * powHblockInit * bstemp[dir] * blockwt[vblock * hblsz + hblock];
-                                            powHblockInit *= hblock;
-                                        }
-                                        powVblockInit *= vblock;
-                                    }//monomials
-                                }//dir
-                            }//c
-                        }//blocks
-
-                    numblox[1] = min(numblox[0], numblox[1]);
-
-                    //if too few data points, restrict the order of the fit to linear
-                    if (numblox[1] < 32) {
-                        polyord = 2;
-                        numpar = 4;
-
-                        if (numblox[1] < 10) {
-
-                            printf ("numblox = %d \n", numblox[1]);
-                            processpasstwo = false;
                         }
-                    }
 
-                    if(processpasstwo)
-
-                        //fit parameters to blockshifts
-                        for (int c = 0; c < 2; c++)
-                            for (int dir = 0; dir < 2; dir++) {
-                                if (!LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir])) {
-                                    printf("CA correction pass failed -- can't solve linear equations for colour %d direction %d...\n", c, dir);
-                                    processpasstwo = false;
+                        if (rrmin > 0 && ccmax < cc1) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(border2 - rr)][(width - cc - 2)] / 65535.f;
                                 }
-                            }
-                }
-
-                //fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
-            }
-            //end of initialization for CA correction pass
-            //only executed if autoCA is true
-        }
-
-        // Main algorithm: Tile loop
-        if(processpasstwo) {
-            float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
-            //green interpolated to optical sample points for R/B
-            float *gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
-            #pragma omp for schedule(dynamic) collapse(2) nowait
-
-            for (int top = -border; top < height; top += ts - border2)
-                for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(bufferThr, 0, buffersizePassTwo);
-                    float lblockshifts[2][2];
-                    const int vblock = ((top + border) / (ts - border2)) + 1;
-                    const int hblock = ((left + border) / (ts - border2)) + 1;
-                    const int bottom = min(top + ts, height + border);
-                    const int right  = min(left + ts, width - (W & 1) + border);
-                    const int rr1 = bottom - top;
-                    const int cc1 = right - left;
-
-                    const int rrmin = top < 0 ? border : 0;
-                    const int rrmax = bottom > height ? height - top : rr1;
-                    const int ccmin = left < 0 ? border : 0;
-                    const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;
-
-                    // rgb from input CFA data
-                    // rgb values should be floating point number between 0 and 1
-                    // after white balance multipliers are applied
-
-#ifdef __SSE2__
-                    vfloat c65535v = F2V(65535.f);
-                    vmask gmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
-#endif
-                    for (int rr = rrmin; rr < rrmax; rr++) {
-                        int row = rr + top;
-                        int cc = ccmin;
-                        int col = cc + left;
-                        int indx = row * width + col;
-                        int indx1 = rr * ts + cc;
-#ifdef __SSE2__
-                        int c = FC(rr, cc);
-                        if(c & 1) {
-                            rgb[1][indx1] = rawData[row][col] / 65535.f;
-                            indx++;
-                            indx1++;
-                            cc++;
-                            col++;
-                            c = FC(rr, cc);
                         }
-                        for (; cc < ccmax - 7; cc += 8, col += 8, indx += 8, indx1 += 8) {
-                            vfloat val1v = LVFU(rawData[row][col]) / c65535v;
-                            vfloat val2v = LVFU(rawData[row][col + 4]) / c65535v;
-                            STVFU(rgb[c][indx1 >> 1], _mm_shuffle_ps(val1v, val2v, _MM_SHUFFLE(2, 0, 2, 0)));
-                            vfloat gtmpv = LVFU(Gtmp[indx >> 1]);
-                            STVFU(rgb[1][indx1], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(1, 1, 0, 0)), val1v));
-                            STVFU(rgb[1][indx1 + 4], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(3, 3, 2, 2)), val2v));
+
+                        if (rrmax < rr1 && ccmin > 0) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(border2 - cc)] / 65535.f;
+                                }
                         }
-#endif
-                        for (; cc < ccmax; cc++, col++, indx++, indx1++) {
-                            int c = FC(rr, cc);
-                            rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
 
-                            if ((c & 1) == 0) {
-                                rgb[1][indx1] = Gtmp[indx >> 1];
-                            }
-                        }
-                    }
-                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                    //fill borders
-                    if (rrmin > 0) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = ccmin; cc < ccmax; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
-                                rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
-                            }
-                    }
+                        //end of border fill
+                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                        //end of initialization
 
-                    if (rrmax < rr1) {
-                        for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
-                            for (int cc = ccmin; cc < ccmax; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + left + cc) >> 1];
-                                }
-                            }
-                    }
 
-                    if (ccmin > 0) {
-                        for (int rr = rrmin; rr < rrmax; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
-                                rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
-                            }
-                    }
-
-                    if (ccmax < cc1) {
-                        for (int rr = rrmin; rr < rrmax; rr++)
-                            for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][rr * ts + ccmax + cc] = Gtmp[((top + rr) * width + (width - cc - 2)) >> 1];
-                                }
-                            }
-                    }
-
-                    //also, fill the image corners
-                    if (rrmin > 0 && ccmin > 0) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][rr * ts + cc] = Gtmp[((border2 - rr) * width + border2 - cc) >> 1];
-                                }
-                            }
-                    }
-
-                    if (rrmax < rr1 && ccmax < cc1) {
-                        for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
-                            for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[((height - rr - 2) * width + (width - cc - 2)) >> 1];
-                                }
-                            }
-                    }
-
-                    if (rrmin > 0 && ccmax < cc1) {
-                        for (int rr = 0; rr < border; rr++)
-                            for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][rr * ts + ccmax + cc] = Gtmp[((border2 - rr) * width + (width - cc - 2)) >> 1];
-                                }
-                            }
-                    }
-
-                    if (rrmax < rr1 && ccmin > 0) {
-                        for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
-                            for (int cc = 0; cc < border; cc++) {
-                                int c = FC(rr, cc);
-                                rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.f;
-                                if ((c & 1) == 0) {
-                                    rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + (border2 - cc)) >> 1];
-                                }
-                            }
-                    }
-
-                    //end of border fill
-                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-                    if (!autoCA || fitParamsIn) {
-#ifdef __SSE2__
-                        const vfloat onev = F2V(1.f);
-                        const vfloat epsv = F2V(eps);
-#endif
-
-                        //manual CA correction; use red/blue slider values to set CA shift parameters
+    #ifdef __SSE2__
+                        vfloat onev = F2V(1.f);
+                        vfloat epsv = F2V(eps);
+    #endif
                         for (int rr = 3; rr < rr1 - 3; rr++) {
-                            int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
-#ifdef __SSE2__
-                            for (; cc < cc1 - 10; cc += 8, indx += 8) {
+                            int row = rr + top;
+                            int cc = 3 + (FC(rr,3) & 1);
+                            int indx = rr * ts + cc;
+                            int c = FC(rr,cc);
+    #ifdef __SSE2__
+                            for (; cc < cc1 - 9; cc+=8, indx+=8) {
                                 //compute directional weights using image gradients
-                                vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
-                                vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
-                                vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
-                                vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
-                                vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
-                                vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
+                                vfloat rgb1mv1v = LC2VFU(rgb[1][indx - v1]);
+                                vfloat rgb1pv1v = LC2VFU(rgb[1][indx + v1]);
+                                vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
+                                vfloat temp1v = epsv + vabsf(rgb1mv1v - rgb1pv1v);
+                                vfloat wtuv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx - v2) >> 1])) + vabsf(rgb1mv1v - LC2VFU(rgb[1][indx - v3])));
+                                vfloat wtdv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx + v2) >> 1])) + vabsf(rgb1pv1v - LC2VFU(rgb[1][indx + v3])));
+                                vfloat rgb1m1v = LC2VFU(rgb[1][indx - 1]);
+                                vfloat rgb1p1v = LC2VFU(rgb[1][indx + 1]);
+                                vfloat temp2v = epsv + vabsf(rgb1m1v - rgb1p1v);
+                                vfloat wtlv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(rgb1m1v - LC2VFU(rgb[1][indx - 3])));
+                                vfloat wtrv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(rgb1p1v - LC2VFU(rgb[1][indx + 3])));
 
                                 //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                                STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
+                                STC2VFU(rgb[1][indx], (wtuv * rgb1mv1v + wtdv * rgb1pv1v + wtlv * rgb1m1v + wtrv * rgb1p1v) / (wtuv + wtdv + wtlv + wtrv));
                             }
-#endif
-                            for (; cc < cc1 - 3; cc += 2, indx += 2) {
+
+    #endif
+                            for (; cc < cc1 - 3; cc+=2, indx+=2) {
                                 //compute directional weights using image gradients
-                                float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
-                                float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
+                                float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
+                                float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + v2) >> 1]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
+                                float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - 2) >> 1]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
+                                float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + 2) >> 1]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
 
                                 //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                 rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
                             }
-                        }
-                    }
-                    if (!autoCA) {
-                        float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
-                        float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
-                        lblockshifts[0][0] = 2 * vfrac * cared;
-                        lblockshifts[0][1] = 2 * hfrac * cared;
-                        lblockshifts[1][0] = 2 * vfrac * cablue;
-                        lblockshifts[1][1] = 2 * hfrac * cablue;
-                    } else {
-                        //CA auto correction; use CA diagnostic pass to set shift parameters
-                        lblockshifts[0][0] = lblockshifts[0][1] = 0;
-                        lblockshifts[1][0] = lblockshifts[1][1] = 0;
-                        double powVblock = 1.0;
-                        for (int i = 0; i < polyord; i++) {
-                            double powHblock = powVblock;
-                            for (int j = 0; j < polyord; j++) {
-                                lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
-                                lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
-                                lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
-                                lblockshifts[1][1] += powHblock * fitparams[1][1][polyord * i + j];
-                                powHblock *= hblock;
-                            }
-                            powVblock *= vblock;
-                        }
-                        constexpr float bslim = 3.99; //max allowed CA shift
-                        lblockshifts[0][0] = LIM(lblockshifts[0][0], -bslim, bslim);
-                        lblockshifts[0][1] = LIM(lblockshifts[0][1], -bslim, bslim);
-                        lblockshifts[1][0] = LIM(lblockshifts[1][0], -bslim, bslim);
-                        lblockshifts[1][1] = LIM(lblockshifts[1][1], -bslim, bslim);
-                    }//end of setting CA shift parameters
 
-
-                    for (int c = 0; c < 3; c += 2) {
-
-                        //some parameters for the bilinear interpolation
-                        shiftvfloor[c] = floor((float)lblockshifts[c>>1][0]);
-                        shiftvceil[c] = ceil((float)lblockshifts[c>>1][0]);
-                        shiftvfrac[c] = lblockshifts[c>>1][0] - shiftvfloor[c];
-
-                        shifthfloor[c] = floor((float)lblockshifts[c>>1][1]);
-                        shifthceil[c] = ceil((float)lblockshifts[c>>1][1]);
-                        shifthfrac[c] = lblockshifts[c>>1][1] - shifthfloor[c];
-
-                        GRBdir[0][c] = lblockshifts[c>>1][0] > 0 ? 2 : -2;
-                        GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;
-
-                    }
-
-
-                    for (int rr = 4; rr < rr1 - 4; rr++) {
-                        int cc = 4 + (FC(rr, 2) & 1);
-                        int c = FC(rr, cc);
-                        int indx = (rr * ts + cc) >> 1;
-                        int indxfc = (rr + shiftvfloor[c]) * ts + cc + shifthceil[c];
-                        int indxff = (rr + shiftvfloor[c]) * ts + cc + shifthfloor[c];
-                        int indxcc = (rr + shiftvceil[c]) * ts + cc + shifthceil[c];
-                        int indxcf = (rr + shiftvceil[c]) * ts + cc + shifthfloor[c];
-#ifdef __SSE2__
-                        vfloat shifthfracv = F2V(shifthfrac[c]);
-                        vfloat shiftvfracv = F2V(shiftvfrac[c]);
-                        for (; cc < cc1 - 10; cc += 8, indxfc += 8, indxff += 8, indxcc += 8, indxcf += 8, indx += 4) {
-                            //perform CA correction using colour ratios or colour differences
-                            vfloat Ginthfloorv = vintpf(shifthfracv, LC2VFU(rgb[1][indxfc]), LC2VFU(rgb[1][indxff]));
-                            vfloat Ginthceilv = vintpf(shifthfracv, LC2VFU(rgb[1][indxcc]), LC2VFU(rgb[1][indxcf]));
-                            //Gint is bilinear interpolation of G at CA shift point
-                            vfloat Gintv = vintpf(shiftvfracv, Ginthceilv, Ginthfloorv);
-
-                            //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
-                            //but first we need to interpolate G-R/G-B to grid points...
-                            STVFU(grbdiff[indx], Gintv - LVFU(rgb[c][indx]));
-                            STVFU(gshift[indx], Gintv);
-                        }
-
-#endif
-                        for (; cc < cc1 - 4; cc += 2, indxfc += 2, indxff += 2, indxcc += 2, indxcf += 2, ++indx) {
-                            //perform CA correction using colour ratios or colour differences
-                            float Ginthfloor = intp(shifthfrac[c], rgb[1][indxfc], rgb[1][indxff]);
-                            float Ginthceil = intp(shifthfrac[c], rgb[1][indxcc], rgb[1][indxcf]);
-                            //Gint is bilinear interpolation of G at CA shift point
-                            float Gint = intp(shiftvfrac[c], Ginthceil, Ginthfloor);
-
-                            //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
-                            //but first we need to interpolate G-R/G-B to grid points...
-                            grbdiff[indx] = Gint - rgb[c][indx];
-                            gshift[indx] = Gint;
-                        }
-                    }
-
-                    shifthfrac[0] /= 2.f;
-                    shifthfrac[2] /= 2.f;
-                    shiftvfrac[0] /= 2.f;
-                    shiftvfrac[2] /= 2.f;
-
-#ifdef __SSE2__
-                    vfloat zd25v = F2V(0.25f);
-                    vfloat onev = F2V(1.f);
-                    vfloat zd5v = F2V(0.5f);
-                    vfloat epsv = F2V(eps);
-#endif
-                    for (int rr = 8; rr < rr1 - 8; rr++) {
-                        int cc = 8 + (FC(rr, 2) & 1);
-                        int c = FC(rr, cc);
-                        int GRBdir0 = GRBdir[0][c];
-                        int GRBdir1 = GRBdir[1][c];
-#ifdef __SSE2__
-                        vfloat shifthfracc = F2V(shifthfrac[c]);
-                        vfloat shiftvfracc = F2V(shiftvfrac[c]);
-                        for (int indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
-                            //interpolate colour difference from optical R/B locations to grid locations
-                            vfloat grbdiffinthfloor = vintpf(shifthfracc, LVFU(grbdiff[(indx - GRBdir1) >> 1]), LVFU(grbdiff[indx >> 1]));
-                            vfloat grbdiffinthceil = vintpf(shifthfracc, LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]), LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]));
-                            //grbdiffint is bilinear interpolation of G-R/G-B at grid point
-                            vfloat grbdiffint = vintpf(shiftvfracc, grbdiffinthceil, grbdiffinthfloor);
-
-                            //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
-                            vfloat cinv = LVFU(rgb[c][indx >> 1]);
-                            vfloat rinv = LC2VFU(rgb[1][indx]);
-                            vfloat RBint = rinv - grbdiffint;
-                            vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
-                            if(_mm_movemask_ps((vfloat)cmask)) {
-                                // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
-                                //gradient weights using difference from G at CA shift points and G at grid points
-                                vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
-                                vfloat p1 = onev / (epsv + vabsf(rinv - LVFU(gshift[(indx - GRBdir1) >> 1])));
-                                vfloat p2 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc) >> 1])));
-                                vfloat p3 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])));
-
-                                grbdiffint = vself(cmask, (p0 * LVFU(grbdiff[indx >> 1]) + p1 * LVFU(grbdiff[(indx - GRBdir1) >> 1]) +
-                                              p2 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]) + p3 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])) / (p0 + p1 + p2 + p3), grbdiffint);
-
-                            }
-                            vfloat grbdiffold = rinv - cinv;
-                            RBint = rinv - grbdiffint;
-                            RBint = vself(vmaskf_gt(vabsf(grbdiffold), vabsf(grbdiffint)), RBint, cinv);
-                            RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
-                            STVFU(rgb[c][indx >> 1], RBint);
-                        }
-#endif
-                        for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
-                            float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];
-
-                            //interpolate colour difference from optical R/B locations to grid locations
-                            float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir1) >> 1], grbdiff[indx >> 1]);
-                            float grbdiffinthceil = intp(shifthfrac[c], grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1], grbdiff[((rr - GRBdir0) * ts + cc) >> 1]);
-                            //grbdiffint is bilinear interpolation of G-R/G-B at grid point
-                            float grbdiffint = intp(shiftvfrac[c], grbdiffinthceil, grbdiffinthfloor);
-
-                            //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
-                            float RBint = rgb[1][indx] - grbdiffint;
-
-                            if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
-                                if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
-                                    rgb[c][indx >> 1] = RBint;
+                            if (row > -1 && row < height) {
+                                int offset = (FC(row,max(left + 3, 0)) & 1);
+                                int col = max(left + 3, 0) + offset;
+                                int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
+    #ifdef __SSE2__
+                                for(; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
+                                    STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
                                 }
+    #endif
+                                for(; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
+                                    Gtmp[(row * width + col) >> 1] = rgb[1][indx];
+                                }
+                            }
+
+                        }
+                        //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    #ifdef __SSE2__
+                        vfloat zd25v = F2V(0.25f);
+    #endif
+                        for (int rr = 4; rr < rr1 - 4; rr++) {
+                            int cc = 4 + (FC(rr, 2) & 1);
+                            int indx = rr * ts + cc;
+                            int c = FC(rr, cc);
+    #ifdef __SSE2__
+                            for (; cc < cc1 - 10; cc += 8, indx += 8) {
+                                vfloat rgb1v = LC2VFU(rgb[1][indx]);
+                                vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
+                                vfloat rgb1mv4 = LC2VFU(rgb[1][indx - v4]);
+                                vfloat rgb1pv4 = LC2VFU(rgb[1][indx + v4]);
+                                vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1pv4 - LVFU(rgb[c][(indx + v4) >> 1]))) +
+                                                      vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1v + rgbcv) -
+                                                      vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1pv4 + LVFU(rgb[c][(indx + v4) >> 1])));
+                                STVFU(rbhpfv[indx >> 1], temp1v);
+                                vfloat rgb1m4 = LC2VFU(rgb[1][indx - 4]);
+                                vfloat rgb1p4 = LC2VFU(rgb[1][indx + 4]);
+                                vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1p4 - LVFU(rgb[c][(indx + 4) >> 1]))) +
+                                                      vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1v + rgbcv) -
+                                                      vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1p4 + LVFU(rgb[c][(indx + 4) >> 1])));
+                                STVFU(rbhpfh[indx >> 1], temp2v);
+
+                                //low and high pass 1D filters of G in vertical/horizontal directions
+                                rgb1v = vmul2f(rgb1v);
+                                vfloat glpfvv = (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
+                                vfloat glpfhv = (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
+                                rgbcv = vmul2f(rgbcv);
+                                STVFU(rblpfv[indx >> 1], zd25v * vabsf(glpfvv - (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
+                                STVFU(rblpfh[indx >> 1], zd25v * vabsf(glpfhv - (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
+                                STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
+                                STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
+                            }
+
+    #endif
+                            for (; cc < cc1 - 4; cc += 2, indx += 2) {
+                                rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
+                                                          fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
+                                                          fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])));
+                                rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])) +
+                                                          fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
+                                                          fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])));
+
+                                //low and high pass 1D filters of G in vertical/horizontal directions
+                                float glpfv = (2.f * rgb[1][indx] + rgb[1][indx + v2] + rgb[1][indx - v2]);
+                                float glpfh = (2.f * rgb[1][indx] + rgb[1][indx + 2] + rgb[1][indx - 2]);
+                                rblpfv[indx >> 1] = 0.25f * fabsf(glpfv - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
+                                rblpfh[indx >> 1] = 0.25f * fabsf(glpfh - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
+                                grblpfv[indx >> 1] = 0.25f * (glpfv + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
+                                grblpfh[indx >> 1] = 0.25f * (glpfh + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
+                            }
+                        }
+
+                        for (int dir = 0; dir < 2; dir++) {
+                            for (int k = 0; k < 3; k++) {
+                                for (int c = 0; c < 2; c++) {
+                                    coeff[dir][k][c] = 0;
+                                }
+                            }
+                        }
+
+    #ifdef __SSE2__
+                        vfloat zd3v = F2V(0.3f);
+                        vfloat zd1v = F2V(0.1f);
+                        vfloat zd5v = F2V(0.5f);
+    #endif
+
+                        // along line segments, find the point along each segment that minimizes the colour variance
+                        // averaged over the tile; evaluate for up/down and left/right away from R/B grid point
+                        for (int rr = 8; rr < rr1 - 8; rr++) {
+                            int cc = 8 + (FC(rr, 2) & 1);
+                            int indx = rr * ts + cc;
+                            int c = FC(rr, cc);
+    #ifdef __SSE2__
+                            vfloat coeff00v = ZEROV;
+                            vfloat coeff01v = ZEROV;
+                            vfloat coeff02v = ZEROV;
+                            vfloat coeff10v = ZEROV;
+                            vfloat coeff11v = ZEROV;
+                            vfloat coeff12v = ZEROV;
+                            for (; cc < cc1 - 14; cc += 8, indx += 8) {
+
+                                //in linear interpolation, colour differences are a quadratic function of interpolation position;
+                                //solve for the interpolation position that minimizes colour difference variance over the tile
+
+                                //vertical
+                                vfloat temp1 = zd3v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts - 1]));
+                                vfloat temp2 = zd3v * (LC2VFU(rgb[1][indx - ts + 1]) - LC2VFU(rgb[1][indx + ts - 1]));
+                                vfloat gdiffvv = (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + (temp1 - temp2);
+                                vfloat deltgrbv = LVFU(rgb[c][indx >> 1]) - LC2VFU(rgb[1][indx]);
+
+                                vfloat gradwtvv = (LVFU(rbhpfv[indx >> 1]) + zd5v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1]))) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
+
+                                coeff00v += gradwtvv * deltgrbv * deltgrbv;
+                                coeff01v += gradwtvv * gdiffvv * deltgrbv;
+                                coeff02v += gradwtvv * gdiffvv * gdiffvv;
+
+                                //horizontal
+                                vfloat gdiffhv = (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + (temp1 + temp2);
+
+                                vfloat gradwthv = (LVFU(rbhpfh[indx >> 1]) + zd5v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1]))) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
+
+                                coeff10v += gradwthv * deltgrbv * deltgrbv;
+                                coeff11v += gradwthv * gdiffhv * deltgrbv;
+                                coeff12v += gradwthv * gdiffhv * gdiffhv;
+                            }
+
+                            coeff[0][0][c>>1] += vhadd(coeff00v);
+                            coeff[0][1][c>>1] += vhadd(coeff01v);
+                            coeff[0][2][c>>1] += vhadd(coeff02v);
+                            coeff[1][0][c>>1] += vhadd(coeff10v);
+                            coeff[1][1][c>>1] += vhadd(coeff11v);
+                            coeff[1][2][c>>1] += vhadd(coeff12v);
+
+    #endif
+                            for (; cc < cc1 - 8; cc += 2, indx += 2) {
+
+                                //in linear interpolation, colour differences are a quadratic function of interpolation position;
+                                //solve for the interpolation position that minimizes colour difference variance over the tile
+
+                                //vertical
+                                float gdiff = (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.3f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
+                                float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);
+
+                                float gradwt = (rbhpfv[indx >> 1] + 0.5f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1]) ) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);
+
+                                coeff[0][0][c>>1] += gradwt * deltgrb * deltgrb;
+                                coeff[0][1][c>>1] += gradwt * gdiff * deltgrb;
+                                coeff[0][2][c>>1] += gradwt * gdiff * gdiff;
+
+                                //horizontal
+                                gdiff = (rgb[1][indx + 1] - rgb[1][indx - 1]) + 0.3f * (rgb[1][indx + 1 + ts] - rgb[1][indx - 1 + ts] + rgb[1][indx + 1 - ts] - rgb[1][indx - 1 - ts]);
+
+                                gradwt = (rbhpfh[indx >> 1] + 0.5f * (rbhpfh[(indx >> 1) + v1] + rbhpfh[(indx >> 1) - v1]) ) * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) / (eps + 0.1f * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) + rblpfh[(indx >> 1) - 1] + rblpfh[(indx >> 1) + 1]);
+
+                                coeff[1][0][c>>1] += gradwt * deltgrb * deltgrb;
+                                coeff[1][1][c>>1] += gradwt * gdiff * deltgrb;
+                                coeff[1][2][c>>1] += gradwt * gdiff * gdiff;
+
+                                //  In Mathematica,
+                                //  f[x_]=Expand[Total[Flatten[
+                                //  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
+                                //  extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
+                            }
+                        }
+
+                        for (int dir = 0; dir < 2; dir++) {
+                            for (int k = 0; k < 3; k++) {
+                                for (int c = 0; c < 2; c++) {
+                                    coeff[dir][k][c] *= 0.25f;
+                                    if(k == 1) {
+                                        coeff[dir][k][c] *= 0.3125f;
+                                    } else if(k == 2) {
+                                        coeff[dir][k][c] *= SQR(0.3125f);
+                                    }
+                                }
+                            }
+                        }
+
+                        for (int c = 0; c < 2; c++) {
+                            for (int dir = 0; dir < 2; dir++) { // vert/hor
+
+                                // CAshift[dir][c] are the locations
+                                // that minimize colour difference variances;
+                                // This is the approximate _optical_ location of the R/B pixels
+                                if (coeff[dir][2][c] > eps2) {
+                                    CAshift[dir][c] = coeff[dir][1][c] / coeff[dir][2][c];
+                                    blockwt[vblock * hblsz + hblock] = coeff[dir][2][c] / (eps + coeff[dir][0][c]) ;
+                                } else {
+                                    CAshift[dir][c] = 17.0;
+                                    blockwt[vblock * hblsz + hblock] = 0;
+                                }
+
+                                //data structure = CAshift[vert/hor][colour]
+                                //dir : 0=vert, 1=hor
+
+                                //offset gives NW corner of square containing the min; dir : 0=vert, 1=hor
+                                if (fabsf(CAshift[dir][c]) < 2.0f) {
+                                    blockavethr[dir][c] += CAshift[dir][c];
+                                    blocksqavethr[dir][c] += SQR(CAshift[dir][c]);
+                                    blockdenomthr[dir][c] += 1;
+                                }
+                                //evaluate the shifts to the location that minimizes CA within the tile
+                                blockshifts[vblock * hblsz + hblock][c][dir] = CAshift[dir][c]; //vert/hor CA shift for R/B
+
+                            }//vert/hor
+                        }//colour
+
+                        if(plistener) {
+                            progresscounter++;
+
+                            if(progresscounter % 8 == 0)
+                                #pragma omp critical (cadetectpass1)
+                            {
+                                progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
+
+                                if (progress > 1.0) {
+                                    progress = 1.0;
+                                }
+
+                                plistener->setProgress(progress);
+                            }
+                        }
+
+                    }
+
+                //end of diagnostic pass
+                #pragma omp critical (cadetectpass2)
+                {
+                    for (int dir = 0; dir < 2; dir++)
+                        for (int c = 0; c < 2; c++) {
+                            blockdenom[dir][c] += blockdenomthr[dir][c];
+                            blocksqave[dir][c] += blocksqavethr[dir][c];
+                            blockave[dir][c]   += blockavethr[dir][c];
+                        }
+                }
+                #pragma omp barrier
+
+                #pragma omp single
+                {
+                    for (int dir = 0; dir < 2; dir++)
+                        for (int c = 0; c < 2; c++) {
+                            if (blockdenom[dir][c]) {
+                                blockvar[dir][c] = blocksqave[dir][c] / blockdenom[dir][c] - SQR(blockave[dir][c] / blockdenom[dir][c]);
                             } else {
+                                processpasstwo = false;
+                                printf ("blockdenom vanishes \n");
+                                break;
+                            }
+                        }
 
-                                //gradient weights using difference from G at CA shift points and G at grid points
-                                float p0 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[indx >> 1]));
-                                float p1 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[(indx - GRBdir1) >> 1]));
-                                float p2 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc) >> 1]));
-                                float p3 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]));
+                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-                                grbdiffint = (p0 * grbdiff[indx >> 1] + p1 * grbdiff[(indx - GRBdir1) >> 1] +
-                                              p2 * grbdiff[((rr - GRBdir0) * ts + cc) >> 1] + p3 * grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]) / (p0 + p1 + p2 + p3) ;
-
-                                //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
-                                if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
-                                    rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
+                    //now prepare for CA correction pass
+                    //first, fill border blocks of blockshift array
+                    if(processpasstwo) {
+                        for (int vblock = 1; vblock < vblsz - 1; vblock++) { //left and right sides
+                            for (int c = 0; c < 2; c++) {
+                                for (int i = 0; i < 2; i++) {
+                                    blockshifts[vblock * hblsz][c][i] = blockshifts[(vblock) * hblsz + 2][c][i];
+                                    blockshifts[vblock * hblsz + hblsz - 1][c][i] = blockshifts[(vblock) * hblsz + hblsz - 3][c][i];
                                 }
                             }
+                        }
 
-                            //if colour difference interpolation overshot the correction, just desaturate
-                            if (grbdiffold * grbdiffint < 0) {
-                                rgb[c][indx >> 1] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
+                        for (int hblock = 0; hblock < hblsz; hblock++) { //top and bottom sides
+                            for (int c = 0; c < 2; c++) {
+                                for (int i = 0; i < 2; i++) {
+                                    blockshifts[hblock][c][i] = blockshifts[2 * hblsz + hblock][c][i];
+                                    blockshifts[(vblsz - 1)*hblsz + hblock][c][i] = blockshifts[(vblsz - 3) * hblsz + hblock][c][i];
+                                }
                             }
                         }
-                    }
 
-                    // copy CA corrected results to temporary image matrix
-                    for (int rr = border; rr < rr1 - border; rr++) {
-                        int c = FC(rr + top, left + border + (FC(rr + top, 2) & 1));
-                        int row = rr + top;
-                        int cc = border + (FC(rr, 2) & 1);
-                        int indx = (row * width + cc + left) >> 1;
-                        int indx1 = (rr * ts + cc) >> 1;
-#ifdef __SSE2__
-                        for (; indx < (row * width + cc1 - border - 7 + left) >> 1; indx+=4, indx1 += 4) {
-                            STVFU(RawDataTmp[indx], c65535v * LVFU(rgb[c][indx1]));
+                        //end of filling border pixels of blockshift array
+
+                        //initialize fit arrays
+                        double polymat[2][2][256], shiftmat[2][2][16];
+
+                        for (int i = 0; i < 256; i++) {
+                            polymat[0][0][i] = polymat[0][1][i] = polymat[1][0][i] = polymat[1][1][i] = 0;
                         }
-#endif
-                        for (; indx < (row * width + cc1 - border + left) >> 1; indx++, indx1++) {
-                            RawDataTmp[indx] = 65535.f * rgb[c][indx1];
+
+                        for (int i = 0; i < 16; i++) {
+                            shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[1][0][i] = shiftmat[1][1][i] = 0;
                         }
-                    }
 
-                    if(plistener) {
-                        progresscounter++;
+                        int numblox[2] = {0, 0};
 
-                        if(progresscounter % 8 == 0)
-                            #pragma omp critical (cacorrect)
-                        {
-                            progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
+                        for (int vblock = 1; vblock < vblsz - 1; vblock++)
+                            for (int hblock = 1; hblock < hblsz - 1; hblock++) {
+                                // block 3x3 median of blockshifts for robustness
+                                for (int c = 0; c < 2; c ++) {
+                                    float bstemp[2];
+                                    for (int dir = 0; dir < 2; dir++) {
+                                        //temporary storage for median filter
+                                        const std::array<float, 9> p = {
+                                            blockshifts[(vblock - 1) * hblsz + hblock - 1][c][dir],
+                                            blockshifts[(vblock - 1) * hblsz + hblock][c][dir],
+                                            blockshifts[(vblock - 1) * hblsz + hblock + 1][c][dir],
+                                            blockshifts[(vblock) * hblsz + hblock - 1][c][dir],
+                                            blockshifts[(vblock) * hblsz + hblock][c][dir],
+                                            blockshifts[(vblock) * hblsz + hblock + 1][c][dir],
+                                            blockshifts[(vblock + 1) * hblsz + hblock - 1][c][dir],
+                                            blockshifts[(vblock + 1) * hblsz + hblock][c][dir],
+                                            blockshifts[(vblock + 1) * hblsz + hblock + 1][c][dir]
+                                        };
+                                        bstemp[dir] = median(p);
+                                    }
 
-                            if (progress > 1.0) {
-                                progress = 1.0;
+                                    //now prepare coefficient matrix; use only data points within caautostrength/2 std devs of zero
+                                    if (SQR(bstemp[0]) > caautostrength * blockvar[0][c] || SQR(bstemp[1]) > caautostrength * blockvar[1][c]) {
+                                        continue;
+                                    }
+
+                                    numblox[c]++;
+
+                                    for (int dir = 0; dir < 2; dir++) {
+                                        double powVblockInit = 1.0;
+                                        for (int i = 0; i < polyord; i++) {
+                                            double powHblockInit = 1.0;
+                                            for (int j = 0; j < polyord; j++) {
+                                                double powVblock = powVblockInit;
+                                                for (int m = 0; m < polyord; m++) {
+                                                    double powHblock = powHblockInit;
+                                                    for (int n = 0; n < polyord; n++) {
+                                                        polymat[c][dir][numpar * (polyord * i + j) + (polyord * m + n)] += powVblock * powHblock * blockwt[vblock * hblsz + hblock];
+                                                        powHblock *= hblock;
+                                                    }
+                                                    powVblock *= vblock;
+                                                }
+                                                shiftmat[c][dir][(polyord * i + j)] += powVblockInit * powHblockInit * bstemp[dir] * blockwt[vblock * hblsz + hblock];
+                                                powHblockInit *= hblock;
+                                            }
+                                            powVblockInit *= vblock;
+                                        }//monomials
+                                    }//dir
+                                }//c
+                            }//blocks
+
+                        numblox[1] = min(numblox[0], numblox[1]);
+
+                        //if too few data points, restrict the order of the fit to linear
+                        if (numblox[1] < 32) {
+                            polyord = 2;
+                            numpar = 4;
+
+                            if (numblox[1] < 10) {
+
+                                printf ("numblox = %d \n", numblox[1]);
+                                processpasstwo = false;
                             }
-
-                            plistener->setProgress(progress);
                         }
+
+                        if(processpasstwo)
+
+                            //fit parameters to blockshifts
+                            for (int c = 0; c < 2; c++)
+                                for (int dir = 0; dir < 2; dir++) {
+                                    if (!LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir])) {
+                                        printf("CA correction pass failed -- can't solve linear equations for colour %d direction %d...\n", c, dir);
+                                        processpasstwo = false;
+                                    }
+                                }
                     }
 
+                    //fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
                 }
-
-            #pragma omp barrier
-// copy temporary image matrix back to image matrix
-            #pragma omp for
-
-            for(int row = 0; row < height; row++) {
-                int col = FC(row, 0) & 1;
-                int indx = (row * width + col) >> 1;
-#ifdef __SSE2__
-                for(; col < width - 7; col += 8, indx += 4) {
-                    STC2VFU(rawData[row][col], LVFU(RawDataTmp[indx]));
-                }
-#endif
-                for(; col < width - (W & 1); col += 2, indx++) {
-                    rawData[row][col] = RawDataTmp[indx];
-                }
+                //end of initialization for CA correction pass
+                //only executed if autoCA is true
             }
 
+            // Main algorithm: Tile loop
+            if(processpasstwo) {
+                float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
+                //green interpolated to optical sample points for R/B
+                float *gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
+                #pragma omp for schedule(dynamic) collapse(2) nowait
+
+                for (int top = -border; top < height; top += ts - border2)
+                    for (int left = -border; left < width - (W & 1); left += ts - border2) {
+                        memset(bufferThr, 0, buffersizePassTwo);
+                        float lblockshifts[2][2];
+                        const int vblock = ((top + border) / (ts - border2)) + 1;
+                        const int hblock = ((left + border) / (ts - border2)) + 1;
+                        const int bottom = min(top + ts, height + border);
+                        const int right  = min(left + ts, width - (W & 1) + border);
+                        const int rr1 = bottom - top;
+                        const int cc1 = right - left;
+
+                        const int rrmin = top < 0 ? border : 0;
+                        const int rrmax = bottom > height ? height - top : rr1;
+                        const int ccmin = left < 0 ? border : 0;
+                        const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;
+
+                        // rgb from input CFA data
+                        // rgb values should be floating point number between 0 and 1
+                        // after white balance multipliers are applied
+
+    #ifdef __SSE2__
+                        vfloat c65535v = F2V(65535.f);
+                        vmask gmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+    #endif
+                        for (int rr = rrmin; rr < rrmax; rr++) {
+                            int row = rr + top;
+                            int cc = ccmin;
+                            int col = cc + left;
+                            int indx = row * width + col;
+                            int indx1 = rr * ts + cc;
+    #ifdef __SSE2__
+                            int c = FC(rr, cc);
+                            if(c & 1) {
+                                rgb[1][indx1] = rawData[row][col] / 65535.f;
+                                indx++;
+                                indx1++;
+                                cc++;
+                                col++;
+                                c = FC(rr, cc);
+                            }
+                            for (; cc < ccmax - 7; cc += 8, col += 8, indx += 8, indx1 += 8) {
+                                vfloat val1v = LVFU(rawData[row][col]) / c65535v;
+                                vfloat val2v = LVFU(rawData[row][col + 4]) / c65535v;
+                                STVFU(rgb[c][indx1 >> 1], _mm_shuffle_ps(val1v, val2v, _MM_SHUFFLE(2, 0, 2, 0)));
+                                vfloat gtmpv = LVFU(Gtmp[indx >> 1]);
+                                STVFU(rgb[1][indx1], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(1, 1, 0, 0)), val1v));
+                                STVFU(rgb[1][indx1 + 4], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(3, 3, 2, 2)), val2v));
+                            }
+    #endif
+                            for (; cc < ccmax; cc++, col++, indx++, indx1++) {
+                                int c = FC(rr, cc);
+                                rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
+
+                                if ((c & 1) == 0) {
+                                    rgb[1][indx1] = Gtmp[indx >> 1];
+                                }
+                            }
+                        }
+                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                        //fill borders
+                        if (rrmin > 0) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = ccmin; cc < ccmax; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
+                                    rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
+                                }
+                        }
+
+                        if (rrmax < rr1) {
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                                for (int cc = ccmin; cc < ccmax; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + left + cc) >> 1];
+                                    }
+                                }
+                        }
+
+                        if (ccmin > 0) {
+                            for (int rr = rrmin; rr < rrmax; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
+                                    rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
+                                }
+                        }
+
+                        if (ccmax < cc1) {
+                            for (int rr = rrmin; rr < rrmax; rr++)
+                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][rr * ts + ccmax + cc] = Gtmp[((top + rr) * width + (width - cc - 2)) >> 1];
+                                    }
+                                }
+                        }
+
+                        //also, fill the image corners
+                        if (rrmin > 0 && ccmin > 0) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][rr * ts + cc] = Gtmp[((border2 - rr) * width + border2 - cc) >> 1];
+                                    }
+                                }
+                        }
+
+                        if (rrmax < rr1 && ccmax < cc1) {
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[((height - rr - 2) * width + (width - cc - 2)) >> 1];
+                                    }
+                                }
+                        }
+
+                        if (rrmin > 0 && ccmax < cc1) {
+                            for (int rr = 0; rr < border; rr++)
+                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][rr * ts + ccmax + cc] = Gtmp[((border2 - rr) * width + (width - cc - 2)) >> 1];
+                                    }
+                                }
+                        }
+
+                        if (rrmax < rr1 && ccmin > 0) {
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                                for (int cc = 0; cc < border; cc++) {
+                                    int c = FC(rr, cc);
+                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.f;
+                                    if ((c & 1) == 0) {
+                                        rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + (border2 - cc)) >> 1];
+                                    }
+                                }
+                        }
+
+                        //end of border fill
+                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+                        if (!autoCA || fitParamsIn) {
+    #ifdef __SSE2__
+                            const vfloat onev = F2V(1.f);
+                            const vfloat epsv = F2V(eps);
+    #endif
+
+                            //manual CA correction; use red/blue slider values to set CA shift parameters
+                            for (int rr = 3; rr < rr1 - 3; rr++) {
+                                int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
+    #ifdef __SSE2__
+                                for (; cc < cc1 - 10; cc += 8, indx += 8) {
+                                    //compute directional weights using image gradients
+                                    vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
+                                    vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
+                                    vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
+                                    vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
+                                    vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
+                                    vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
+
+                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                    STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
+                                }
+    #endif
+                                for (; cc < cc1 - 3; cc += 2, indx += 2) {
+                                    //compute directional weights using image gradients
+                                    float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
+                                    float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
+                                    float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
+                                    float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
+
+                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                    rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
+                                }
+                            }
+                        }
+                        if (!autoCA) {
+                            float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
+                            float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
+                            lblockshifts[0][0] = 2 * vfrac * cared;
+                            lblockshifts[0][1] = 2 * hfrac * cared;
+                            lblockshifts[1][0] = 2 * vfrac * cablue;
+                            lblockshifts[1][1] = 2 * hfrac * cablue;
+                        } else {
+                            //CA auto correction; use CA diagnostic pass to set shift parameters
+                            lblockshifts[0][0] = lblockshifts[0][1] = 0;
+                            lblockshifts[1][0] = lblockshifts[1][1] = 0;
+                            double powVblock = 1.0;
+                            for (int i = 0; i < polyord; i++) {
+                                double powHblock = powVblock;
+                                for (int j = 0; j < polyord; j++) {
+                                    lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
+                                    lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
+                                    lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
+                                    lblockshifts[1][1] += powHblock * fitparams[1][1][polyord * i + j];
+                                    powHblock *= hblock;
+                                }
+                                powVblock *= vblock;
+                            }
+                            constexpr float bslim = 3.99; //max allowed CA shift
+                            lblockshifts[0][0] = LIM(lblockshifts[0][0], -bslim, bslim);
+                            lblockshifts[0][1] = LIM(lblockshifts[0][1], -bslim, bslim);
+                            lblockshifts[1][0] = LIM(lblockshifts[1][0], -bslim, bslim);
+                            lblockshifts[1][1] = LIM(lblockshifts[1][1], -bslim, bslim);
+                        }//end of setting CA shift parameters
+
+
+                        for (int c = 0; c < 3; c += 2) {
+
+                            //some parameters for the bilinear interpolation
+                            shiftvfloor[c] = floor((float)lblockshifts[c>>1][0]);
+                            shiftvceil[c] = ceil((float)lblockshifts[c>>1][0]);
+                            shiftvfrac[c] = lblockshifts[c>>1][0] - shiftvfloor[c];
+
+                            shifthfloor[c] = floor((float)lblockshifts[c>>1][1]);
+                            shifthceil[c] = ceil((float)lblockshifts[c>>1][1]);
+                            shifthfrac[c] = lblockshifts[c>>1][1] - shifthfloor[c];
+
+                            GRBdir[0][c] = lblockshifts[c>>1][0] > 0 ? 2 : -2;
+                            GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;
+
+                        }
+
+
+                        for (int rr = 4; rr < rr1 - 4; rr++) {
+                            int cc = 4 + (FC(rr, 2) & 1);
+                            int c = FC(rr, cc);
+                            int indx = (rr * ts + cc) >> 1;
+                            int indxfc = (rr + shiftvfloor[c]) * ts + cc + shifthceil[c];
+                            int indxff = (rr + shiftvfloor[c]) * ts + cc + shifthfloor[c];
+                            int indxcc = (rr + shiftvceil[c]) * ts + cc + shifthceil[c];
+                            int indxcf = (rr + shiftvceil[c]) * ts + cc + shifthfloor[c];
+    #ifdef __SSE2__
+                            vfloat shifthfracv = F2V(shifthfrac[c]);
+                            vfloat shiftvfracv = F2V(shiftvfrac[c]);
+                            for (; cc < cc1 - 10; cc += 8, indxfc += 8, indxff += 8, indxcc += 8, indxcf += 8, indx += 4) {
+                                //perform CA correction using colour ratios or colour differences
+                                vfloat Ginthfloorv = vintpf(shifthfracv, LC2VFU(rgb[1][indxfc]), LC2VFU(rgb[1][indxff]));
+                                vfloat Ginthceilv = vintpf(shifthfracv, LC2VFU(rgb[1][indxcc]), LC2VFU(rgb[1][indxcf]));
+                                //Gint is bilinear interpolation of G at CA shift point
+                                vfloat Gintv = vintpf(shiftvfracv, Ginthceilv, Ginthfloorv);
+
+                                //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
+                                //but first we need to interpolate G-R/G-B to grid points...
+                                STVFU(grbdiff[indx], Gintv - LVFU(rgb[c][indx]));
+                                STVFU(gshift[indx], Gintv);
+                            }
+
+    #endif
+                            for (; cc < cc1 - 4; cc += 2, indxfc += 2, indxff += 2, indxcc += 2, indxcf += 2, ++indx) {
+                                //perform CA correction using colour ratios or colour differences
+                                float Ginthfloor = intp(shifthfrac[c], rgb[1][indxfc], rgb[1][indxff]);
+                                float Ginthceil = intp(shifthfrac[c], rgb[1][indxcc], rgb[1][indxcf]);
+                                //Gint is bilinear interpolation of G at CA shift point
+                                float Gint = intp(shiftvfrac[c], Ginthceil, Ginthfloor);
+
+                                //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
+                                //but first we need to interpolate G-R/G-B to grid points...
+                                grbdiff[indx] = Gint - rgb[c][indx];
+                                gshift[indx] = Gint;
+                            }
+                        }
+
+                        shifthfrac[0] /= 2.f;
+                        shifthfrac[2] /= 2.f;
+                        shiftvfrac[0] /= 2.f;
+                        shiftvfrac[2] /= 2.f;
+
+    #ifdef __SSE2__
+                        vfloat zd25v = F2V(0.25f);
+                        vfloat onev = F2V(1.f);
+                        vfloat zd5v = F2V(0.5f);
+                        vfloat epsv = F2V(eps);
+    #endif
+                        for (int rr = 8; rr < rr1 - 8; rr++) {
+                            int cc = 8 + (FC(rr, 2) & 1);
+                            int c = FC(rr, cc);
+                            int GRBdir0 = GRBdir[0][c];
+                            int GRBdir1 = GRBdir[1][c];
+    #ifdef __SSE2__
+                            vfloat shifthfracc = F2V(shifthfrac[c]);
+                            vfloat shiftvfracc = F2V(shiftvfrac[c]);
+                            for (int indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
+                                //interpolate colour difference from optical R/B locations to grid locations
+                                vfloat grbdiffinthfloor = vintpf(shifthfracc, LVFU(grbdiff[(indx - GRBdir1) >> 1]), LVFU(grbdiff[indx >> 1]));
+                                vfloat grbdiffinthceil = vintpf(shifthfracc, LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]), LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]));
+                                //grbdiffint is bilinear interpolation of G-R/G-B at grid point
+                                vfloat grbdiffint = vintpf(shiftvfracc, grbdiffinthceil, grbdiffinthfloor);
+
+                                //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
+                                vfloat cinv = LVFU(rgb[c][indx >> 1]);
+                                vfloat rinv = LC2VFU(rgb[1][indx]);
+                                vfloat RBint = rinv - grbdiffint;
+                                vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
+                                if(_mm_movemask_ps((vfloat)cmask)) {
+                                    // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
+                                    //gradient weights using difference from G at CA shift points and G at grid points
+                                    vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
+                                    vfloat p1 = onev / (epsv + vabsf(rinv - LVFU(gshift[(indx - GRBdir1) >> 1])));
+                                    vfloat p2 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc) >> 1])));
+                                    vfloat p3 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])));
+
+                                    grbdiffint = vself(cmask, (p0 * LVFU(grbdiff[indx >> 1]) + p1 * LVFU(grbdiff[(indx - GRBdir1) >> 1]) +
+                                                  p2 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]) + p3 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])) / (p0 + p1 + p2 + p3), grbdiffint);
+
+                                }
+                                vfloat grbdiffold = rinv - cinv;
+                                RBint = rinv - grbdiffint;
+                                RBint = vself(vmaskf_gt(vabsf(grbdiffold), vabsf(grbdiffint)), RBint, cinv);
+                                RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
+                                STVFU(rgb[c][indx >> 1], RBint);
+                            }
+    #endif
+                            for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+                                float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];
+
+                                //interpolate colour difference from optical R/B locations to grid locations
+                                float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir1) >> 1], grbdiff[indx >> 1]);
+                                float grbdiffinthceil = intp(shifthfrac[c], grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1], grbdiff[((rr - GRBdir0) * ts + cc) >> 1]);
+                                //grbdiffint is bilinear interpolation of G-R/G-B at grid point
+                                float grbdiffint = intp(shiftvfrac[c], grbdiffinthceil, grbdiffinthfloor);
+
+                                //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
+                                float RBint = rgb[1][indx] - grbdiffint;
+
+                                if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
+                                    if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
+                                        rgb[c][indx >> 1] = RBint;
+                                    }
+                                } else {
+
+                                    //gradient weights using difference from G at CA shift points and G at grid points
+                                    float p0 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[indx >> 1]));
+                                    float p1 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[(indx - GRBdir1) >> 1]));
+                                    float p2 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc) >> 1]));
+                                    float p3 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]));
+
+                                    grbdiffint = (p0 * grbdiff[indx >> 1] + p1 * grbdiff[(indx - GRBdir1) >> 1] +
+                                                  p2 * grbdiff[((rr - GRBdir0) * ts + cc) >> 1] + p3 * grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]) / (p0 + p1 + p2 + p3) ;
+
+                                    //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
+                                    if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
+                                        rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
+                                    }
+                                }
+
+                                //if colour difference interpolation overshot the correction, just desaturate
+                                if (grbdiffold * grbdiffint < 0) {
+                                    rgb[c][indx >> 1] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
+                                }
+                            }
+                        }
+
+                        // copy CA corrected results to temporary image matrix
+                        for (int rr = border; rr < rr1 - border; rr++) {
+                            int c = FC(rr + top, left + border + (FC(rr + top, 2) & 1));
+                            int row = rr + top;
+                            int cc = border + (FC(rr, 2) & 1);
+                            int indx = (row * width + cc + left) >> 1;
+                            int indx1 = (rr * ts + cc) >> 1;
+    #ifdef __SSE2__
+                            for (; indx < (row * width + cc1 - border - 7 + left) >> 1; indx+=4, indx1 += 4) {
+                                STVFU(RawDataTmp[indx], c65535v * LVFU(rgb[c][indx1]));
+                            }
+    #endif
+                            for (; indx < (row * width + cc1 - border + left) >> 1; indx++, indx1++) {
+                                RawDataTmp[indx] = 65535.f * rgb[c][indx1];
+                            }
+                        }
+
+                        if(plistener) {
+                            progresscounter++;
+
+                            if(progresscounter % 8 == 0)
+                                #pragma omp critical (cacorrect)
+                            {
+                                progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
+
+                                if (progress > 1.0) {
+                                    progress = 1.0;
+                                }
+
+                                plistener->setProgress(progress);
+                            }
+                        }
+
+                    }
+
+                #pragma omp barrier
+    // copy temporary image matrix back to image matrix
+                #pragma omp for
+
+                for(int row = 0; row < height; row++) {
+                    int col = FC(row, 0) & 1;
+                    int indx = (row * width + col) >> 1;
+    #ifdef __SSE2__
+                    for(; col < width - 7; col += 8, indx += 4) {
+                        STC2VFU(rawData[row][col], LVFU(RawDataTmp[indx]));
+                    }
+    #endif
+                    for(; col < width - (W & 1); col += 2, indx++) {
+                        rawData[row][col] = RawDataTmp[indx];
+                    }
+                }
+
+            }
+
+            // clean up
+            free(bufferThr);
         }
-
-        // clean up
-        free(bufferThr);
     }
-
     if(autoCA && fitParamsTransfer && fitParamsOut) {
         // store calculated parameters
         int index = 0;
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index e7225e993..29e52b905 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2561,6 +2561,7 @@ RAWParams::RAWParams() :
     ff_AutoClipControl(false),
     ff_clipControl(0),
     ca_autocorrect(false),
+    caautoiterations(1),
     cared(0.0),
     cablue(0.0),
     expos(1.0),
@@ -2585,6 +2586,7 @@ bool RAWParams::operator ==(const RAWParams& other) const
         && ff_AutoClipControl == other.ff_AutoClipControl
         && ff_clipControl == other.ff_clipControl
         && ca_autocorrect == other.ca_autocorrect
+        && caautoiterations == other.caautoiterations
         && cared == other.cared
         && cablue == other.cablue
         && expos == other.expos
@@ -3381,6 +3383,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->raw.ff_AutoClipControl, "RAW", "FlatFieldAutoClipControl", raw.ff_AutoClipControl, keyFile);
         saveToKeyfile(!pedited || pedited->raw.ff_clipControl, "RAW", "FlatFieldClipControl", raw.ff_clipControl, keyFile);
         saveToKeyfile(!pedited || pedited->raw.ca_autocorrect, "RAW", "CA", raw.ca_autocorrect, keyFile);
+        saveToKeyfile(!pedited || pedited->raw.caautoiterations, "RAW", "CAAutoIterations", raw.caautoiterations, keyFile);
         saveToKeyfile(!pedited || pedited->raw.cared, "RAW", "CARed", raw.cared, keyFile);
         saveToKeyfile(!pedited || pedited->raw.cablue, "RAW", "CABlue", raw.cablue, keyFile);
         saveToKeyfile(!pedited || pedited->raw.hotPixelFilter, "RAW", "HotPixelFilter", raw.hotPixelFilter, keyFile);
@@ -4753,6 +4756,7 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             }
 
             assignFromKeyfile(keyFile, "RAW", "CA", pedited, raw.ca_autocorrect, pedited->raw.ca_autocorrect);
+            assignFromKeyfile(keyFile, "RAW", "CAAutoIterations", pedited, raw.caautoiterations, pedited->raw.caautoiterations);
             assignFromKeyfile(keyFile, "RAW", "CARed", pedited, raw.cared, pedited->raw.cared);
             assignFromKeyfile(keyFile, "RAW", "CABlue", pedited, raw.cablue, pedited->raw.cablue);
             // For compatibility to elder pp3 versions
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index bbc9763cf..8e8978601 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1368,6 +1368,7 @@ struct RAWParams {
     int ff_clipControl;
 
     bool ca_autocorrect;
+    int caautoiterations;
     double cared;
     double cablue;
 
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 85a02c85e..a4a68cc17 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2009,13 +2009,13 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         }
         if(numFrames == 4) {
             double fitParams[64];
-            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
+                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer, true);
         } else {
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, rawData, nullptr, false, false, nullptr, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, rawData, nullptr, false, false, nullptr, true);
         }
     }
 
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 95c4dbe22..47fc756a9 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -239,7 +239,7 @@ protected:
     inline  void interpolate_row_rb (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
     inline  void interpolate_row_rb_mul_pp (const array2D<float> &rawData, float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
 
-    float* CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float * buffer, bool freeBuffer);
+    float* CA_correct_RT  (const bool autoCA, const size_t autoIterations, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float * buffer, bool freeBuffer);
     void ddct8x8s(int isgn, float a[8][8]);
     void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation
 
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 400568203..c9ce7affc 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -431,6 +431,7 @@ void ParamsEdited::set(bool v)
     raw.xtranssensor.exBlackGreen = v;
     raw.xtranssensor.exBlackBlue = v;
     raw.ca_autocorrect = v;
+    raw.caautoiterations  = v;
     raw.cablue  = v;
     raw.cared   = v;
     raw.hotPixelFilter = v;
@@ -986,6 +987,7 @@ void ParamsEdited::initFrom(const std::vector<rtengine::procparams::ProcParams>&
         raw.xtranssensor.exBlackGreen = raw.xtranssensor.exBlackGreen && p.raw.xtranssensor.blackgreen == other.raw.xtranssensor.blackgreen;
         raw.xtranssensor.exBlackBlue = raw.xtranssensor.exBlackBlue && p.raw.xtranssensor.blackblue == other.raw.xtranssensor.blackblue;
         raw.ca_autocorrect = raw.ca_autocorrect && p.raw.ca_autocorrect == other.raw.ca_autocorrect;
+        raw.caautoiterations = raw.caautoiterations && p.raw.caautoiterations == other.raw.caautoiterations;
         raw.cared = raw.cared && p.raw.cared == other.raw.cared;
         raw.cablue = raw.cablue && p.raw.cablue == other.raw.cablue;
         raw.hotPixelFilter = raw.hotPixelFilter && p.raw.hotPixelFilter == other.raw.hotPixelFilter;
@@ -2626,6 +2628,10 @@ void ParamsEdited::combine(rtengine::procparams::ProcParams& toEdit, const rteng
         toEdit.raw.ca_autocorrect  = mods.raw.ca_autocorrect;
     }
 
+    if (raw.caautoiterations) {
+        toEdit.raw.caautoiterations           = dontforceSet && options.baBehav[ADDSET_RAWCACORR] ? toEdit.raw.caautoiterations + mods.raw.caautoiterations : mods.raw.caautoiterations;
+    }
+
     if (raw.cared) {
         toEdit.raw.cared           = dontforceSet && options.baBehav[ADDSET_RAWCACORR] ? toEdit.raw.cared + mods.raw.cared : mods.raw.cared;
     }
@@ -3127,7 +3133,7 @@ bool RAWParamsEdited::XTransSensor::isUnchanged() const
 
 bool RAWParamsEdited::isUnchanged() const
 {
-    return  bayersensor.isUnchanged() && xtranssensor.isUnchanged() && ca_autocorrect && cared && cablue && hotPixelFilter && deadPixelFilter && hotdeadpix_thresh && darkFrame
+    return  bayersensor.isUnchanged() && xtranssensor.isUnchanged() && ca_autocorrect && caautoiterations && cared && cablue && hotPixelFilter && deadPixelFilter && hotdeadpix_thresh && darkFrame
             && df_autoselect && ff_file && ff_AutoSelect && ff_BlurRadius && ff_BlurType && exPos && exPreser && ff_AutoClipControl && ff_clipControl;
 }
 
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index c4c36ce61..1cd16290c 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -787,6 +787,7 @@ public:
     XTransSensor xtranssensor;
 
     bool ca_autocorrect;
+    bool caautoiterations;
     bool cared;
     bool cablue;
     bool hotPixelFilter;
diff --git a/rtgui/partialpastedlg.cc b/rtgui/partialpastedlg.cc
index eb1bbbfee..556360bda 100644
--- a/rtgui/partialpastedlg.cc
+++ b/rtgui/partialpastedlg.cc
@@ -901,6 +901,7 @@ void PartialPasteDlg::applyPaste (rtengine::procparams::ProcParams* dstPP, Param
 
     if (!raw_ca_autocorrect->get_active ()) {
         filterPE.raw.ca_autocorrect       = falsePE.raw.ca_autocorrect;
+        filterPE.raw.caautoiterations       = falsePE.raw.caautoiterations;
     }
 
     if (!raw_caredblue->get_active ()) {
diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 2f3c54522..7e75682e0 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -17,6 +17,7 @@
  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
  */
 #include "rawcacorrection.h"
+#include "eventmapper.h"
 #include "guiutils.h"
 #include "rtimage.h"
 
@@ -25,6 +26,9 @@ using namespace rtengine::procparams;
 
 RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROMATABERR_LABEL"))
 {
+    auto m = ProcEventMapper::getInstance();
+    EvPreProcessCAAutoiterations = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_AUTOIT");
+
     Gtk::Image* icaredL =   Gtk::manage (new RTImage ("circle-red-cyan-small.png"));
     Gtk::Image* icaredR =   Gtk::manage (new RTImage ("circle-cyan-red-small.png"));
     Gtk::Image* icablueL =  Gtk::manage (new RTImage ("circle-blue-yellow-small.png"));
@@ -33,6 +37,13 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caAutocorrect = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AUTO"), multiImage));
     caAutocorrect->setCheckBoxListener (this);
 
+    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 3, 1, 1));
+    caAutoiterations->setAdjusterListener (this);
+
+    if (caAutoiterations->delay < options.adjusterMaxDelay) {
+        caAutoiterations->delay = options.adjusterMaxDelay;
+    }
+
     caRed = Gtk::manage(new Adjuster (M("TP_RAWCACORR_CARED"), -8.0, 8.0, 0.1, 0, icaredL, icaredR));
     caRed->setAdjusterListener (this);
 
@@ -51,6 +62,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caBlue->show();
 
     pack_start( *caAutocorrect, Gtk::PACK_SHRINK, 4);
+    pack_start( *caAutoiterations, Gtk::PACK_SHRINK, 4);
     pack_start( *caRed, Gtk::PACK_SHRINK, 4);
     pack_start( *caBlue, Gtk::PACK_SHRINK, 4);
 
@@ -62,15 +74,18 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 
     if(pedited ) {
         caAutocorrect->setEdited(pedited->raw.ca_autocorrect);
+        caAutoiterations->setEditedState( pedited->raw.cared ? Edited : UnEdited );
         caRed->setEditedState( pedited->raw.cared ? Edited : UnEdited );
         caBlue->setEditedState( pedited->raw.cablue ? Edited : UnEdited );
     }
 
     // disable Red and Blue sliders when caAutocorrect is enabled
+    caAutoiterations->set_sensitive(pp->raw.ca_autocorrect);
     caRed->set_sensitive(!pp->raw.ca_autocorrect);
     caBlue->set_sensitive(!pp->raw.ca_autocorrect);
 
     caAutocorrect->setValue(pp->raw.ca_autocorrect);
+    caAutoiterations->setValue (pp->raw.caautoiterations);
     caRed->setValue (pp->raw.cared);
     caBlue->setValue (pp->raw.cablue);
 
@@ -80,11 +95,13 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 void RAWCACorr::write( rtengine::procparams::ProcParams* pp, ParamsEdited* pedited)
 {
     pp->raw.ca_autocorrect = caAutocorrect->getLastActive();
+    pp->raw.caautoiterations = caAutoiterations->getValue();
     pp->raw.cared = caRed->getValue();
     pp->raw.cablue = caBlue->getValue();
 
     if (pedited) {
         pedited->raw.ca_autocorrect = !caAutocorrect->get_inconsistent();
+        pedited->raw.caautoiterations = caAutoiterations->getEditedState ();
         pedited->raw.cared = caRed->getEditedState ();
         pedited->raw.cablue = caBlue->getEditedState ();
     }
@@ -97,7 +114,9 @@ void RAWCACorr::adjusterChanged (Adjuster* a, double newval)
 
         Glib::ustring value = a->getTextValue();
 
-        if (a == caRed) {
+        if (a == caAutoiterations) {
+            listener->panelChanged (EvPreProcessCAAutoiterations,  value );
+        } else if (a == caRed) {
             listener->panelChanged (EvPreProcessCARed,  value );
         } else if (a == caBlue) {
             listener->panelChanged (EvPreProcessCABlue,  value );
@@ -110,6 +129,7 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
     if (c == caAutocorrect) {
         if (!batchMode) {
             // disable Red and Blue sliders when caAutocorrect is enabled
+            caAutoiterations->set_sensitive(caAutocorrect->getLastActive ());
             caRed->set_sensitive(!caAutocorrect->getLastActive ());
             caBlue->set_sensitive(!caAutocorrect->getLastActive ());
         }
@@ -122,19 +142,23 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
 void RAWCACorr::setBatchMode(bool batchMode)
 {
     ToolPanel::setBatchMode (batchMode);
+    caAutoiterations->showEditedCB ();
     caRed->showEditedCB ();
     caBlue->showEditedCB ();
 }
 
 void RAWCACorr::setDefaults(const rtengine::procparams::ProcParams* defParams, const ParamsEdited* pedited)
 {
+    caAutoiterations->setDefault( defParams->raw.caautoiterations);
     caRed->setDefault( defParams->raw.cared);
     caBlue->setDefault( defParams->raw.cablue);
 
     if (pedited) {
+        caAutoiterations->setDefaultEditedState( pedited->raw.caautoiterations ? Edited : UnEdited);
         caRed->setDefaultEditedState( pedited->raw.cared ? Edited : UnEdited);
         caBlue->setDefaultEditedState( pedited->raw.cablue ? Edited : UnEdited);
     } else {
+        caAutoiterations->setDefaultEditedState( Irrelevant );
         caRed->setDefaultEditedState( Irrelevant );
         caBlue->setDefaultEditedState( Irrelevant );
     }
@@ -150,6 +174,7 @@ void RAWCACorr::setAdjusterBehavior (bool caadd)
 void RAWCACorr::trimValues (rtengine::procparams::ProcParams* pp)
 {
 
+    caAutoiterations->trimValue(pp->raw.caautoiterations);
     caRed->trimValue(pp->raw.cared);
     caBlue->trimValue(pp->raw.cablue);
 }
diff --git a/rtgui/rawcacorrection.h b/rtgui/rawcacorrection.h
index 69292b1aa..27e486247 100644
--- a/rtgui/rawcacorrection.h
+++ b/rtgui/rawcacorrection.h
@@ -29,9 +29,12 @@ class RAWCACorr : public ToolParamBlock, public AdjusterListener, public CheckBo
 
 protected:
     CheckBox* caAutocorrect;
+    Adjuster* caAutoiterations;
     Adjuster* caRed;
     Adjuster* caBlue;
 
+    rtengine::ProcEvent EvPreProcessCAAutoiterations;
+
 public:
 
     RAWCACorr ();

From ac1db9922086339847bce6658f146dcfb9d714f7 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 5 Sep 2018 17:54:11 +0200
Subject: [PATCH 02/22] CA_correct_RT(): readability changes suggested by
 @Floessie; increased range of iterations adjuster to 5, #4774

---
 rtengine/CA_correct_RT.cc | 33 +++++++++++++++++++++++++++------
 rtengine/rawimagesource.h | 14 +++++++++++++-
 rtgui/rawcacorrection.cc  |  2 +-
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 35e36ce05..0a53ba469 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -111,7 +111,19 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 using namespace std;
 using namespace rtengine;
 
-float* RawImageSource::CA_correct_RT(const bool autoCA, const size_t autoIterations, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
+float* RawImageSource::CA_correct_RT(
+    bool autoCA,
+    size_t autoIterations,
+    double cared,
+    double cablue,
+    double caautostrength,
+    const array2D<float> &rawData,
+    double* fitParamsTransfer,
+    bool fitParamsIn,
+    bool fitParamsOut,
+    float* buffer,
+    bool freeBuffer
+)
 {
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
@@ -120,14 +132,16 @@ float* RawImageSource::CA_correct_RT(const bool autoCA, const size_t autoIterati
     constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, v4 = 4 * ts; //, p1=-ts+1, p2=-2*ts+2, p3=-3*ts+3, m1=ts+1, m2=2*ts+2, m3=3*ts+3;
 
     // Test for RGB cfa
-    for(int i = 0; i < 2; i++)
-        for(int j = 0; j < 2; j++)
+    for(int i = 0; i < 2; i++) {
+        for(int j = 0; j < 2; j++) {
             if(FC(i, j) == 3) {
                 printf("CA correction supports only RGB Colour filter arrays\n");
                 return buffer;
             }
+        }
+    }
 
-    volatile double progress = 0.0;
+    double progress = 0.0;
 
     if(plistener) {
         plistener->setProgress (progress);
@@ -159,9 +173,16 @@ float* RawImageSource::CA_correct_RT(const bool autoCA, const size_t autoIterati
     bool processpasstwo = true;
     double fitparams[2][2][16];
 
-    const size_t iterations = autoCA ? std::max(autoIterations, static_cast<size_t>(1)) : 1;
+    const size_t iterations =
+        autoCA
+            ? std::max<size_t>(autoIterations, 1)
+            : 1;
+
     for (size_t it = 0; it < iterations && processpasstwo; ++it) {
-        float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
+        float blockave[2][2] = {};
+        float blocksqave[2][2] = {};
+        float blockdenom[2][2] = {};
+        float blockvar[2][2];
         const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
         if(autoCA && fitParamsSet && iterations < 2) {
             // use stored parameters
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 47fc756a9..b9729555b 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -239,7 +239,19 @@ protected:
     inline  void interpolate_row_rb (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
     inline  void interpolate_row_rb_mul_pp (const array2D<float> &rawData, float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
 
-    float* CA_correct_RT  (const bool autoCA, const size_t autoIterations, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float * buffer, bool freeBuffer);
+    float* CA_correct_RT(
+        bool autoCA,
+        size_t autoIterations,
+        double cared,
+        double cablue,
+        double caautostrength,
+        const array2D<float> &rawData,
+        double* fitParamsTransfer,
+        bool fitParamsIn,
+        bool fitParamsOut,
+        float* buffer,
+        bool freeBuffer
+    );
     void ddct8x8s(int isgn, float a[8][8]);
     void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation
 
diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 7e75682e0..8c917c910 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -37,7 +37,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caAutocorrect = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AUTO"), multiImage));
     caAutocorrect->setCheckBoxListener (this);
 
-    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 3, 1, 1));
+    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 5, 1, 1));
     caAutoiterations->setAdjusterListener (this);
 
     if (caAutoiterations->delay < options.adjusterMaxDelay) {

From 5fcb64634dfe2c960ff856f15569b432401e0eb9 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 5 Sep 2018 20:33:48 +0200
Subject: [PATCH 03/22] CA_correct_RT(), minor change

---
 rtengine/CA_correct_RT.cc | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 0a53ba469..f8fc9b39e 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -27,6 +27,7 @@
 #include "rawimagesource.h"
 #include "rt_math.h"
 #include "median.h"
+//#define BENCHMARK
 #include "StopWatch.h"
 namespace {
 
@@ -125,6 +126,7 @@ float* RawImageSource::CA_correct_RT(
     bool freeBuffer
 )
 {
+    BENCHFUN
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
     constexpr int tsh = ts / 2;
@@ -178,23 +180,25 @@ float* RawImageSource::CA_correct_RT(
             ? std::max<size_t>(autoIterations, 1)
             : 1;
 
+    const bool fitParamsSet = fitParamsTransfer && fitParamsIn && iterations < 2;
+    if(autoCA && fitParamsSet) {
+        // use stored parameters
+        int index = 0;
+        for(int c = 0; c < 2; ++c) {
+            for(int d = 0; d < 2; ++d) {
+                for(int e = 0; e < 16; ++e) {
+                    fitparams[c][d][e] = fitParamsTransfer[index++];
+                }
+            }
+        }
+    }
+
     for (size_t it = 0; it < iterations && processpasstwo; ++it) {
         float blockave[2][2] = {};
         float blocksqave[2][2] = {};
         float blockdenom[2][2] = {};
         float blockvar[2][2];
-        const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
-        if(autoCA && fitParamsSet && iterations < 2) {
-            // use stored parameters
-            int index = 0;
-            for(int c = 0; c < 2; ++c) {
-                for(int d = 0; d < 2; ++d) {
-                    for(int e = 0; e < 16; ++e) {
-                        fitparams[c][d][e] = fitParamsTransfer[index++];
-                    }
-                }
-            }
-        }
+
         //order of 2d polynomial fit (polyord), and numpar=polyord^2
         int polyord = 4, numpar = 16;
 

From 01618e1b7f06b6c04ef13852d81df49be8c03c4d Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 5 Sep 2018 21:47:16 +0200
Subject: [PATCH 04/22] Set default value for raw auto ca correction iterations
 to 2 while keeping backwars compatibility, #4774

---
 rtengine/procparams.cc   | 8 ++++++--
 rtgui/ppversion.h        | 4 +++-
 rtgui/rawcacorrection.cc | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 29e52b905..eb06e5b16 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2561,7 +2561,7 @@ RAWParams::RAWParams() :
     ff_AutoClipControl(false),
     ff_clipControl(0),
     ca_autocorrect(false),
-    caautoiterations(1),
+    caautoiterations(2),
     cared(0.0),
     cablue(0.0),
     expos(1.0),
@@ -4756,7 +4756,11 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             }
 
             assignFromKeyfile(keyFile, "RAW", "CA", pedited, raw.ca_autocorrect, pedited->raw.ca_autocorrect);
-            assignFromKeyfile(keyFile, "RAW", "CAAutoIterations", pedited, raw.caautoiterations, pedited->raw.caautoiterations);
+            if (ppVersion >= 342) {
+                assignFromKeyfile(keyFile, "RAW", "CAAutoIterations", pedited, raw.caautoiterations, pedited->raw.caautoiterations);
+            } else {
+                raw.caautoiterations = 1;
+            }
             assignFromKeyfile(keyFile, "RAW", "CARed", pedited, raw.cared, pedited->raw.cared);
             assignFromKeyfile(keyFile, "RAW", "CABlue", pedited, raw.cablue, pedited->raw.cablue);
             // For compatibility to elder pp3 versions
diff --git a/rtgui/ppversion.h b/rtgui/ppversion.h
index f4ca1f360..dd89c9f99 100644
--- a/rtgui/ppversion.h
+++ b/rtgui/ppversion.h
@@ -1,11 +1,13 @@
 #pragma once
 
 // This number has to be incremented whenever the PP3 file format is modified or the behaviour of a tool changes
-#define PPVERSION 340
+#define PPVERSION 341
 #define PPVERSION_AEXP 301 //value of PPVERSION when auto exposure algorithm was modified
 
 /*
   Log of version changes
+   342  2018-09-05
+        raw auto ca correction iterations
    341  2018-07-22
         [ICM] enhanced custom output profile
    340  2018-07-08
diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 8c917c910..9957ca9be 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -37,7 +37,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caAutocorrect = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AUTO"), multiImage));
     caAutocorrect->setCheckBoxListener (this);
 
-    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 5, 1, 1));
+    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 5, 1, 2));
     caAutoiterations->setAdjusterListener (this);
 
     if (caAutoiterations->delay < options.adjusterMaxDelay) {

From f6195877e54a205a7b4a2092dfd8bcecc0c05b59 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 5 Sep 2018 21:51:36 +0200
Subject: [PATCH 05/22] Fix wrong ppversion

---
 rtgui/ppversion.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtgui/ppversion.h b/rtgui/ppversion.h
index dd89c9f99..5380943bf 100644
--- a/rtgui/ppversion.h
+++ b/rtgui/ppversion.h
@@ -1,7 +1,7 @@
 #pragma once
 
 // This number has to be incremented whenever the PP3 file format is modified or the behaviour of a tool changes
-#define PPVERSION 341
+#define PPVERSION 342
 #define PPVERSION_AEXP 301 //value of PPVERSION when auto exposure algorithm was modified
 
 /*

From bcc7a3fb855de67366ba295eeec6fd88b8a983b4 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 6 Sep 2018 13:52:48 +0200
Subject: [PATCH 06/22] raw ca correction: first try to avoid colour shift,
 #4777

---
 rtdata/languages/default   |  2 ++
 rtengine/CA_correct_RT.cc  | 39 ++++++++++++++++++++++++++++++++++++++
 rtengine/procparams.cc     |  9 +++++++++
 rtengine/procparams.h      |  1 +
 rtengine/rawimagesource.cc |  8 ++++----
 rtengine/rawimagesource.h  |  1 +
 rtgui/paramsedited.cc      |  8 +++++++-
 rtgui/paramsedited.h       |  1 +
 rtgui/ppversion.h          |  4 +++-
 rtgui/rawcacorrection.cc   | 16 +++++++++++++++-
 rtgui/rawcacorrection.h    |  2 ++
 11 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index 05498985f..e786b5355 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -749,6 +749,7 @@ HISTORY_MSG_PREPROCESS_PDAFLINESFILTER;PDAF lines filter
 HISTORY_MSG_PRSHARPEN_CONTRAST;PRS - Contrast threshold
 HISTORY_MSG_RAW_BORDER;Raw border
 HISTORY_MSG_RAWCACORR_AUTOIT;Raw CA Correction - Iterations
+HISTORY_MSG_RAWCACORR_COLOURSHIFT;Raw CA Correction - Avoid color shift
 HISTORY_MSG_RESIZE_ALLOWUPSCALING;Resize - Allow upscaling
 HISTORY_MSG_SHARPENING_CONTRAST;Sharpening - Contrast threshold
 HISTORY_MSG_SOFTLIGHT_ENABLED;Soft light
@@ -1782,6 +1783,7 @@ TP_PRSHARPENING_LABEL;Post-Resize Sharpening
 TP_PRSHARPENING_TOOLTIP;Sharpens the image after resizing. Only works when the "Lanczos" resizing method is used. It is impossible to preview the effects of this tool. See RawPedia for usage instructions.
 TP_RAWCACORR_AUTO;Auto-correction
 TP_RAWCACORR_AUTOIT;Iterations
+TP_RAWCACORR_AVOIDCOLORSHIFT;Avoid color shift
 TP_RAWCACORR_CABLUE;Blue
 TP_RAWCACORR_CARED;Red
 TP_RAWCACORR_CASTR;Strength
diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index f8fc9b39e..7e05c856c 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -26,6 +26,7 @@
 #include "rtengine.h"
 #include "rawimagesource.h"
 #include "rt_math.h"
+#include "gauss.h"
 #include "median.h"
 //#define BENCHMARK
 #include "StopWatch.h"
@@ -118,6 +119,7 @@ float* RawImageSource::CA_correct_RT(
     double cared,
     double cablue,
     double caautostrength,
+    bool avoidColourshift,
     const array2D<float> &rawData,
     double* fitParamsTransfer,
     bool fitParamsIn,
@@ -142,6 +144,14 @@ float* RawImageSource::CA_correct_RT(
             }
         }
     }
+    array2D<float> oldraw(W,H);
+    if (avoidColourshift) {
+        for(int i = 0; i < H; ++i) {
+            for(int j = 0; j < W; ++j) {
+                oldraw[i][j] = rawData[i][j];
+            }
+        }
+    }
 
     double progress = 0.0;
 
@@ -1230,6 +1240,35 @@ float* RawImageSource::CA_correct_RT(
         buffer = nullptr;
     }
 
+
+    if (avoidColourshift) {
+        array2D<float> redFactor((W+1)/2, (H+1)/2);
+        array2D<float> blueFactor((W+1)/2, (H+1)/2);
+
+        for(int i = 0; i < H; ++i) {
+            for(int j = 0; j < W; ++j) {
+                float factor = (rawData[i][j] * oldraw[i][j] == 0.0 ? 1.0 : oldraw[i][j] / rawData[i][j]);
+                if(FC(i,j) == 0) {
+                    redFactor[i/2][j/2] = factor;
+                } else if(FC(i,j) == 2) {
+                    blueFactor[i/2][j/2] = factor;
+                }
+            }
+        }
+        gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
+        gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
+
+        for(int i = 0; i < H; ++i) {
+            for(int j = 0; j < W; ++j) {
+                if(FC(i,j) == 0) {
+                    rawData[i][j] *= redFactor[i/2][j/2];
+                } else if(FC(i,j) == 2) {
+                    rawData[i][j] *= blueFactor[i/2][j/2];
+                }
+            }
+        }
+    }
+
     if(plistener) {
         plistener->setProgress(1.0);
     }
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index eb06e5b16..af89f99a1 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2561,6 +2561,7 @@ RAWParams::RAWParams() :
     ff_AutoClipControl(false),
     ff_clipControl(0),
     ca_autocorrect(false),
+    ca_avoidcolourshift(true),
     caautoiterations(2),
     cared(0.0),
     cablue(0.0),
@@ -2586,6 +2587,7 @@ bool RAWParams::operator ==(const RAWParams& other) const
         && ff_AutoClipControl == other.ff_AutoClipControl
         && ff_clipControl == other.ff_clipControl
         && ca_autocorrect == other.ca_autocorrect
+        && ca_avoidcolourshift == other.ca_avoidcolourshift
         && caautoiterations == other.caautoiterations
         && cared == other.cared
         && cablue == other.cablue
@@ -3383,6 +3385,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->raw.ff_AutoClipControl, "RAW", "FlatFieldAutoClipControl", raw.ff_AutoClipControl, keyFile);
         saveToKeyfile(!pedited || pedited->raw.ff_clipControl, "RAW", "FlatFieldClipControl", raw.ff_clipControl, keyFile);
         saveToKeyfile(!pedited || pedited->raw.ca_autocorrect, "RAW", "CA", raw.ca_autocorrect, keyFile);
+        saveToKeyfile(!pedited || pedited->raw.ca_avoidcolourshift, "RAW", "CAAvoidColourshift", raw.ca_avoidcolourshift, keyFile);
         saveToKeyfile(!pedited || pedited->raw.caautoiterations, "RAW", "CAAutoIterations", raw.caautoiterations, keyFile);
         saveToKeyfile(!pedited || pedited->raw.cared, "RAW", "CARed", raw.cared, keyFile);
         saveToKeyfile(!pedited || pedited->raw.cablue, "RAW", "CABlue", raw.cablue, keyFile);
@@ -4761,6 +4764,12 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             } else {
                 raw.caautoiterations = 1;
             }
+
+            if (ppVersion >= 343) {
+                assignFromKeyfile(keyFile, "RAW", "CAAvoidColourshift", pedited, raw.ca_avoidcolourshift, pedited->raw.ca_avoidcolourshift);
+            } else {
+                raw.ca_avoidcolourshift = false;
+            }
             assignFromKeyfile(keyFile, "RAW", "CARed", pedited, raw.cared, pedited->raw.cared);
             assignFromKeyfile(keyFile, "RAW", "CABlue", pedited, raw.cablue, pedited->raw.cablue);
             // For compatibility to elder pp3 versions
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index 8e8978601..0b8b5ba56 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1368,6 +1368,7 @@ struct RAWParams {
     int ff_clipControl;
 
     bool ca_autocorrect;
+    bool ca_avoidcolourshift;
     int caautoiterations;
     double cared;
     double cablue;
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index a4a68cc17..77c6285be 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2009,13 +2009,13 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         }
         if(numFrames == 4) {
             double fitParams[64];
-            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
+                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, *rawDataFrames[3], fitParams, true, false, buffer, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[3], fitParams, true, false, buffer, true);
         } else {
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, rawData, nullptr, false, false, nullptr, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, rawData, nullptr, false, false, nullptr, true);
         }
     }
 
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index b9729555b..972e1fe64 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -245,6 +245,7 @@ protected:
         double cared,
         double cablue,
         double caautostrength,
+        bool avoidColourshift,
         const array2D<float> &rawData,
         double* fitParamsTransfer,
         bool fitParamsIn,
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index c9ce7affc..2ccb62f65 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -431,6 +431,7 @@ void ParamsEdited::set(bool v)
     raw.xtranssensor.exBlackGreen = v;
     raw.xtranssensor.exBlackBlue = v;
     raw.ca_autocorrect = v;
+    raw.ca_avoidcolourshift = v;
     raw.caautoiterations  = v;
     raw.cablue  = v;
     raw.cared   = v;
@@ -987,6 +988,7 @@ void ParamsEdited::initFrom(const std::vector<rtengine::procparams::ProcParams>&
         raw.xtranssensor.exBlackGreen = raw.xtranssensor.exBlackGreen && p.raw.xtranssensor.blackgreen == other.raw.xtranssensor.blackgreen;
         raw.xtranssensor.exBlackBlue = raw.xtranssensor.exBlackBlue && p.raw.xtranssensor.blackblue == other.raw.xtranssensor.blackblue;
         raw.ca_autocorrect = raw.ca_autocorrect && p.raw.ca_autocorrect == other.raw.ca_autocorrect;
+        raw.ca_avoidcolourshift = raw.ca_avoidcolourshift && p.raw.ca_avoidcolourshift == other.raw.ca_avoidcolourshift;
         raw.caautoiterations = raw.caautoiterations && p.raw.caautoiterations == other.raw.caautoiterations;
         raw.cared = raw.cared && p.raw.cared == other.raw.cared;
         raw.cablue = raw.cablue && p.raw.cablue == other.raw.cablue;
@@ -2628,6 +2630,10 @@ void ParamsEdited::combine(rtengine::procparams::ProcParams& toEdit, const rteng
         toEdit.raw.ca_autocorrect  = mods.raw.ca_autocorrect;
     }
 
+    if (raw.ca_avoidcolourshift) {
+        toEdit.raw.ca_avoidcolourshift  = mods.raw.ca_avoidcolourshift;
+    }
+
     if (raw.caautoiterations) {
         toEdit.raw.caautoiterations           = dontforceSet && options.baBehav[ADDSET_RAWCACORR] ? toEdit.raw.caautoiterations + mods.raw.caautoiterations : mods.raw.caautoiterations;
     }
@@ -3133,7 +3139,7 @@ bool RAWParamsEdited::XTransSensor::isUnchanged() const
 
 bool RAWParamsEdited::isUnchanged() const
 {
-    return  bayersensor.isUnchanged() && xtranssensor.isUnchanged() && ca_autocorrect && caautoiterations && cared && cablue && hotPixelFilter && deadPixelFilter && hotdeadpix_thresh && darkFrame
+    return  bayersensor.isUnchanged() && xtranssensor.isUnchanged() && ca_autocorrect && ca_avoidcolourshift && caautoiterations && cared && cablue && hotPixelFilter && deadPixelFilter && hotdeadpix_thresh && darkFrame
             && df_autoselect && ff_file && ff_AutoSelect && ff_BlurRadius && ff_BlurType && exPos && exPreser && ff_AutoClipControl && ff_clipControl;
 }
 
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index 1cd16290c..0a795696c 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -787,6 +787,7 @@ public:
     XTransSensor xtranssensor;
 
     bool ca_autocorrect;
+    bool ca_avoidcolourshift;
     bool caautoiterations;
     bool cared;
     bool cablue;
diff --git a/rtgui/ppversion.h b/rtgui/ppversion.h
index 5380943bf..050c4c653 100644
--- a/rtgui/ppversion.h
+++ b/rtgui/ppversion.h
@@ -1,11 +1,13 @@
 #pragma once
 
 // This number has to be incremented whenever the PP3 file format is modified or the behaviour of a tool changes
-#define PPVERSION 342
+#define PPVERSION 343
 #define PPVERSION_AEXP 301 //value of PPVERSION when auto exposure algorithm was modified
 
 /*
   Log of version changes
+   343  2018-09-06
+        raw auto ca correction avoid colour shift
    342  2018-09-05
         raw auto ca correction iterations
    341  2018-07-22
diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 9957ca9be..9a1001f74 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -28,6 +28,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
 {
     auto m = ProcEventMapper::getInstance();
     EvPreProcessCAAutoiterations = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_AUTOIT");
+    EvPreProcessCAColourshift = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
 
     Gtk::Image* icaredL =   Gtk::manage (new RTImage ("circle-red-cyan-small.png"));
     Gtk::Image* icaredR =   Gtk::manage (new RTImage ("circle-cyan-red-small.png"));
@@ -66,6 +67,11 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     pack_start( *caRed, Gtk::PACK_SHRINK, 4);
     pack_start( *caBlue, Gtk::PACK_SHRINK, 4);
 
+    caAvoidcolourshift = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AVOIDCOLORSHIFT"), multiImage));
+    caAvoidcolourshift->setCheckBoxListener (this);
+    pack_start( *caAvoidcolourshift, Gtk::PACK_SHRINK, 4);
+
+
 }
 
 void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdited* pedited)
@@ -74,7 +80,8 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 
     if(pedited ) {
         caAutocorrect->setEdited(pedited->raw.ca_autocorrect);
-        caAutoiterations->setEditedState( pedited->raw.cared ? Edited : UnEdited );
+        caAvoidcolourshift->setEdited(pedited->raw.ca_avoidcolourshift);
+        caAutoiterations->setEditedState( pedited->raw.caautoiterations ? Edited : UnEdited );
         caRed->setEditedState( pedited->raw.cared ? Edited : UnEdited );
         caBlue->setEditedState( pedited->raw.cablue ? Edited : UnEdited );
     }
@@ -85,6 +92,7 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
     caBlue->set_sensitive(!pp->raw.ca_autocorrect);
 
     caAutocorrect->setValue(pp->raw.ca_autocorrect);
+    caAvoidcolourshift->setValue(pp->raw.ca_avoidcolourshift);
     caAutoiterations->setValue (pp->raw.caautoiterations);
     caRed->setValue (pp->raw.cared);
     caBlue->setValue (pp->raw.cablue);
@@ -95,12 +103,14 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 void RAWCACorr::write( rtengine::procparams::ProcParams* pp, ParamsEdited* pedited)
 {
     pp->raw.ca_autocorrect = caAutocorrect->getLastActive();
+    pp->raw.ca_avoidcolourshift = caAvoidcolourshift->getLastActive();
     pp->raw.caautoiterations = caAutoiterations->getValue();
     pp->raw.cared = caRed->getValue();
     pp->raw.cablue = caBlue->getValue();
 
     if (pedited) {
         pedited->raw.ca_autocorrect = !caAutocorrect->get_inconsistent();
+        pedited->raw.ca_avoidcolourshift = !caAvoidcolourshift->get_inconsistent();
         pedited->raw.caautoiterations = caAutoiterations->getEditedState ();
         pedited->raw.cared = caRed->getEditedState ();
         pedited->raw.cablue = caBlue->getEditedState ();
@@ -136,6 +146,10 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
         if (listener) {
             listener->panelChanged (EvPreProcessAutoCA, caAutocorrect->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
+    } else if (c == caAvoidcolourshift) {
+        if (listener) {
+            listener->panelChanged (EvPreProcessCAColourshift, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
+        }
     }
 }
 
diff --git a/rtgui/rawcacorrection.h b/rtgui/rawcacorrection.h
index 27e486247..614bd4b90 100644
--- a/rtgui/rawcacorrection.h
+++ b/rtgui/rawcacorrection.h
@@ -32,8 +32,10 @@ protected:
     Adjuster* caAutoiterations;
     Adjuster* caRed;
     Adjuster* caBlue;
+    CheckBox* caAvoidcolourshift;
 
     rtengine::ProcEvent EvPreProcessCAAutoiterations;
+    rtengine::ProcEvent EvPreProcessCAColourshift;
 
 public:
 

From 68ee9d422ba679e5fcb3a37af43d923f546ae9e7 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 6 Sep 2018 16:05:06 +0200
Subject: [PATCH 07/22] raw ca correction: optimized avoid colour shift, #4777

---
 rtengine/CA_correct_RT.cc | 137 ++++++++++++++++++++++----------------
 1 file changed, 78 insertions(+), 59 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 7e05c856c..b6c4b0375 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -144,11 +144,17 @@ float* RawImageSource::CA_correct_RT(
             }
         }
     }
-    array2D<float> oldraw(W,H);
+    array2D<float>* oldraw = nullptr;
     if (avoidColourshift) {
+        oldraw = new array2D<float>((W + 1) / 2, H);
+        #pragma omp parallel for
         for(int i = 0; i < H; ++i) {
-            for(int j = 0; j < W; ++j) {
-                oldraw[i][j] = rawData[i][j];
+            int j = FC(i, 0) & 1;
+            for(; j < W - 1; j += 2) {
+                (*oldraw)[i][j / 2] = rawData[i][j];
+            }
+            if(j < W) {
+                (*oldraw)[i][j / 2] = rawData[i][j];
             }
         }
     }
@@ -284,15 +290,15 @@ float* RawImageSource::CA_correct_RT(
                         // rgb values should be floating point numbers between 0 and 1
                         // after white balance multipliers are applied
 
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat c65535v = F2V(65535.f);
-    #endif
+#endif
 
                         for (int rr = rrmin; rr < rrmax; rr++) {
                             int row = rr + top;
                             int cc = ccmin;
                             int col = cc + left;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             int c0 = FC(rr, cc);
                             if(c0 == 1) {
                                 rgb[c0][rr * ts + cc] = rawData[row][col] / 65535.f;
@@ -309,7 +315,7 @@ float* RawImageSource::CA_correct_RT(
                                 STVFU(rgb[1][indx1], val1);
                                 STVFU(rgb[1][indx1 + 4], val2);
                             }
-    #endif
+#endif
                             for (; cc < ccmax; cc++, col++) {
                                 int c = FC(rr, cc);
                                 int indx1 = rr * ts + cc;
@@ -389,16 +395,16 @@ float* RawImageSource::CA_correct_RT(
                         //end of initialization
 
 
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat onev = F2V(1.f);
                         vfloat epsv = F2V(eps);
-    #endif
+#endif
                         for (int rr = 3; rr < rr1 - 3; rr++) {
                             int row = rr + top;
                             int cc = 3 + (FC(rr,3) & 1);
                             int indx = rr * ts + cc;
                             int c = FC(rr,cc);
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             for (; cc < cc1 - 9; cc+=8, indx+=8) {
                                 //compute directional weights using image gradients
                                 vfloat rgb1mv1v = LC2VFU(rgb[1][indx - v1]);
@@ -417,7 +423,7 @@ float* RawImageSource::CA_correct_RT(
                                 STC2VFU(rgb[1][indx], (wtuv * rgb1mv1v + wtdv * rgb1pv1v + wtlv * rgb1m1v + wtrv * rgb1p1v) / (wtuv + wtdv + wtlv + wtrv));
                             }
 
-    #endif
+#endif
                             for (; cc < cc1 - 3; cc+=2, indx+=2) {
                                 //compute directional weights using image gradients
                                 float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
@@ -433,11 +439,11 @@ float* RawImageSource::CA_correct_RT(
                                 int offset = (FC(row,max(left + 3, 0)) & 1);
                                 int col = max(left + 3, 0) + offset;
                                 int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                                 for(; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
                                     STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
                                 }
-    #endif
+#endif
                                 for(; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
                                     Gtmp[(row * width + col) >> 1] = rgb[1][indx];
                                 }
@@ -445,14 +451,14 @@ float* RawImageSource::CA_correct_RT(
 
                         }
                         //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat zd25v = F2V(0.25f);
-    #endif
+#endif
                         for (int rr = 4; rr < rr1 - 4; rr++) {
                             int cc = 4 + (FC(rr, 2) & 1);
                             int indx = rr * ts + cc;
                             int c = FC(rr, cc);
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             for (; cc < cc1 - 10; cc += 8, indx += 8) {
                                 vfloat rgb1v = LC2VFU(rgb[1][indx]);
                                 vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
@@ -480,7 +486,7 @@ float* RawImageSource::CA_correct_RT(
                                 STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                             }
 
-    #endif
+#endif
                             for (; cc < cc1 - 4; cc += 2, indx += 2) {
                                 rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
                                                           fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
@@ -507,11 +513,11 @@ float* RawImageSource::CA_correct_RT(
                             }
                         }
 
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat zd3v = F2V(0.3f);
                         vfloat zd1v = F2V(0.1f);
                         vfloat zd5v = F2V(0.5f);
-    #endif
+#endif
 
                         // along line segments, find the point along each segment that minimizes the colour variance
                         // averaged over the tile; evaluate for up/down and left/right away from R/B grid point
@@ -519,7 +525,7 @@ float* RawImageSource::CA_correct_RT(
                             int cc = 8 + (FC(rr, 2) & 1);
                             int indx = rr * ts + cc;
                             int c = FC(rr, cc);
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             vfloat coeff00v = ZEROV;
                             vfloat coeff01v = ZEROV;
                             vfloat coeff02v = ZEROV;
@@ -560,7 +566,7 @@ float* RawImageSource::CA_correct_RT(
                             coeff[1][1][c>>1] += vhadd(coeff11v);
                             coeff[1][2][c>>1] += vhadd(coeff12v);
 
-    #endif
+#endif
                             for (; cc < cc1 - 8; cc += 2, indx += 2) {
 
                                 //in linear interpolation, colour differences are a quadratic function of interpolation position;
@@ -825,17 +831,17 @@ float* RawImageSource::CA_correct_RT(
                         // rgb values should be floating point number between 0 and 1
                         // after white balance multipliers are applied
 
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat c65535v = F2V(65535.f);
                         vmask gmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
-    #endif
+#endif
                         for (int rr = rrmin; rr < rrmax; rr++) {
                             int row = rr + top;
                             int cc = ccmin;
                             int col = cc + left;
                             int indx = row * width + col;
                             int indx1 = rr * ts + cc;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             int c = FC(rr, cc);
                             if(c & 1) {
                                 rgb[1][indx1] = rawData[row][col] / 65535.f;
@@ -853,7 +859,7 @@ float* RawImageSource::CA_correct_RT(
                                 STVFU(rgb[1][indx1], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(1, 1, 0, 0)), val1v));
                                 STVFU(rgb[1][indx1 + 4], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(3, 3, 2, 2)), val2v));
                             }
-    #endif
+#endif
                             for (; cc < ccmax; cc++, col++, indx++, indx1++) {
                                 int c = FC(rr, cc);
                                 rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
@@ -954,15 +960,15 @@ float* RawImageSource::CA_correct_RT(
                         // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
                         if (!autoCA || fitParamsIn) {
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             const vfloat onev = F2V(1.f);
                             const vfloat epsv = F2V(eps);
-    #endif
+#endif
 
                             //manual CA correction; use red/blue slider values to set CA shift parameters
                             for (int rr = 3; rr < rr1 - 3; rr++) {
                                 int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                                 for (; cc < cc1 - 10; cc += 8, indx += 8) {
                                     //compute directional weights using image gradients
                                     vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
@@ -975,7 +981,7 @@ float* RawImageSource::CA_correct_RT(
                                     //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                     STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
                                 }
-    #endif
+#endif
                                 for (; cc < cc1 - 3; cc += 2, indx += 2) {
                                     //compute directional weights using image gradients
                                     float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
@@ -1044,7 +1050,7 @@ float* RawImageSource::CA_correct_RT(
                             int indxff = (rr + shiftvfloor[c]) * ts + cc + shifthfloor[c];
                             int indxcc = (rr + shiftvceil[c]) * ts + cc + shifthceil[c];
                             int indxcf = (rr + shiftvceil[c]) * ts + cc + shifthfloor[c];
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             vfloat shifthfracv = F2V(shifthfrac[c]);
                             vfloat shiftvfracv = F2V(shiftvfrac[c]);
                             for (; cc < cc1 - 10; cc += 8, indxfc += 8, indxff += 8, indxcc += 8, indxcf += 8, indx += 4) {
@@ -1060,7 +1066,7 @@ float* RawImageSource::CA_correct_RT(
                                 STVFU(gshift[indx], Gintv);
                             }
 
-    #endif
+#endif
                             for (; cc < cc1 - 4; cc += 2, indxfc += 2, indxff += 2, indxcc += 2, indxcf += 2, ++indx) {
                                 //perform CA correction using colour ratios or colour differences
                                 float Ginthfloor = intp(shifthfrac[c], rgb[1][indxfc], rgb[1][indxff]);
@@ -1080,18 +1086,18 @@ float* RawImageSource::CA_correct_RT(
                         shiftvfrac[0] /= 2.f;
                         shiftvfrac[2] /= 2.f;
 
-    #ifdef __SSE2__
+#ifdef __SSE2__
                         vfloat zd25v = F2V(0.25f);
                         vfloat onev = F2V(1.f);
                         vfloat zd5v = F2V(0.5f);
                         vfloat epsv = F2V(eps);
-    #endif
+#endif
                         for (int rr = 8; rr < rr1 - 8; rr++) {
                             int cc = 8 + (FC(rr, 2) & 1);
                             int c = FC(rr, cc);
                             int GRBdir0 = GRBdir[0][c];
                             int GRBdir1 = GRBdir[1][c];
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             vfloat shifthfracc = F2V(shifthfrac[c]);
                             vfloat shiftvfracc = F2V(shiftvfrac[c]);
                             for (int indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
@@ -1124,7 +1130,7 @@ float* RawImageSource::CA_correct_RT(
                                 RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
                                 STVFU(rgb[c][indx >> 1], RBint);
                             }
-    #endif
+#endif
                             for (int c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
                                 float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];
 
@@ -1172,11 +1178,11 @@ float* RawImageSource::CA_correct_RT(
                             int cc = border + (FC(rr, 2) & 1);
                             int indx = (row * width + cc + left) >> 1;
                             int indx1 = (rr * ts + cc) >> 1;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                             for (; indx < (row * width + cc1 - border - 7 + left) >> 1; indx+=4, indx1 += 4) {
                                 STVFU(RawDataTmp[indx], c65535v * LVFU(rgb[c][indx1]));
                             }
-    #endif
+#endif
                             for (; indx < (row * width + cc1 - border + left) >> 1; indx++, indx1++) {
                                 RawDataTmp[indx] = 65535.f * rgb[c][indx1];
                             }
@@ -1201,17 +1207,17 @@ float* RawImageSource::CA_correct_RT(
                     }
 
                 #pragma omp barrier
-    // copy temporary image matrix back to image matrix
+                // copy temporary image matrix back to image matrix
                 #pragma omp for
 
                 for(int row = 0; row < height; row++) {
                     int col = FC(row, 0) & 1;
                     int indx = (row * width + col) >> 1;
-    #ifdef __SSE2__
+#ifdef __SSE2__
                     for(; col < width - 7; col += 8, indx += 4) {
                         STC2VFU(rawData[row][col], LVFU(RawDataTmp[indx]));
                     }
-    #endif
+#endif
                     for(; col < width - (W & 1); col += 2, indx++) {
                         rawData[row][col] = RawDataTmp[indx];
                     }
@@ -1244,29 +1250,42 @@ float* RawImageSource::CA_correct_RT(
     if (avoidColourshift) {
         array2D<float> redFactor((W+1)/2, (H+1)/2);
         array2D<float> blueFactor((W+1)/2, (H+1)/2);
+        array2D<float>* nonGreen;
 
-        for(int i = 0; i < H; ++i) {
-            for(int j = 0; j < W; ++j) {
-                float factor = (rawData[i][j] * oldraw[i][j] == 0.0 ? 1.0 : oldraw[i][j] / rawData[i][j]);
-                if(FC(i,j) == 0) {
-                    redFactor[i/2][j/2] = factor;
-                } else if(FC(i,j) == 2) {
-                    blueFactor[i/2][j/2] = factor;
-                }
-            }
-        }
-        gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
-        gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
-
-        for(int i = 0; i < H; ++i) {
-            for(int j = 0; j < W; ++j) {
-                if(FC(i,j) == 0) {
-                    rawData[i][j] *= redFactor[i/2][j/2];
-                } else if(FC(i,j) == 2) {
-                    rawData[i][j] *= blueFactor[i/2][j/2];
+        #pragma omp parallel
+        {
+            #pragma omp for
+            for(int i = 0; i < H; ++i) {
+                int firstCol = FC(i, 0) & 1;
+                int colour = FC(i, firstCol);
+                nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                int j = firstCol;
+                for(; j < W - 1; j += 2) {
+                    (*nonGreen)[i/2][j/2] = (rawData[i][j] * (*oldraw)[i][j / 2] == 0.0 ? 1.0 : (*oldraw)[i][j / 2] / rawData[i][j]);
+                }
+                if (j < W) {
+                    (*nonGreen)[i/2][j/2] = (rawData[i][j] * (*oldraw)[i][j / 2] == 0.0 ? 1.0 : (*oldraw)[i][j / 2] / rawData[i][j]);
+                }
+            }
+
+            gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
+            gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
+
+            #pragma omp for
+            for(int i = 0; i < H; ++i) {
+                int firstCol = FC(i, 0) & 1;
+                int colour = FC(i, firstCol);
+                nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                int j = firstCol;
+                for(; j < W - 1; j += 2) {
+                    rawData[i][j] *= (*nonGreen)[i/2][j/2];
+                }
+                if (j < W) {
+                    rawData[i][j] *= (*nonGreen)[i/2][j/2];
                 }
             }
         }
+        delete oldraw;
     }
 
     if(plistener) {

From fac0e8ee789a9c7852bc2e869f217c2f96712919 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 6 Sep 2018 17:59:08 +0200
Subject: [PATCH 08/22] raw ca correction: fix blob artifacts when avoid
 colourshift is enabled, #4777

---
 rtengine/CA_correct_RT.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index b6c4b0375..be85fff47 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1252,6 +1252,12 @@ float* RawImageSource::CA_correct_RT(
         array2D<float> blueFactor((W+1)/2, (H+1)/2);
         array2D<float>* nonGreen;
 
+        for(int i = 0; i < (H+1)/2; ++i) {
+            for(int j = 0; j < (W+1)/2; ++j) {
+                redFactor[i][j] = blueFactor[i][j] = 1.f;
+            }
+        }
+
         #pragma omp parallel
         {
             #pragma omp for
@@ -1261,10 +1267,10 @@ float* RawImageSource::CA_correct_RT(
                 nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
-                    (*nonGreen)[i/2][j/2] = (rawData[i][j] * (*oldraw)[i][j / 2] == 0.0 ? 1.0 : (*oldraw)[i][j / 2] / rawData[i][j]);
+                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 1.5f);
                 }
                 if (j < W) {
-                    (*nonGreen)[i/2][j/2] = (rawData[i][j] * (*oldraw)[i][j / 2] == 0.0 ? 1.0 : (*oldraw)[i][j / 2] / rawData[i][j]);
+                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 1.5f);
                 }
             }
 

From c9d89c9b1b4a497b9a5769ab998c79d43af4654a Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Thu, 6 Sep 2018 19:31:01 +0200
Subject: [PATCH 09/22] raw ca correction. Be less restrictive during test
 phase

---
 rtengine/CA_correct_RT.cc | 4 ++--
 rtgui/rawcacorrection.cc  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index be85fff47..7abd9bfcc 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1267,10 +1267,10 @@ float* RawImageSource::CA_correct_RT(
                 nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
-                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 1.5f);
+                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
                 }
                 if (j < W) {
-                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 1.5f);
+                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
                 }
             }
 
diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 9a1001f74..fd69ab063 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -38,7 +38,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caAutocorrect = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AUTO"), multiImage));
     caAutocorrect->setCheckBoxListener (this);
 
-    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 5, 1, 2));
+    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 10, 1, 2));
     caAutoiterations->setAdjusterListener (this);
 
     if (caAutoiterations->delay < options.adjusterMaxDelay) {

From 198989d5983beccc58da4ae4924149130898dcac Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Sep 2018 13:50:42 +0200
Subject: [PATCH 10/22] raw ca correction/avoid colour shift: fix colour cast
 when (height % 2) == 1, #4777

---
 rtengine/CA_correct_RT.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 7abd9bfcc..3d5f806de 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1274,6 +1274,20 @@ float* RawImageSource::CA_correct_RT(
                 }
             }
 
+            #pragma omp single
+            {
+                if (H % 2) {
+                    int firstCol = FC(H - 1, 0) & 1;
+                    int colour = FC(H - 1, firstCol);
+                    nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                    for (int j = 0; j < (W + 1) / 2; ++j) {
+                        (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
+                    }
+                }
+            }
+
+            #pragma omp barrier
+
             gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
             gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
 

From c7ab5ff288dcd33dff9563ba98c11e297681f98e Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Sep 2018 16:20:30 +0200
Subject: [PATCH 11/22] raw ca correction/avoid colour shift: further bugfixes,
 #4777

---
 rtengine/CA_correct_RT.cc | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 3d5f806de..322bf23f7 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1214,11 +1214,11 @@ float* RawImageSource::CA_correct_RT(
                     int col = FC(row, 0) & 1;
                     int indx = (row * width + col) >> 1;
 #ifdef __SSE2__
-                    for(; col < width - 7; col += 8, indx += 4) {
+                    for(; col < width - 7 - (3 * (W & 1)); col += 8, indx += 4) {
                         STC2VFU(rawData[row][col], LVFU(RawDataTmp[indx]));
                     }
 #endif
-                    for(; col < width - (W & 1); col += 2, indx++) {
+                    for(; col < width - (3 * (W & 1)); col += 2, indx++) {
                         rawData[row][col] = RawDataTmp[indx];
                     }
                 }
@@ -1250,13 +1250,6 @@ float* RawImageSource::CA_correct_RT(
     if (avoidColourshift) {
         array2D<float> redFactor((W+1)/2, (H+1)/2);
         array2D<float> blueFactor((W+1)/2, (H+1)/2);
-        array2D<float>* nonGreen;
-
-        for(int i = 0; i < (H+1)/2; ++i) {
-            for(int j = 0; j < (W+1)/2; ++j) {
-                redFactor[i][j] = blueFactor[i][j] = 1.f;
-            }
-        }
 
         #pragma omp parallel
         {
@@ -1264,7 +1257,7 @@ float* RawImageSource::CA_correct_RT(
             for(int i = 0; i < H; ++i) {
                 int firstCol = FC(i, 0) & 1;
                 int colour = FC(i, firstCol);
-                nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
                     (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
@@ -1279,11 +1272,21 @@ float* RawImageSource::CA_correct_RT(
                 if (H % 2) {
                     int firstCol = FC(H - 1, 0) & 1;
                     int colour = FC(H - 1, firstCol);
-                    nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                    array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                     for (int j = 0; j < (W + 1) / 2; ++j) {
                         (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
                     }
                 }
+
+                if (W % 2) {
+                    int ngRow = 1 - (FC(0, 0) & 1);
+                    int ngCol = FC(ngRow, 0) & 1;
+                    int colour = FC(ngRow, ngCol);
+                    array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                    for (int i = 0; i < (H + 1) / 2; ++i) {
+                        (*nonGreen)[i][(W + 1) / 2 - 1] = redFactor[i][(W + 1) / 2 - 2];
+                    }
+                }
             }
 
             #pragma omp barrier
@@ -1295,7 +1298,7 @@ float* RawImageSource::CA_correct_RT(
             for(int i = 0; i < H; ++i) {
                 int firstCol = FC(i, 0) & 1;
                 int colour = FC(i, firstCol);
-                nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
                     rawData[i][j] *= (*nonGreen)[i/2][j/2];

From 910b5165539fe63a8f33e386e5c91252ccd73110 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Sep 2018 17:56:55 +0200
Subject: [PATCH 12/22] raw ca correction/avoid colour shift: bugfix, #4777

---
 rtengine/CA_correct_RT.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 322bf23f7..90fb3932b 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1270,9 +1270,9 @@ float* RawImageSource::CA_correct_RT(
             #pragma omp single
             {
                 if (H % 2) {
-                    int firstCol = FC(H - 1, 0) & 1;
-                    int colour = FC(H - 1, firstCol);
-                    array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                    int firstCol = FC(0, 0) & 1;
+                    int colour = FC(0, firstCol);
+                    array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
                     for (int j = 0; j < (W + 1) / 2; ++j) {
                         (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
                     }

From becfc1a9fdf38a985a7141d12a4221cde2825ac8 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 7 Sep 2018 18:46:39 +0200
Subject: [PATCH 13/22] raw ca correction: smooth progress bar also for > 1
 iterations, #4774

---
 rtengine/CA_correct_RT.cc | 43 ++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 90fb3932b..7dd140328 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -146,6 +146,7 @@ float* RawImageSource::CA_correct_RT(
     }
     array2D<float>* oldraw = nullptr;
     if (avoidColourshift) {
+        // copy raw values before ca correction
         oldraw = new array2D<float>((W + 1) / 2, H);
         #pragma omp parallel for
         for(int i = 0; i < H; ++i) {
@@ -646,12 +647,8 @@ float* RawImageSource::CA_correct_RT(
                             if(progresscounter % 8 == 0)
                                 #pragma omp critical (cadetectpass1)
                             {
-                                progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
-
-                                if (progress > 1.0) {
-                                    progress = 1.0;
-                                }
-
+                                progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
+                                progress = std::min(progress, 1.0);
                                 plistener->setProgress(progress);
                             }
                         }
@@ -1194,12 +1191,8 @@ float* RawImageSource::CA_correct_RT(
                             if(progresscounter % 8 == 0)
                                 #pragma omp critical (cacorrect)
                             {
-                                progress += (double)(8.0 * (ts - border2) * (ts - border2)) / (2 * height * width);
-
-                                if (progress > 1.0) {
-                                    progress = 1.0;
-                                }
-
+                                progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
+                                progress = std::min(progress, 1.0);
                                 plistener->setProgress(progress);
                             }
                         }
@@ -1255,9 +1248,9 @@ float* RawImageSource::CA_correct_RT(
         {
             #pragma omp for
             for(int i = 0; i < H; ++i) {
-                int firstCol = FC(i, 0) & 1;
-                int colour = FC(i, firstCol);
-                array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                const int firstCol = FC(i, 0) & 1;
+                const int colour = FC(i, firstCol);
+                const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
                     (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
@@ -1270,19 +1263,19 @@ float* RawImageSource::CA_correct_RT(
             #pragma omp single
             {
                 if (H % 2) {
-                    int firstCol = FC(0, 0) & 1;
-                    int colour = FC(0, firstCol);
-                    array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
+                    const int firstCol = FC(0, 0) & 1;
+                    const int colour = FC(0, firstCol);
+                    const array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
                     for (int j = 0; j < (W + 1) / 2; ++j) {
                         (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
                     }
                 }
 
                 if (W % 2) {
-                    int ngRow = 1 - (FC(0, 0) & 1);
-                    int ngCol = FC(ngRow, 0) & 1;
-                    int colour = FC(ngRow, ngCol);
-                    array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                    const int ngRow = 1 - (FC(0, 0) & 1);
+                    const int ngCol = FC(ngRow, 0) & 1;
+                    const int colour = FC(ngRow, ngCol);
+                    const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                     for (int i = 0; i < (H + 1) / 2; ++i) {
                         (*nonGreen)[i][(W + 1) / 2 - 1] = redFactor[i][(W + 1) / 2 - 2];
                     }
@@ -1296,9 +1289,9 @@ float* RawImageSource::CA_correct_RT(
 
             #pragma omp for
             for(int i = 0; i < H; ++i) {
-                int firstCol = FC(i, 0) & 1;
-                int colour = FC(i, firstCol);
-                array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
+                const int firstCol = FC(i, 0) & 1;
+                const int colour = FC(i, firstCol);
+                const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
                 for(; j < W - 1; j += 2) {
                     rawData[i][j] *= (*nonGreen)[i/2][j/2];

From 09796f06942d3a08b0b940656b599fae6ee3c67e Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 8 Sep 2018 00:52:39 +0200
Subject: [PATCH 14/22] raw ca correction: beautified code

---
 rtengine/CA_correct_RT.cc  | 287 +++++++++++++++++++------------------
 rtengine/rawimagesource.cc |   8 +-
 rtengine/rawimagesource.h  |   1 -
 3 files changed, 152 insertions(+), 144 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 7dd140328..ef1e534c0 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1,11 +1,11 @@
 ////////////////////////////////////////////////////////////////
 //
-//      Chromatic Aberration Auto-correction
+//  Chromatic Aberration correction on raw bayer cfa data
 //
-//      copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
+//  copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
+//  copyright (c) for improvements (speedups, iterated correction and avoid colour shift) 2018 Ingo Weyrich <heckflosse67@gmx.de>
 //
-//
-// code dated: November 26, 2010
+//  code dated: September 8, 2018
 //
 //  CA_correct_RT.cc is free software: you can redistribute it and/or modify
 //  it under the terms of the GNU General Public License as published by
@@ -14,14 +14,13 @@
 //
 //  This program is distributed in the hope that it will be useful,
 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
-//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 //  GNU General Public License for more details.
 //
 //  You should have received a copy of the GNU General Public License
 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 //
 ////////////////////////////////////////////////////////////////
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 #include "rtengine.h"
 #include "rawimagesource.h"
@@ -51,22 +50,22 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 
     int i, j, k;
 
-    for(k = 0; k < (nDim - 1); k++) { // base row of matrix
+    for (k = 0; k < (nDim - 1); k++) { // base row of matrix
         // search of line with max element
-        double fMaxElem = fabs( pfMatr[k * nDim + k] );
+        double fMaxElem = fabs(pfMatr[k * nDim + k]);
         int m = k;
 
         for (i = k + 1; i < nDim; i++) {
-            if(fMaxElem < fabs(pfMatr[i * nDim + k]) ) {
+            if (fMaxElem < fabs(pfMatr[i * nDim + k])) {
                 fMaxElem = pfMatr[i * nDim + k];
                 m = i;
             }
         }
 
         // permutation of base line (index k) and max element line(index m)
-        if(m != k) {
-            for(i = k; i < nDim; i++) {
-                fAcc               = pfMatr[k * nDim + i];
+        if (m != k) {
+            for (i = k; i < nDim; i++) {
+                fAcc                 = pfMatr[k * nDim + i];
                 pfMatr[k * nDim + i] = pfMatr[m * nDim + i];
                 pfMatr[m * nDim + i] = fAcc;
             }
@@ -76,16 +75,16 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
             pfVect[m] = fAcc;
         }
 
-        if( pfMatr[k * nDim + k] == 0.) {
+        if (pfMatr[k * nDim + k] == 0.) {
             //linear system has no solution
             return false; // needs improvement !!!
         }
 
         // triangulation of matrix with coefficients
-        for(j = (k + 1); j < nDim; j++) { // current row of matrix
+        for (j = (k + 1); j < nDim; j++) { // current row of matrix
             fAcc = - pfMatr[j * nDim + k] / pfMatr[k * nDim + k];
 
-            for(i = k; i < nDim; i++) {
+            for (i = k; i < nDim; i++) {
                 pfMatr[j * nDim + i] = pfMatr[j * nDim + i] + fAcc * pfMatr[k * nDim + i];
             }
 
@@ -93,10 +92,10 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
         }
     }
 
-    for(k = (nDim - 1); k >= 0; k--) {
+    for (k = (nDim - 1); k >= 0; k--) {
         pfSolution[k] = pfVect[k];
 
-        for(i = (k + 1); i < nDim; i++) {
+        for (i = (k + 1); i < nDim; i++) {
             pfSolution[k] -= (pfMatr[k * nDim + i] * pfSolution[i]);
         }
 
@@ -106,8 +105,6 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
     return true;
 }
 //end of linear equation solver
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
 }
 
 using namespace std;
@@ -118,7 +115,6 @@ float* RawImageSource::CA_correct_RT(
     size_t autoIterations,
     double cared,
     double cablue,
-    double caautostrength,
     bool avoidColourshift,
     const array2D<float> &rawData,
     double* fitParamsTransfer,
@@ -132,29 +128,30 @@ float* RawImageSource::CA_correct_RT(
 // multithreaded and vectorized by Ingo Weyrich
     constexpr int ts = 128;
     constexpr int tsh = ts / 2;
-    //shifts to location of vertical and diagonal neighbors
+    //shifts to location of vertical and diagonal neighbours
     constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, v4 = 4 * ts; //, p1=-ts+1, p2=-2*ts+2, p3=-3*ts+3, m1=ts+1, m2=2*ts+2, m3=3*ts+3;
 
     // Test for RGB cfa
-    for(int i = 0; i < 2; i++) {
-        for(int j = 0; j < 2; j++) {
-            if(FC(i, j) == 3) {
-                printf("CA correction supports only RGB Colour filter arrays\n");
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 2; j++) {
+            if (FC(i, j) == 3) {
+                std::cout << "CA correction supports only RGB Colour filter arrays" << std::endl;
                 return buffer;
             }
         }
     }
+
     array2D<float>* oldraw = nullptr;
     if (avoidColourshift) {
         // copy raw values before ca correction
         oldraw = new array2D<float>((W + 1) / 2, H);
         #pragma omp parallel for
-        for(int i = 0; i < H; ++i) {
+        for (int i = 0; i < H; ++i) {
             int j = FC(i, 0) & 1;
-            for(; j < W - 1; j += 2) {
+            for (; j < W - 1; j += 2) {
                 (*oldraw)[i][j / 2] = rawData[i][j];
             }
-            if(j < W) {
+            if (j < W) {
                 (*oldraw)[i][j / 2] = rawData[i][j];
             }
         }
@@ -162,7 +159,7 @@ float* RawImageSource::CA_correct_RT(
 
     double progress = 0.0;
 
-    if(plistener) {
+    if (plistener) {
         plistener->setProgress (progress);
     }
 
@@ -180,11 +177,11 @@ float* RawImageSource::CA_correct_RT(
     if (!buffer) {
         buffer = static_cast<float*>(malloc ((height * width + vblsz * hblsz * (2 * 2 + 1)) * sizeof(float)));
     }
-    float *Gtmp = buffer;
-    float *RawDataTmp = buffer + (height * width) / 2;
+    float* Gtmp = buffer;
+    float* RawDataTmp = buffer + (height * width) / 2;
 
     //block CA shift values and weight assigned to block
-    float *const blockwt = buffer + (height * width);
+    float* const blockwt = buffer + (height * width);
     memset(blockwt, 0, vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
     float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);
 
@@ -198,12 +195,12 @@ float* RawImageSource::CA_correct_RT(
             : 1;
 
     const bool fitParamsSet = fitParamsTransfer && fitParamsIn && iterations < 2;
-    if(autoCA && fitParamsSet) {
+    if (autoCA && fitParamsSet) {
         // use stored parameters
         int index = 0;
-        for(int c = 0; c < 2; ++c) {
-            for(int d = 0; d < 2; ++d) {
-                for(int e = 0; e < 16; ++e) {
+        for (int c = 0; c < 2; ++c) {
+            for (int d = 0; d < 2; ++d) {
+                for (int e = 0; e < 16; ++e) {
                     fitparams[c][d][e] = fitParamsTransfer[index++];
                 }
             }
@@ -232,36 +229,37 @@ float* RawImageSource::CA_correct_RT(
 
             //polynomial fit coefficients
             //residual CA shift amount within a plaquette
-            float   shifthfrac[3], shiftvfrac[3];
+            float shifthfrac[3], shiftvfrac[3];
 
             // assign working space
             constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
             constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
             char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);
 
-            char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+            char * const data = (char*)((uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
 
             // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache
 
             //rgb data in a tile
             float* rgb[3];
-            rgb[0]         = (float (*)) data;
-            rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
-            rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
+            rgb[0] = (float*) data;
+            rgb[1] = (float*) (data + sizeof(float) * ts * tsh + 1 * 64);
+            rgb[2] = (float*) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
 
             if (autoCA && !fitParamsSet) {
+                constexpr float caAutostrength = 8.f;
                 //high pass filter for R/B in vertical direction
-                float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
+                float* rbhpfh  = (float*) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
                 //high pass filter for R/B in horizontal direction
-                float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
+                float* rbhpfv  = (float*) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
                 //low pass filter for R/B in horizontal direction
-                float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
+                float* rblpfh  = (float*) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
                 //low pass filter for R/B in vertical direction
-                float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
+                float* rblpfv  = (float*) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
                 //low pass filter for colour differences in horizontal direction
-                float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
+                float* grblpfh = (float*) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
                 //low pass filter for colour differences in vertical direction
-                float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
+                float* grblpfv = (float*) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
                 // Main algorithm: Tile loop calculating correction parameters per tile
 
                 //local quadratic fit to shift data within a tile
@@ -270,14 +268,16 @@ float* RawImageSource::CA_correct_RT(
                 float CAshift[2][2];
 
                 //per thread data for evaluation of block CA shift variance
-                float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
+                float blockavethr[2][2] = {};
+                float blocksqavethr[2][2] = {};
+                float blockdenomthr[2][2] = {};
 
                 #pragma omp for collapse(2) schedule(dynamic) nowait
-                for (int top = -border ; top < height; top += ts - border2)
+                for (int top = -border ; top < height; top += ts - border2) {
                     for (int left = -border; left < width - (W & 1); left += ts - border2) {
                         memset(bufferThr, 0, buffersize);
-                        const int vblock = ((top + border) / (ts - border2)) + 1;
-                        const int hblock = ((left + border) / (ts - border2)) + 1;
+                        const int vblock = (top + border) / (ts - border2) + 1;
+                        const int hblock = (left + border) / (ts - border2) + 1;
                         const int bottom = min(top + ts, height + border);
                         const int right  = min(left + ts, width - (W & 1) + border);
                         const int rr1 = bottom - top;
@@ -301,7 +301,7 @@ float* RawImageSource::CA_correct_RT(
                             int col = cc + left;
 #ifdef __SSE2__
                             int c0 = FC(rr, cc);
-                            if(c0 == 1) {
+                            if (c0 == 1) {
                                 rgb[c0][rr * ts + cc] = rawData[row][col] / 65535.f;
                                 cc++;
                                 col++;
@@ -311,7 +311,7 @@ float* RawImageSource::CA_correct_RT(
                             for (; cc < ccmax - 7; cc+=8, col+=8, indx1 += 8) {
                                 vfloat val1 = LVFU(rawData[row][col]) / c65535v;
                                 vfloat val2 = LVFU(rawData[row][col + 4]) / c65535v;
-                                vfloat nonGreenv = _mm_shuffle_ps(val1,val2,_MM_SHUFFLE( 2,0,2,0 ));
+                                vfloat nonGreenv = _mm_shuffle_ps(val1,val2,_MM_SHUFFLE(2,0,2,0));
                                 STVFU(rgb[c0][indx1 >> 1], nonGreenv);
                                 STVFU(rgb[1][indx1], val1);
                                 STVFU(rgb[1][indx1 + 4], val2);
@@ -324,78 +324,83 @@ float* RawImageSource::CA_correct_RT(
                             }
                         }
 
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                         //fill borders
                         if (rrmin > 0) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                 }
+                            }
                         }
 
                         if (rrmax < rr1) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][left + cc] / 65535.f;
                                 }
+                            }
                         }
 
                         if (ccmin > 0) {
-                            for (int rr = rrmin; rr < rrmax; rr++)
+                            for (int rr = rrmin; rr < rrmax; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                 }
+                            }
                         }
 
                         if (ccmax < cc1) {
-                            for (int rr = rrmin; rr < rrmax; rr++)
+                            for (int rr = rrmin; rr < rrmax; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(top + rr)][(width - cc - 2)] / 65535.f;
                                 }
+                            }
                         }
 
                         //also, fill the image corners
                         if (rrmin > 0 && ccmin > 0) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rawData[border2 - rr][border2 - cc] / 65535.f;
                                 }
+                            }
                         }
 
                         if (rrmax < rr1 && ccmax < cc1) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(width - cc - 2)] / 65535.f;
                                 }
+                            }
                         }
 
                         if (rrmin > 0 && ccmax < cc1) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(border2 - rr)][(width - cc - 2)] / 65535.f;
                                 }
+                            }
                         }
 
                         if (rrmax < rr1 && ccmin > 0) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(border2 - cc)] / 65535.f;
                                 }
+                            }
                         }
 
                         //end of border fill
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                         //end of initialization
 
-
 #ifdef __SSE2__
                         vfloat onev = F2V(1.f);
                         vfloat epsv = F2V(eps);
@@ -441,17 +446,16 @@ float* RawImageSource::CA_correct_RT(
                                 int col = max(left + 3, 0) + offset;
                                 int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
 #ifdef __SSE2__
-                                for(; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
+                                for (; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
                                     STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
                                 }
 #endif
-                                for(; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
+                                for (; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
                                     Gtmp[(row * width + col) >> 1] = rgb[1][indx];
                                 }
                             }
-
                         }
-                        //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 #ifdef __SSE2__
                         vfloat zd25v = F2V(0.25f);
 #endif
@@ -486,7 +490,6 @@ float* RawImageSource::CA_correct_RT(
                                 STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
                                 STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                             }
-
 #endif
                             for (; cc < cc1 - 4; cc += 2, indx += 2) {
                                 rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
@@ -534,7 +537,6 @@ float* RawImageSource::CA_correct_RT(
                             vfloat coeff11v = ZEROV;
                             vfloat coeff12v = ZEROV;
                             for (; cc < cc1 - 14; cc += 8, indx += 8) {
-
                                 //in linear interpolation, colour differences are a quadratic function of interpolation position;
                                 //solve for the interpolation position that minimizes colour difference variance over the tile
 
@@ -569,7 +571,6 @@ float* RawImageSource::CA_correct_RT(
 
 #endif
                             for (; cc < cc1 - 8; cc += 2, indx += 2) {
-
                                 //in linear interpolation, colour differences are a quadratic function of interpolation position;
                                 //solve for the interpolation position that minimizes colour difference variance over the tile
 
@@ -577,7 +578,7 @@ float* RawImageSource::CA_correct_RT(
                                 float gdiff = (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.3f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
                                 float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);
 
-                                float gradwt = (rbhpfv[indx >> 1] + 0.5f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1]) ) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);
+                                float gradwt = (rbhpfv[indx >> 1] + 0.5f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1])) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);
 
                                 coeff[0][0][c>>1] += gradwt * deltgrb * deltgrb;
                                 coeff[0][1][c>>1] += gradwt * gdiff * deltgrb;
@@ -586,7 +587,7 @@ float* RawImageSource::CA_correct_RT(
                                 //horizontal
                                 gdiff = (rgb[1][indx + 1] - rgb[1][indx - 1]) + 0.3f * (rgb[1][indx + 1 + ts] - rgb[1][indx - 1 + ts] + rgb[1][indx + 1 - ts] - rgb[1][indx - 1 - ts]);
 
-                                gradwt = (rbhpfh[indx >> 1] + 0.5f * (rbhpfh[(indx >> 1) + v1] + rbhpfh[(indx >> 1) - v1]) ) * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) / (eps + 0.1f * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) + rblpfh[(indx >> 1) - 1] + rblpfh[(indx >> 1) + 1]);
+                                gradwt = (rbhpfh[indx >> 1] + 0.5f * (rbhpfh[(indx >> 1) + v1] + rbhpfh[(indx >> 1) - v1])) * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) / (eps + 0.1f * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) + rblpfh[(indx >> 1) - 1] + rblpfh[(indx >> 1) + 1]);
 
                                 coeff[1][0][c>>1] += gradwt * deltgrb * deltgrb;
                                 coeff[1][1][c>>1] += gradwt * gdiff * deltgrb;
@@ -603,9 +604,9 @@ float* RawImageSource::CA_correct_RT(
                             for (int k = 0; k < 3; k++) {
                                 for (int c = 0; c < 2; c++) {
                                     coeff[dir][k][c] *= 0.25f;
-                                    if(k == 1) {
+                                    if (k == 1) {
                                         coeff[dir][k][c] *= 0.3125f;
-                                    } else if(k == 2) {
+                                    } else if (k == 2) {
                                         coeff[dir][k][c] *= SQR(0.3125f);
                                     }
                                 }
@@ -637,33 +638,33 @@ float* RawImageSource::CA_correct_RT(
                                 }
                                 //evaluate the shifts to the location that minimizes CA within the tile
                                 blockshifts[vblock * hblsz + hblock][c][dir] = CAshift[dir][c]; //vert/hor CA shift for R/B
-
                             }//vert/hor
                         }//colour
 
-                        if(plistener) {
+                        if (plistener) {
                             progresscounter++;
 
-                            if(progresscounter % 8 == 0)
+                            if (progresscounter % 8 == 0) {
                                 #pragma omp critical (cadetectpass1)
-                            {
-                                progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
-                                progress = std::min(progress, 1.0);
-                                plistener->setProgress(progress);
+                                {
+                                    progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
+                                    progress = std::min(progress, 1.0);
+                                    plistener->setProgress(progress);
+                                }
                             }
                         }
-
                     }
-
+                }
                 //end of diagnostic pass
                 #pragma omp critical (cadetectpass2)
                 {
-                    for (int dir = 0; dir < 2; dir++)
+                    for (int dir = 0; dir < 2; dir++) {
                         for (int c = 0; c < 2; c++) {
                             blockdenom[dir][c] += blockdenomthr[dir][c];
                             blocksqave[dir][c] += blocksqavethr[dir][c];
                             blockave[dir][c]   += blockavethr[dir][c];
                         }
+                    }
                 }
                 #pragma omp barrier
 
@@ -675,16 +676,14 @@ float* RawImageSource::CA_correct_RT(
                                 blockvar[dir][c] = blocksqave[dir][c] / blockdenom[dir][c] - SQR(blockave[dir][c] / blockdenom[dir][c]);
                             } else {
                                 processpasstwo = false;
-                                printf ("blockdenom vanishes \n");
+                                std::cout << "blockdenom vanishes" << std::endl;
                                 break;
                             }
                         }
 
-                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
                     //now prepare for CA correction pass
                     //first, fill border blocks of blockshift array
-                    if(processpasstwo) {
+                    if (processpasstwo) {
                         for (int vblock = 1; vblock < vblsz - 1; vblock++) { //left and right sides
                             for (int c = 0; c < 2; c++) {
                                 for (int i = 0; i < 2; i++) {
@@ -718,7 +717,7 @@ float* RawImageSource::CA_correct_RT(
 
                         int numblox[2] = {0, 0};
 
-                        for (int vblock = 1; vblock < vblsz - 1; vblock++)
+                        for (int vblock = 1; vblock < vblsz - 1; vblock++) {
                             for (int hblock = 1; hblock < hblsz - 1; hblock++) {
                                 // block 3x3 median of blockshifts for robustness
                                 for (int c = 0; c < 2; c ++) {
@@ -739,8 +738,8 @@ float* RawImageSource::CA_correct_RT(
                                         bstemp[dir] = median(p);
                                     }
 
-                                    //now prepare coefficient matrix; use only data points within caautostrength/2 std devs of zero
-                                    if (SQR(bstemp[0]) > caautostrength * blockvar[0][c] || SQR(bstemp[1]) > caautostrength * blockvar[1][c]) {
+                                    //now prepare coefficient matrix; use only data points within caAutostrength/2 std devs of zero
+                                    if (SQR(bstemp[0]) > caAutostrength * blockvar[0][c] || SQR(bstemp[1]) > caAutostrength * blockvar[1][c]) {
                                         continue;
                                     }
 
@@ -768,7 +767,7 @@ float* RawImageSource::CA_correct_RT(
                                     }//dir
                                 }//c
                             }//blocks
-
+                        }
                         numblox[1] = min(numblox[0], numblox[1]);
 
                         //if too few data points, restrict the order of the fit to linear
@@ -777,24 +776,23 @@ float* RawImageSource::CA_correct_RT(
                             numpar = 4;
 
                             if (numblox[1] < 10) {
-
-                                printf ("numblox = %d \n", numblox[1]);
+                                std::cout << "numblox = " << numblox[1] << std::endl;
                                 processpasstwo = false;
                             }
                         }
 
-                        if(processpasstwo)
-
+                        if (processpasstwo) {
                             //fit parameters to blockshifts
-                            for (int c = 0; c < 2; c++)
+                            for (int c = 0; c < 2; c++) {
                                 for (int dir = 0; dir < 2; dir++) {
                                     if (!LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir])) {
-                                        printf("CA correction pass failed -- can't solve linear equations for colour %d direction %d...\n", c, dir);
+                                        std::cout << "CA correction pass failed -- can't solve linear equations for colour %d direction " << c << std::endl;
                                         processpasstwo = false;
                                     }
                                 }
+                            }
+                        }
                     }
-
                     //fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
                 }
                 //end of initialization for CA correction pass
@@ -802,13 +800,13 @@ float* RawImageSource::CA_correct_RT(
             }
 
             // Main algorithm: Tile loop
-            if(processpasstwo) {
-                float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
+            if (processpasstwo) {
+                float* grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
                 //green interpolated to optical sample points for R/B
-                float *gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
+                float* gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
                 #pragma omp for schedule(dynamic) collapse(2) nowait
 
-                for (int top = -border; top < height; top += ts - border2)
+                for (int top = -border; top < height; top += ts - border2) {
                     for (int left = -border; left < width - (W & 1); left += ts - border2) {
                         memset(bufferThr, 0, buffersizePassTwo);
                         float lblockshifts[2][2];
@@ -840,7 +838,7 @@ float* RawImageSource::CA_correct_RT(
                             int indx1 = rr * ts + cc;
 #ifdef __SSE2__
                             int c = FC(rr, cc);
-                            if(c & 1) {
+                            if (c & 1) {
                                 rgb[1][indx1] = rawData[row][col] / 65535.f;
                                 indx++;
                                 indx1++;
@@ -866,19 +864,20 @@ float* RawImageSource::CA_correct_RT(
                                 }
                             }
                         }
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
                         //fill borders
                         if (rrmin > 0) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                     rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
                                 }
+                            }
                         }
 
                         if (rrmax < rr1) {
-                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                 for (int cc = ccmin; cc < ccmax; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.f;
@@ -886,19 +885,21 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + left + cc) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         if (ccmin > 0) {
-                            for (int rr = rrmin; rr < rrmax; rr++)
+                            for (int rr = rrmin; rr < rrmax; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                     rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
                                 }
+                            }
                         }
 
                         if (ccmax < cc1) {
-                            for (int rr = rrmin; rr < rrmax; rr++)
+                            for (int rr = rrmin; rr < rrmax; rr++) {
                                 for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.f;
@@ -906,11 +907,12 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][rr * ts + ccmax + cc] = Gtmp[((top + rr) * width + (width - cc - 2)) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         //also, fill the image corners
                         if (rrmin > 0 && ccmin > 0) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.f;
@@ -918,10 +920,11 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][rr * ts + cc] = Gtmp[((border2 - rr) * width + border2 - cc) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         if (rrmax < rr1 && ccmax < cc1) {
-                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                 for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.f;
@@ -929,10 +932,11 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[((height - rr - 2) * width + (width - cc - 2)) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         if (rrmin > 0 && ccmax < cc1) {
-                            for (int rr = 0; rr < border; rr++)
+                            for (int rr = 0; rr < border; rr++) {
                                 for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.f;
@@ -940,10 +944,11 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][rr * ts + ccmax + cc] = Gtmp[((border2 - rr) * width + (width - cc - 2)) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         if (rrmax < rr1 && ccmin > 0) {
-                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++)
+                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                 for (int cc = 0; cc < border; cc++) {
                                     int c = FC(rr, cc);
                                     rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.f;
@@ -951,17 +956,16 @@ float* RawImageSource::CA_correct_RT(
                                         rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + (border2 - cc)) >> 1];
                                     }
                                 }
+                            }
                         }
 
                         //end of border fill
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
                         if (!autoCA || fitParamsIn) {
 #ifdef __SSE2__
                             const vfloat onev = F2V(1.f);
                             const vfloat epsv = F2V(eps);
 #endif
-
                             //manual CA correction; use red/blue slider values to set CA shift parameters
                             for (int rr = 3; rr < rr1 - 3; rr++) {
                                 int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
@@ -991,6 +995,7 @@ float* RawImageSource::CA_correct_RT(
                                 }
                             }
                         }
+
                         if (!autoCA) {
                             float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
                             float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
@@ -1035,7 +1040,6 @@ float* RawImageSource::CA_correct_RT(
 
                             GRBdir[0][c] = lblockshifts[c>>1][0] > 0 ? 2 : -2;
                             GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;
-
                         }
 
 
@@ -1109,7 +1113,7 @@ float* RawImageSource::CA_correct_RT(
                                 vfloat rinv = LC2VFU(rgb[1][indx]);
                                 vfloat RBint = rinv - grbdiffint;
                                 vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
-                                if(_mm_movemask_ps((vfloat)cmask)) {
+                                if (_mm_movemask_ps((vfloat)cmask)) {
                                     // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
                                     //gradient weights using difference from G at CA shift points and G at grid points
                                     vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
@@ -1141,7 +1145,7 @@ float* RawImageSource::CA_correct_RT(
                                 float RBint = rgb[1][indx] - grbdiffint;
 
                                 if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
-                                    if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
+                                    if (fabsf(grbdiffold) > fabsf(grbdiffint)) {
                                         rgb[c][indx >> 1] = RBint;
                                     }
                                 } else {
@@ -1156,7 +1160,7 @@ float* RawImageSource::CA_correct_RT(
                                                   p2 * grbdiff[((rr - GRBdir0) * ts + cc) >> 1] + p3 * grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]) / (p0 + p1 + p2 + p3) ;
 
                                     //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
-                                    if (fabsf(grbdiffold) > fabsf(grbdiffint) ) {
+                                    if (fabsf(grbdiffold) > fabsf(grbdiffint)) {
                                         rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
                                     }
                                 }
@@ -1185,10 +1189,10 @@ float* RawImageSource::CA_correct_RT(
                             }
                         }
 
-                        if(plistener) {
+                        if (plistener) {
                             progresscounter++;
 
-                            if(progresscounter % 8 == 0)
+                            if (progresscounter % 8 == 0)
                                 #pragma omp critical (cacorrect)
                             {
                                 progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
@@ -1196,63 +1200,66 @@ float* RawImageSource::CA_correct_RT(
                                 plistener->setProgress(progress);
                             }
                         }
-
                     }
+                }
 
                 #pragma omp barrier
                 // copy temporary image matrix back to image matrix
                 #pragma omp for
 
-                for(int row = 0; row < height; row++) {
+                for (int row = 0; row < height; row++) {
                     int col = FC(row, 0) & 1;
                     int indx = (row * width + col) >> 1;
 #ifdef __SSE2__
-                    for(; col < width - 7 - (3 * (W & 1)); col += 8, indx += 4) {
+                    for (; col < width - 7 - (3 * (W & 1)); col += 8, indx += 4) {
                         STC2VFU(rawData[row][col], LVFU(RawDataTmp[indx]));
                     }
 #endif
-                    for(; col < width - (3 * (W & 1)); col += 2, indx++) {
+                    for (; col < width - (3 * (W & 1)); col += 2, indx++) {
                         rawData[row][col] = RawDataTmp[indx];
                     }
                 }
 
             }
-
             // clean up
             free(bufferThr);
         }
     }
-    if(autoCA && fitParamsTransfer && fitParamsOut) {
+    if (autoCA && fitParamsTransfer && fitParamsOut) {
         // store calculated parameters
         int index = 0;
-        for(int c = 0; c < 2; ++c) {
-            for(int d = 0; d < 2; ++d) {
-                for(int e = 0; e < 16; ++e) {
+        for (int c = 0; c < 2; ++c) {
+            for (int d = 0; d < 2; ++d) {
+                for (int e = 0; e < 16; ++e) {
                     fitParamsTransfer[index++] = fitparams[c][d][e];
                 }
             }
         }
     }
 
-    if(freeBuffer) {
+    if (freeBuffer) {
         free(buffer);
         buffer = nullptr;
     }
 
 
     if (avoidColourshift) {
+        // to avoid or at least reduce the colour shift caused by raw ca correction we compute the per pixel difference factors
+        // of red and blue channel and apply a gaussian blur on them.
+        // Then we apply the resulting factors per pixel on the result of raw ca correction
+
         array2D<float> redFactor((W+1)/2, (H+1)/2);
         array2D<float> blueFactor((W+1)/2, (H+1)/2);
 
         #pragma omp parallel
         {
             #pragma omp for
-            for(int i = 0; i < H; ++i) {
+            for (int i = 0; i < H; ++i) {
                 const int firstCol = FC(i, 0) & 1;
                 const int colour = FC(i, firstCol);
                 const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
-                for(; j < W - 1; j += 2) {
+                for (; j < W - 1; j += 2) {
                     (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
                 }
                 if (j < W) {
@@ -1263,6 +1270,7 @@ float* RawImageSource::CA_correct_RT(
             #pragma omp single
             {
                 if (H % 2) {
+                    // odd height => factors for one one channel are not set in last row => use values of preceding row
                     const int firstCol = FC(0, 0) & 1;
                     const int colour = FC(0, firstCol);
                     const array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
@@ -1272,6 +1280,7 @@ float* RawImageSource::CA_correct_RT(
                 }
 
                 if (W % 2) {
+                    // odd width => factors for one one channel are not set in last column => use value of preceding column
                     const int ngRow = 1 - (FC(0, 0) & 1);
                     const int ngCol = FC(ngRow, 0) & 1;
                     const int colour = FC(ngRow, ngCol);
@@ -1282,18 +1291,18 @@ float* RawImageSource::CA_correct_RT(
                 }
             }
 
-            #pragma omp barrier
-
+            // blur correction factors
             gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
             gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
 
+            // apply correction factors to avoid (reduce) colour shift
             #pragma omp for
-            for(int i = 0; i < H; ++i) {
+            for (int i = 0; i < H; ++i) {
                 const int firstCol = FC(i, 0) & 1;
                 const int colour = FC(i, firstCol);
                 const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
-                for(; j < W - 1; j += 2) {
+                for (; j < W - 1; j += 2) {
                     rawData[i][j] *= (*nonGreen)[i/2][j/2];
                 }
                 if (j < W) {
@@ -1304,7 +1313,7 @@ float* RawImageSource::CA_correct_RT(
         delete oldraw;
     }
 
-    if(plistener) {
+    if (plistener) {
         plistener->setProgress(1.0);
     }
     return buffer;
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 77c6285be..1396ae245 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2009,13 +2009,13 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         }
         if(numFrames == 4) {
             double fitParams[64];
-            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[0], fitParams, false, true, nullptr, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[i], fitParams, true, false, buffer, false);
+                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, *rawDataFrames[3], fitParams, true, false, buffer, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[3], fitParams, true, false, buffer, true);
         } else {
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, 8.0, raw.ca_avoidcolourshift, rawData, nullptr, false, false, nullptr, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, rawData, nullptr, false, false, nullptr, true);
         }
     }
 
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index 972e1fe64..ad7807a44 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -244,7 +244,6 @@ protected:
         size_t autoIterations,
         double cared,
         double cablue,
-        double caautostrength,
         bool avoidColourshift,
         const array2D<float> &rawData,
         double* fitParamsTransfer,

From ec4115512a3779b07ee1bc6398d069cde58ce40a Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 8 Sep 2018 13:08:44 +0200
Subject: [PATCH 15/22] raw ca correction/avoid colour shift: Vectorized one
 loop, #4777

---
 rtengine/CA_correct_RT.cc | 45 +++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index ef1e534c0..9cf1af145 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -147,11 +147,7 @@ float* RawImageSource::CA_correct_RT(
         oldraw = new array2D<float>((W + 1) / 2, H);
         #pragma omp parallel for
         for (int i = 0; i < H; ++i) {
-            int j = FC(i, 0) & 1;
-            for (; j < W - 1; j += 2) {
-                (*oldraw)[i][j / 2] = rawData[i][j];
-            }
-            if (j < W) {
+            for (int j = FC(i, 0) & 1; j < W; j += 2) {
                 (*oldraw)[i][j / 2] = rawData[i][j];
             }
         }
@@ -804,8 +800,7 @@ float* RawImageSource::CA_correct_RT(
                 float* grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
                 //green interpolated to optical sample points for R/B
                 float* gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
-                #pragma omp for schedule(dynamic) collapse(2) nowait
-
+                #pragma omp for schedule(dynamic) collapse(2)
                 for (int top = -border; top < height; top += ts - border2) {
                     for (int left = -border; left < width - (W & 1); left += ts - border2) {
                         memset(bufferThr, 0, buffersizePassTwo);
@@ -958,7 +953,6 @@ float* RawImageSource::CA_correct_RT(
                                 }
                             }
                         }
-
                         //end of border fill
 
                         if (!autoCA || fitParamsIn) {
@@ -1026,7 +1020,6 @@ float* RawImageSource::CA_correct_RT(
                             lblockshifts[1][1] = LIM(lblockshifts[1][1], -bslim, bslim);
                         }//end of setting CA shift parameters
 
-
                         for (int c = 0; c < 3; c += 2) {
 
                             //some parameters for the bilinear interpolation
@@ -1042,7 +1035,6 @@ float* RawImageSource::CA_correct_RT(
                             GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;
                         }
 
-
                         for (int rr = 4; rr < rr1 - 4; rr++) {
                             int cc = 4 + (FC(rr, 2) & 1);
                             int c = FC(rr, cc);
@@ -1203,7 +1195,6 @@ float* RawImageSource::CA_correct_RT(
                     }
                 }
 
-                #pragma omp barrier
                 // copy temporary image matrix back to image matrix
                 #pragma omp for
 
@@ -1245,7 +1236,7 @@ float* RawImageSource::CA_correct_RT(
 
     if (avoidColourshift) {
         // to avoid or at least reduce the colour shift caused by raw ca correction we compute the per pixel difference factors
-        // of red and blue channel and apply a gaussian blur on them.
+        // of red and blue channel and apply a gaussian blur to them.
         // Then we apply the resulting factors per pixel on the result of raw ca correction
 
         array2D<float> redFactor((W+1)/2, (H+1)/2);
@@ -1253,24 +1244,36 @@ float* RawImageSource::CA_correct_RT(
 
         #pragma omp parallel
         {
+#ifdef __SSE2__
+            const vfloat onev = F2V(1.f);
+            const vfloat twov = F2V(2.f);
+            const vfloat zd5v = F2V(0.5f);
+#endif
             #pragma omp for
             for (int i = 0; i < H; ++i) {
                 const int firstCol = FC(i, 0) & 1;
                 const int colour = FC(i, firstCol);
                 const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
                 int j = firstCol;
-                for (; j < W - 1; j += 2) {
-                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
+#ifdef __SSE2__
+                for (; j < W - 7; j += 8) {
+                    const vfloat newvals = LC2VFU(rawData[i][j]);
+                    const vfloat oldvals = LVFU((*oldraw)[i][j / 2]);
+                    vfloat factors = oldvals / newvals;
+                    factors = vself(vmaskf_le(newvals, onev), onev, factors);
+                    factors = vself(vmaskf_le(oldvals, onev), onev, factors);
+                    STVFU((*nonGreen)[i/2][j/2], LIMV(factors, zd5v, twov));
                 }
-                if (j < W) {
-                    (*nonGreen)[i/2][j/2] = rtengine::LIM((rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : (*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
+#endif
+                for (; j < W; j += 2) {
+                    (*nonGreen)[i/2][j/2] = (rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : rtengine::LIM((*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
                 }
             }
 
             #pragma omp single
             {
                 if (H % 2) {
-                    // odd height => factors for one one channel are not set in last row => use values of preceding row
+                    // odd height => factors for one channel are not set in last row => use values of preceding row
                     const int firstCol = FC(0, 0) & 1;
                     const int colour = FC(0, firstCol);
                     const array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
@@ -1280,7 +1283,7 @@ float* RawImageSource::CA_correct_RT(
                 }
 
                 if (W % 2) {
-                    // odd width => factors for one one channel are not set in last column => use value of preceding column
+                    // odd width => factors for one channel are not set in last column => use value of preceding column
                     const int ngRow = 1 - (FC(0, 0) & 1);
                     const int ngCol = FC(ngRow, 0) & 1;
                     const int colour = FC(ngRow, ngCol);
@@ -1301,11 +1304,7 @@ float* RawImageSource::CA_correct_RT(
                 const int firstCol = FC(i, 0) & 1;
                 const int colour = FC(i, firstCol);
                 const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
-                int j = firstCol;
-                for (; j < W - 1; j += 2) {
-                    rawData[i][j] *= (*nonGreen)[i/2][j/2];
-                }
-                if (j < W) {
+                for (int j = firstCol; j < W; j += 2) {
                     rawData[i][j] *= (*nonGreen)[i/2][j/2];
                 }
             }

From 88d0f60d6bb3e561d1f498f456786afaa3b98b19 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 8 Sep 2018 13:09:55 +0200
Subject: [PATCH 16/22] raw ca correction/avoid colour shift: fire event only
 if raw ca correction is active, #4777

---
 rtgui/rawcacorrection.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index fd69ab063..02b7a65e3 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -147,7 +147,7 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
             listener->panelChanged (EvPreProcessAutoCA, caAutocorrect->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
     } else if (c == caAvoidcolourshift) {
-        if (listener) {
+        if (listener && (caAutocorrect->getLastActive() || caRed->getValue() != 0 || caBlue->getValue() != 0)) {
             listener->panelChanged (EvPreProcessCAColourshift, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
     }

From ddbf94e94099f953811baaf1bfaaa11ac2e2dd2b Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 8 Sep 2018 13:26:23 +0200
Subject: [PATCH 17/22] raw ca correction: generate history entry but don't
 reprocess when changing 'avoid colour shift' while ca correction is disabled,
 #4777

---
 rtgui/rawcacorrection.cc | 5 +++--
 rtgui/rawcacorrection.h  | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 02b7a65e3..dc5657ece 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -29,6 +29,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     auto m = ProcEventMapper::getInstance();
     EvPreProcessCAAutoiterations = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_AUTOIT");
     EvPreProcessCAColourshift = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
+    EvPreProcessCAColourshiftHistory = m->newEvent(M_VOID, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
 
     Gtk::Image* icaredL =   Gtk::manage (new RTImage ("circle-red-cyan-small.png"));
     Gtk::Image* icaredR =   Gtk::manage (new RTImage ("circle-cyan-red-small.png"));
@@ -147,8 +148,8 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
             listener->panelChanged (EvPreProcessAutoCA, caAutocorrect->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
     } else if (c == caAvoidcolourshift) {
-        if (listener && (caAutocorrect->getLastActive() || caRed->getValue() != 0 || caBlue->getValue() != 0)) {
-            listener->panelChanged (EvPreProcessCAColourshift, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
+        if (listener) {
+            listener->panelChanged ((caAutocorrect->getLastActive() || caRed->getValue() != 0 || caBlue->getValue() != 0) ? EvPreProcessCAColourshift : EvPreProcessCAColourshiftHistory, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
     }
 }
diff --git a/rtgui/rawcacorrection.h b/rtgui/rawcacorrection.h
index 614bd4b90..dea9ef738 100644
--- a/rtgui/rawcacorrection.h
+++ b/rtgui/rawcacorrection.h
@@ -36,6 +36,7 @@ protected:
 
     rtengine::ProcEvent EvPreProcessCAAutoiterations;
     rtengine::ProcEvent EvPreProcessCAColourshift;
+    rtengine::ProcEvent EvPreProcessCAColourshiftHistory;
 
 public:
 

From f85ec677ffc8b29831c66add0665fbd8da20e275 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 9 Sep 2018 12:53:44 +0200
Subject: [PATCH 18/22] raw ca correction: avoid colour shift per iteration,
 #4777

---
 rtengine/CA_correct_RT.cc | 157 ++++++++++++++++++++------------------
 1 file changed, 81 insertions(+), 76 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 9cf1af145..81fd8d593 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -141,10 +141,14 @@ float* RawImageSource::CA_correct_RT(
         }
     }
 
+    array2D<float>* redFactor = nullptr;
+    array2D<float>* blueFactor = nullptr;
     array2D<float>* oldraw = nullptr;
     if (avoidColourshift) {
-        // copy raw values before ca correction
+        redFactor = new array2D<float>((W+1)/2, (H+1)/2);
+        blueFactor = new array2D<float>((W+1)/2, (H+1)/2);
         oldraw = new array2D<float>((W + 1) / 2, H);
+        // copy raw values before ca correction
         #pragma omp parallel for
         for (int i = 0; i < H; ++i) {
             for (int j = FC(i, 0) & 1; j < W; j += 2) {
@@ -1215,7 +1219,81 @@ float* RawImageSource::CA_correct_RT(
             // clean up
             free(bufferThr);
         }
+        if (avoidColourshift) {
+            // to avoid or at least reduce the colour shift caused by raw ca correction we compute the per pixel difference factors
+            // of red and blue channel and apply a gaussian blur to them.
+            // Then we apply the resulting factors per pixel on the result of raw ca correction
+
+            #pragma omp parallel
+            {
+    #ifdef __SSE2__
+                const vfloat onev = F2V(1.f);
+                const vfloat twov = F2V(2.f);
+                const vfloat zd5v = F2V(0.5f);
+    #endif
+                #pragma omp for
+                for (int i = 0; i < H; ++i) {
+                    const int firstCol = FC(i, 0) & 1;
+                    const int colour = FC(i, firstCol);
+                    const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
+                    int j = firstCol;
+    #ifdef __SSE2__
+                    for (; j < W - 7; j += 8) {
+                        const vfloat newvals = LC2VFU(rawData[i][j]);
+                        const vfloat oldvals = LVFU((*oldraw)[i][j / 2]);
+                        vfloat factors = oldvals / newvals;
+                        factors = vself(vmaskf_le(newvals, onev), onev, factors);
+                        factors = vself(vmaskf_le(oldvals, onev), onev, factors);
+                        STVFU((*nonGreen)[i/2][j/2], LIMV(factors, zd5v, twov));
+                    }
+    #endif
+                    for (; j < W; j += 2) {
+                        (*nonGreen)[i/2][j/2] = (rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : rtengine::LIM((*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
+                    }
+                }
+
+                #pragma omp single
+                {
+                    if (H % 2) {
+                        // odd height => factors for one channel are not set in last row => use values of preceding row
+                        const int firstCol = FC(0, 0) & 1;
+                        const int colour = FC(0, firstCol);
+                        const array2D<float>* nonGreen = colour == 0 ? blueFactor : redFactor;
+                        for (int j = 0; j < (W + 1) / 2; ++j) {
+                            (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
+                        }
+                    }
+
+                    if (W % 2) {
+                        // odd width => factors for one channel are not set in last column => use value of preceding column
+                        const int ngRow = 1 - (FC(0, 0) & 1);
+                        const int ngCol = FC(ngRow, 0) & 1;
+                        const int colour = FC(ngRow, ngCol);
+                        const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
+                        for (int i = 0; i < (H + 1) / 2; ++i) {
+                            (*nonGreen)[i][(W + 1) / 2 - 1] = (*nonGreen)[i][(W + 1) / 2 - 2];
+                        }
+                    }
+                }
+
+                // blur correction factors
+                gaussianBlur(*redFactor, *redFactor, (W+1)/2, (H+1)/2, 30.0);
+                gaussianBlur(*blueFactor, *blueFactor, (W+1)/2, (H+1)/2, 30.0);
+
+                // apply correction factors to avoid (reduce) colour shift
+                #pragma omp for
+                for (int i = 0; i < H; ++i) {
+                    const int firstCol = FC(i, 0) & 1;
+                    const int colour = FC(i, firstCol);
+                    const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
+                    for (int j = firstCol; j < W; j += 2) {
+                        rawData[i][j] *= (*nonGreen)[i/2][j/2];
+                    }
+                }
+            }
+        }
     }
+
     if (autoCA && fitParamsTransfer && fitParamsOut) {
         // store calculated parameters
         int index = 0;
@@ -1233,83 +1311,10 @@ float* RawImageSource::CA_correct_RT(
         buffer = nullptr;
     }
 
-
     if (avoidColourshift) {
-        // to avoid or at least reduce the colour shift caused by raw ca correction we compute the per pixel difference factors
-        // of red and blue channel and apply a gaussian blur to them.
-        // Then we apply the resulting factors per pixel on the result of raw ca correction
-
-        array2D<float> redFactor((W+1)/2, (H+1)/2);
-        array2D<float> blueFactor((W+1)/2, (H+1)/2);
-
-        #pragma omp parallel
-        {
-#ifdef __SSE2__
-            const vfloat onev = F2V(1.f);
-            const vfloat twov = F2V(2.f);
-            const vfloat zd5v = F2V(0.5f);
-#endif
-            #pragma omp for
-            for (int i = 0; i < H; ++i) {
-                const int firstCol = FC(i, 0) & 1;
-                const int colour = FC(i, firstCol);
-                const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
-                int j = firstCol;
-#ifdef __SSE2__
-                for (; j < W - 7; j += 8) {
-                    const vfloat newvals = LC2VFU(rawData[i][j]);
-                    const vfloat oldvals = LVFU((*oldraw)[i][j / 2]);
-                    vfloat factors = oldvals / newvals;
-                    factors = vself(vmaskf_le(newvals, onev), onev, factors);
-                    factors = vself(vmaskf_le(oldvals, onev), onev, factors);
-                    STVFU((*nonGreen)[i/2][j/2], LIMV(factors, zd5v, twov));
-                }
-#endif
-                for (; j < W; j += 2) {
-                    (*nonGreen)[i/2][j/2] = (rawData[i][j] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : rtengine::LIM((*oldraw)[i][j / 2] / rawData[i][j], 0.5f, 2.f);
-                }
-            }
-
-            #pragma omp single
-            {
-                if (H % 2) {
-                    // odd height => factors for one channel are not set in last row => use values of preceding row
-                    const int firstCol = FC(0, 0) & 1;
-                    const int colour = FC(0, firstCol);
-                    const array2D<float>* nonGreen = colour == 0 ? &blueFactor : &redFactor;
-                    for (int j = 0; j < (W + 1) / 2; ++j) {
-                        (*nonGreen)[(H + 1) / 2 - 1][j] = (*nonGreen)[(H + 1) / 2 - 2][j];
-                    }
-                }
-
-                if (W % 2) {
-                    // odd width => factors for one channel are not set in last column => use value of preceding column
-                    const int ngRow = 1 - (FC(0, 0) & 1);
-                    const int ngCol = FC(ngRow, 0) & 1;
-                    const int colour = FC(ngRow, ngCol);
-                    const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
-                    for (int i = 0; i < (H + 1) / 2; ++i) {
-                        (*nonGreen)[i][(W + 1) / 2 - 1] = redFactor[i][(W + 1) / 2 - 2];
-                    }
-                }
-            }
-
-            // blur correction factors
-            gaussianBlur(redFactor, redFactor, (W+1)/2, (H+1)/2, 30.0);
-            gaussianBlur(blueFactor, blueFactor, (W+1)/2, (H+1)/2, 30.0);
-
-            // apply correction factors to avoid (reduce) colour shift
-            #pragma omp for
-            for (int i = 0; i < H; ++i) {
-                const int firstCol = FC(i, 0) & 1;
-                const int colour = FC(i, firstCol);
-                const array2D<float>* nonGreen = colour == 0 ? &redFactor : &blueFactor;
-                for (int j = firstCol; j < W; j += 2) {
-                    rawData[i][j] *= (*nonGreen)[i/2][j/2];
-                }
-            }
-        }
         delete oldraw;
+        delete redFactor;
+        delete blueFactor;
     }
 
     if (plistener) {

From 2062555e47965f729dde07418abab540880d7561 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 11 Sep 2018 23:43:39 +0200
Subject: [PATCH 19/22] remove raw ca correction 'avoid colour shift' from gui,
 #4777

---
 rtgui/rawcacorrection.cc | 25 +++++++++++++------------
 rtgui/rawcacorrection.h  |  2 +-
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index dc5657ece..5ad5d30a3 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -39,7 +39,7 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     caAutocorrect = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AUTO"), multiImage));
     caAutocorrect->setCheckBoxListener (this);
 
-    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 10, 1, 2));
+    caAutoiterations = Gtk::manage(new Adjuster (M("TP_RAWCACORR_AUTOIT"), 1, 5, 1, 2));
     caAutoiterations->setAdjusterListener (this);
 
     if (caAutoiterations->delay < options.adjusterMaxDelay) {
@@ -68,9 +68,9 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
     pack_start( *caRed, Gtk::PACK_SHRINK, 4);
     pack_start( *caBlue, Gtk::PACK_SHRINK, 4);
 
-    caAvoidcolourshift = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AVOIDCOLORSHIFT"), multiImage));
-    caAvoidcolourshift->setCheckBoxListener (this);
-    pack_start( *caAvoidcolourshift, Gtk::PACK_SHRINK, 4);
+//    caAvoidcolourshift = Gtk::manage (new CheckBox(M("TP_RAWCACORR_AVOIDCOLORSHIFT"), multiImage));
+//    caAvoidcolourshift->setCheckBoxListener (this);
+//    pack_start( *caAvoidcolourshift, Gtk::PACK_SHRINK, 4);
 
 
 }
@@ -81,7 +81,7 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 
     if(pedited ) {
         caAutocorrect->setEdited(pedited->raw.ca_autocorrect);
-        caAvoidcolourshift->setEdited(pedited->raw.ca_avoidcolourshift);
+//        caAvoidcolourshift->setEdited(pedited->raw.ca_avoidcolourshift);
         caAutoiterations->setEditedState( pedited->raw.caautoiterations ? Edited : UnEdited );
         caRed->setEditedState( pedited->raw.cared ? Edited : UnEdited );
         caBlue->setEditedState( pedited->raw.cablue ? Edited : UnEdited );
@@ -93,7 +93,7 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
     caBlue->set_sensitive(!pp->raw.ca_autocorrect);
 
     caAutocorrect->setValue(pp->raw.ca_autocorrect);
-    caAvoidcolourshift->setValue(pp->raw.ca_avoidcolourshift);
+//    caAvoidcolourshift->setValue(pp->raw.ca_avoidcolourshift);
     caAutoiterations->setValue (pp->raw.caautoiterations);
     caRed->setValue (pp->raw.cared);
     caBlue->setValue (pp->raw.cablue);
@@ -104,14 +104,14 @@ void RAWCACorr::read(const rtengine::procparams::ProcParams* pp, const ParamsEdi
 void RAWCACorr::write( rtengine::procparams::ProcParams* pp, ParamsEdited* pedited)
 {
     pp->raw.ca_autocorrect = caAutocorrect->getLastActive();
-    pp->raw.ca_avoidcolourshift = caAvoidcolourshift->getLastActive();
+//    pp->raw.ca_avoidcolourshift = caAvoidcolourshift->getLastActive();
     pp->raw.caautoiterations = caAutoiterations->getValue();
     pp->raw.cared = caRed->getValue();
     pp->raw.cablue = caBlue->getValue();
 
     if (pedited) {
         pedited->raw.ca_autocorrect = !caAutocorrect->get_inconsistent();
-        pedited->raw.ca_avoidcolourshift = !caAvoidcolourshift->get_inconsistent();
+//        pedited->raw.ca_avoidcolourshift = !caAvoidcolourshift->get_inconsistent();
         pedited->raw.caautoiterations = caAutoiterations->getEditedState ();
         pedited->raw.cared = caRed->getEditedState ();
         pedited->raw.cablue = caBlue->getEditedState ();
@@ -147,11 +147,12 @@ void RAWCACorr::checkBoxToggled (CheckBox* c, CheckValue newval)
         if (listener) {
             listener->panelChanged (EvPreProcessAutoCA, caAutocorrect->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
         }
-    } else if (c == caAvoidcolourshift) {
-        if (listener) {
-            listener->panelChanged ((caAutocorrect->getLastActive() || caRed->getValue() != 0 || caBlue->getValue() != 0) ? EvPreProcessCAColourshift : EvPreProcessCAColourshiftHistory, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
-        }
     }
+//     else if (c == caAvoidcolourshift) {
+//        if (listener) {
+//            listener->panelChanged ((caAutocorrect->getLastActive() || caRed->getValue() != 0 || caBlue->getValue() != 0) ? EvPreProcessCAColourshift : EvPreProcessCAColourshiftHistory, caAvoidcolourshift->getLastActive() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
+//        }
+//    }
 }
 
 void RAWCACorr::setBatchMode(bool batchMode)
diff --git a/rtgui/rawcacorrection.h b/rtgui/rawcacorrection.h
index dea9ef738..1c1ced53b 100644
--- a/rtgui/rawcacorrection.h
+++ b/rtgui/rawcacorrection.h
@@ -32,7 +32,7 @@ protected:
     Adjuster* caAutoiterations;
     Adjuster* caRed;
     Adjuster* caBlue;
-    CheckBox* caAvoidcolourshift;
+//    CheckBox* caAvoidcolourshift;
 
     rtengine::ProcEvent EvPreProcessCAAutoiterations;
     rtengine::ProcEvent EvPreProcessCAColourshift;

From adddd5aafa11c3d039a64d34cfd9a706242335f9 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 11 Sep 2018 23:44:42 +0200
Subject: [PATCH 20/22] set raw ca correction 'avoid colour shift' as default,
 #4777

---
 rtengine/rawimagesource.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 1396ae245..0f6d7cc93 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2009,13 +2009,13 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
         }
         if(numFrames == 4) {
             double fitParams[64];
-            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[0], fitParams, false, true, nullptr, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, true, *rawDataFrames[0], fitParams, false, true, nullptr, false);
             for(int i = 1; i < 3; ++i) {
-                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[i], fitParams, true, false, buffer, false);
+                CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, true, *rawDataFrames[i], fitParams, true, false, buffer, false);
             }
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, *rawDataFrames[3], fitParams, true, false, buffer, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, true, *rawDataFrames[3], fitParams, true, false, buffer, true);
         } else {
-            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, raw.ca_avoidcolourshift, rawData, nullptr, false, false, nullptr, true);
+            CA_correct_RT(raw.ca_autocorrect, raw.caautoiterations, raw.cared, raw.cablue, true, rawData, nullptr, false, false, nullptr, true);
         }
     }
 

From 7cbf1f6d8d1053e82a876932f47b39e4629764b8 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 12 Sep 2018 14:24:41 +0200
Subject: [PATCH 21/22] Fix calculation of interpolation parameters for
 negative block shifts, #4774, #4777

---
 rtengine/CA_correct_RT.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index 81fd8d593..a27ee7998 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -1029,11 +1029,17 @@ float* RawImageSource::CA_correct_RT(
                             //some parameters for the bilinear interpolation
                             shiftvfloor[c] = floor((float)lblockshifts[c>>1][0]);
                             shiftvceil[c] = ceil((float)lblockshifts[c>>1][0]);
-                            shiftvfrac[c] = lblockshifts[c>>1][0] - shiftvfloor[c];
+                            if (lblockshifts[c>>1][0] < 0.f) {
+                                std::swap(shiftvfloor[c], shiftvceil[c]);
+                            }
+                            shiftvfrac[c] = fabs(lblockshifts[c>>1][0] - shiftvfloor[c]);
 
                             shifthfloor[c] = floor((float)lblockshifts[c>>1][1]);
                             shifthceil[c] = ceil((float)lblockshifts[c>>1][1]);
-                            shifthfrac[c] = lblockshifts[c>>1][1] - shifthfloor[c];
+                            if (lblockshifts[c>>1][1] < 0.f) {
+                                std::swap(shifthfloor[c], shifthceil[c]);
+                            }
+                            shifthfrac[c] = fabs(lblockshifts[c>>1][1] - shifthfloor[c]);
 
                             GRBdir[0][c] = lblockshifts[c>>1][0] > 0 ? 2 : -2;
                             GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;

From 9be449fc1ea04d29439eaedf90f5ab595ca158c5 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sat, 15 Sep 2018 00:14:22 +0200
Subject: [PATCH 22/22] Cleanup for unused raw ca corrtection avoid colour
 shift stuff

---
 rtgui/rawcacorrection.cc | 4 ++--
 rtgui/rawcacorrection.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtgui/rawcacorrection.cc b/rtgui/rawcacorrection.cc
index 5ad5d30a3..8dd96e3a4 100644
--- a/rtgui/rawcacorrection.cc
+++ b/rtgui/rawcacorrection.cc
@@ -28,8 +28,8 @@ RAWCACorr::RAWCACorr () : FoldableToolPanel(this, "rawcacorrection", M("TP_CHROM
 {
     auto m = ProcEventMapper::getInstance();
     EvPreProcessCAAutoiterations = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_AUTOIT");
-    EvPreProcessCAColourshift = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
-    EvPreProcessCAColourshiftHistory = m->newEvent(M_VOID, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
+//    EvPreProcessCAColourshift = m->newEvent(DARKFRAME, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
+//    EvPreProcessCAColourshiftHistory = m->newEvent(M_VOID, "HISTORY_MSG_RAWCACORR_COLOURSHIFT");
 
     Gtk::Image* icaredL =   Gtk::manage (new RTImage ("circle-red-cyan-small.png"));
     Gtk::Image* icaredR =   Gtk::manage (new RTImage ("circle-cyan-red-small.png"));
diff --git a/rtgui/rawcacorrection.h b/rtgui/rawcacorrection.h
index 1c1ced53b..566f93e03 100644
--- a/rtgui/rawcacorrection.h
+++ b/rtgui/rawcacorrection.h
@@ -35,8 +35,8 @@ protected:
 //    CheckBox* caAvoidcolourshift;
 
     rtengine::ProcEvent EvPreProcessCAAutoiterations;
-    rtengine::ProcEvent EvPreProcessCAColourshift;
-    rtengine::ProcEvent EvPreProcessCAColourshiftHistory;
+//    rtengine::ProcEvent EvPreProcessCAColourshift;
+//    rtengine::ProcEvent EvPreProcessCAColourshiftHistory;
 
 public: