rawTherapee/rtengine/CA_correct_RT.cc

////////////////////////////////////////////////////////////////
//
//  Chromatic Aberration correction on raw bayer cfa data
//
//  copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
//  copyright (c) for improvements (speedups, iterated correction and avoid colour shift) 2018 Ingo Weyrich <heckflosse67@gmx.de>
//
//  code dated: September 8, 2018
//
//  CA_correct_RT.cc is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <https://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////

#include "rtengine.h"
#include "rawimagesource.h"
#include "rt_math.h"
#include "gauss.h"
#include "median.h"
#include "StopWatch.h"

namespace
{
unsigned fc(const unsigned int cfa[2][2], int r, int c) {
    return cfa[r & 1][c & 1];
}
}

namespace {

bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
{
//==============================================================================
// return 1 if system not solving, 0 if system solved
// nDim - system dimension
// pfMatr - matrix with coefficients
// pfVect - vector with free members
// pfSolution - vector with system solution
// pfMatr becomes triangular after function call
// pfVect changes after function call
//
// Developer: Henry Guennadi Levkin
//
//==============================================================================

    double fAcc;

    int i, j, k;

    for (k = 0; k < (nDim - 1); k++) { // base row of matrix
        // search of line with max element
        double fMaxElem = fabs(pfMatr[k * nDim + k]);
        int m = k;

        for (i = k + 1; i < nDim; i++) {
            if (fMaxElem < fabs(pfMatr[i * nDim + k])) {
                fMaxElem = pfMatr[i * nDim + k];
                m = i;
            }
        }

        // permutation of base line (index k) and max element line(index m)
        if (m != k) {
            for (i = k; i < nDim; i++) {
                fAcc                 = pfMatr[k * nDim + i];
                pfMatr[k * nDim + i] = pfMatr[m * nDim + i];
                pfMatr[m * nDim + i] = fAcc;
            }

            fAcc = pfVect[k];
            pfVect[k] = pfVect[m];
            pfVect[m] = fAcc;
        }

        if (pfMatr[k * nDim + k] == 0.) {
            //linear system has no solution
            return false; // needs improvement !!!
        }

        // triangulation of matrix with coefficients
        for (j = (k + 1); j < nDim; j++) { // current row of matrix
            fAcc = - pfMatr[j * nDim + k] / pfMatr[k * nDim + k];

            for (i = k; i < nDim; i++) {
                pfMatr[j * nDim + i] = pfMatr[j * nDim + i] + fAcc * pfMatr[k * nDim + i];
            }

            pfVect[j] = pfVect[j] + fAcc * pfVect[k]; // free member recalculation
        }
    }

    for (k = (nDim - 1); k >= 0; k--) {
        pfSolution[k] = pfVect[k];

        for (i = (k + 1); i < nDim; i++) {
            pfSolution[k] -= (pfMatr[k * nDim + i] * pfSolution[i]);
        }

        pfSolution[k] = pfSolution[k] / pfMatr[k * nDim + k];
    }

    return true;
}
//end of linear equation solver
}

using namespace std;
using namespace rtengine;

float* RawImageSource::CA_correct_RT(
    bool autoCA,
    size_t autoIterations,
    double cared,
    double cablue,
    bool avoidColourshift,
    const array2D<float> &rawData,
    double* fitParamsTransfer,
    bool fitParamsIn,
    bool fitParamsOut,
    float* buffer,
    bool freeBuffer,
    size_t chunkSize,
    bool measure
)
{

    std::unique_ptr<StopWatch> stop;

    if (measure) {
        std::cout << "CA correcting " << W << "x" << H << " image with " << chunkSize << " tiles per thread" << std::endl;
        stop.reset(new StopWatch("CA correction"));
    }

    // multithreaded and vectorized by Ingo Weyrich
    const unsigned int cfa[2][2] = {{FC(0,0), FC(0,1)}, {FC(1,0), FC(1,1)}};
    constexpr int ts = 128;
    constexpr int tsh = ts / 2;
    constexpr int cb = 2; // 2 pixels border will be excluded from correction

    //shifts to location of vertical and diagonal neighbours
    constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, v4 = 4 * ts; //, p1=-ts+1, p2=-2*ts+2, p3=-3*ts+3, m1=ts+1, m2=2*ts+2, m3=3*ts+3;

    // Test for RGB cfa
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            if (fc(cfa, i, j) == 3) {
                std::cout << "CA correction supports only RGB Colour filter arrays" << std::endl;
                return buffer;
            }
        }
    }

    array2D<float>* redFactor = nullptr;
    array2D<float>* blueFactor = nullptr;
    array2D<float>* oldraw = nullptr;
    if (avoidColourshift) {
        redFactor = new array2D<float>((W + 1 - 2 * cb) / 2, (H + 1 - 2 * cb) / 2);
        blueFactor = new array2D<float>((W + 1 - 2 * cb) / 2, (H + 1 - 2 * cb) / 2);
        oldraw = new array2D<float>((W + 1- 2 * cb) / 2, H- 2 * cb);
        // copy raw values before ca correction
#ifdef _OPENMP
        #pragma omp parallel for
#endif
        for (int i = cb; i < H - cb; ++i) {
            for (int j = cb + (fc(cfa, i, 0) & 1); j < W - cb; j += 2) {
                (*oldraw)[i - cb][(j - cb) / 2] = rawData[i][j];
            }
        }
    }

    double progress = 0.0;

    if (plistener) {
        plistener->setProgress (progress);
    }

    // local variables
    const int width = W + (W & 1), height = H;
    constexpr int border = 8;
    constexpr int border2 = 16;

    const int vz1 = (height + border2) % (ts - border2) == 0 ? 1 : 0;
    const int hz1 = (width + border2) % (ts - border2) == 0 ? 1 : 0;
    const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1);
    const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1);

    //temporary array to store simple interpolation of G
    if (!buffer) {
        buffer = static_cast<float*>(malloc ((height * width + vblsz * hblsz * (2 * 2 + 1)) * sizeof(float)));
    }
    float* Gtmp = buffer;
    float* RawDataTmp = buffer + (height * width) / 2;

    //block CA shift values and weight assigned to block
    float* const blockwt = buffer + (height * width);
    memset(blockwt, 0, static_cast<unsigned long>(vblsz) * hblsz * (2 * 2 + 1) * sizeof(float));
    float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);

    // Because we can't break parallel processing, we need a switch do handle the errors
    bool processpasstwo = true;
    double fitparams[2][2][16];

    const size_t iterations =
        autoCA
            ? std::max<size_t>(autoIterations, 1)
            : 1;

    const bool fitParamsSet = fitParamsTransfer && fitParamsIn && iterations < 2;
    if (autoCA && fitParamsSet) {
        // use stored parameters
        int index = 0;
        for (int c = 0; c < 2; ++c) {
            for (int d = 0; d < 2; ++d) {
                for (int e = 0; e < 16; ++e) {
                    fitparams[c][d][e] = fitParamsTransfer[index++];
                }
            }
        }
    }

    for (size_t it = 0; it < iterations && processpasstwo; ++it) {
        float blockave[2][2] = {};
        float blocksqave[2][2] = {};
        float blockdenom[2][2] = {};
        float blockvar[2][2];

        //order of 2d polynomial fit (polyord), and numpar=polyord^2
        int polyord = 4, numpar = 16;

        constexpr float eps = 1e-5f, eps2 = 1e-10f; //tolerance to avoid dividing by zero

#ifdef _OPENMP
        #pragma omp parallel
#endif
        {
            int progresscounter = 0;

            //direction of the CA shift in a tile
            int GRBdir[2][3];

            int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];

            //polynomial fit coefficients
            //residual CA shift amount within a plaquette
            float shifthfrac[3], shiftvfrac[3];

            // assign working space
            constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
            constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
            char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);

            char * const data = (char*)((uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);

            // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache

            //rgb data in a tile
            float* rgb[3];
            rgb[0] = (float*) data;
            rgb[1] = (float*) (data + sizeof(float) * ts * tsh + 1 * 64);
            rgb[2] = (float*) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);

            if (autoCA && !fitParamsSet) {
                constexpr float caAutostrength = 8.f;
                //high pass filter for R/B in vertical direction
                float* rbhpfh  = (float*) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
                //high pass filter for R/B in horizontal direction
                float* rbhpfv  = (float*) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
                //low pass filter for R/B in horizontal direction
                float* rblpfh  = (float*) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
                //low pass filter for R/B in vertical direction
                float* rblpfv  = (float*) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
                //low pass filter for colour differences in horizontal direction
                float* grblpfh = (float*) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
                //low pass filter for colour differences in vertical direction
                float* grblpfv = (float*) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
                // Main algorithm: Tile loop calculating correction parameters per tile

                //local quadratic fit to shift data within a tile
                float coeff[2][3][2];
                //measured CA shift parameters for a tile
                float CAshift[2][2];

                //per thread data for evaluation of block CA shift variance
                float blockavethr[2][2] = {};
                float blocksqavethr[2][2] = {};
                float blockdenomthr[2][2] = {};

#ifdef _OPENMP
                #pragma omp for collapse(2) schedule(dynamic, chunkSize) nowait
#endif
                for (int top = -border ; top < height; top += ts - border2) {
                    for (int left = -border; left < width - (W & 1); left += ts - border2) {
                        memset(bufferThr, 0, buffersize);
                        const int vblock = (top + border) / (ts - border2) + 1;
                        const int hblock = (left + border) / (ts - border2) + 1;
                        const int bottom = min(top + ts, height + border);
                        const int right  = min(left + ts, width - (W & 1) + border);
                        const int rr1 = bottom - top;
                        const int cc1 = right - left;
                        const int rrmin = top < 0 ? border : 0;
                        const int rrmax = bottom > height ? height - top : rr1;
                        const int ccmin = left < 0 ? border : 0;
                        const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;

                        // rgb from input CFA data
                        // rgb values should be floating point numbers between 0 and 1
                        // after white balance multipliers are applied

#ifdef __SSE2__
                        vfloat c65535v = F2V(65535.f);
#endif

                        for (int rr = rrmin; rr < rrmax; rr++) {
                            int row = rr + top;
                            int cc = ccmin;
                            int col = cc + left;
#ifdef __SSE2__
                            int c0 = fc(cfa, rr, cc);
                            if (c0 == 1) {
                                rgb[c0][rr * ts + cc] = rawData[row][col] / 65535.f;
                                cc++;
                                col++;
                                c0 = fc(cfa, rr, cc);
                            }
                            int indx1 = rr * ts + cc;
                            for (; cc < ccmax - 7; cc+=8, col+=8, indx1 += 8) {
                                vfloat val1 = LVFU(rawData[row][col]) / c65535v;
                                vfloat val2 = LVFU(rawData[row][col + 4]) / c65535v;
                                vfloat nonGreenv = _mm_shuffle_ps(val1,val2,_MM_SHUFFLE(2,0,2,0));
                                STVFU(rgb[c0][indx1 >> 1], nonGreenv);
                                STVFU(rgb[1][indx1], val1);
                                STVFU(rgb[1][indx1 + 4], val2);
                            }
#endif
                            for (; cc < ccmax; cc++, col++) {
                                int c = fc(cfa, rr, cc);
                                int indx1 = rr * ts + cc;
                                rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;
                            }
                        }

                        //fill borders
                        if (rrmin > 0) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = ccmin; cc < ccmax; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                }
                            }
                        }

                        if (rrmax < rr1) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = ccmin; cc < ccmax; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][left + cc] / 65535.f;
                                }
                            }
                        }

                        if (ccmin > 0) {
                            for (int rr = rrmin; rr < rrmax; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                }
                            }
                        }

                        if (ccmax < cc1) {
                            for (int rr = rrmin; rr < rrmax; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(top + rr)][(width - cc - 2)] / 65535.f;
                                }
                            }
                        }

                        //also, fill the image corners
                        if (rrmin > 0 && ccmin > 0) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rawData[border2 - rr][border2 - cc] / 65535.f;
                                }
                            }
                        }

                        if (rrmax < rr1 && ccmax < cc1) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(width - cc - 2)] / 65535.f;
                                }
                            }
                        }

                        if (rrmin > 0 && ccmax < cc1) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = rawData[(border2 - rr)][(width - cc - 2)] / 65535.f;
                                }
                            }
                        }

                        if (rrmax < rr1 && ccmin > 0) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = rawData[(height - rr - 2)][(border2 - cc)] / 65535.f;
                                }
                            }
                        }

                        //end of border fill
                        //end of initialization

#ifdef __SSE2__
                        vfloat onev = F2V(1.f);
                        vfloat epsv = F2V(eps);
#endif
                        for (int rr = 3; rr < rr1 - 3; rr++) {
                            int row = rr + top;
                            int cc = 3 + (fc(cfa, rr,3) & 1);
                            int indx = rr * ts + cc;
                            int c = fc(cfa, rr,cc);
#ifdef __SSE2__
                            for (; cc < cc1 - 9; cc+=8, indx+=8) {
                                //compute directional weights using image gradients
                                vfloat rgb1mv1v = LC2VFU(rgb[1][indx - v1]);
                                vfloat rgb1pv1v = LC2VFU(rgb[1][indx + v1]);
                                vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
                                vfloat temp1v = epsv + vabsf(rgb1mv1v - rgb1pv1v);
                                vfloat wtuv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx - v2) >> 1])) + vabsf(rgb1mv1v - LC2VFU(rgb[1][indx - v3])));
                                vfloat wtdv = onev / SQRV(temp1v + vabsf(rgbcv - LVFU(rgb[c][(indx + v2) >> 1])) + vabsf(rgb1pv1v - LC2VFU(rgb[1][indx + v3])));
                                vfloat rgb1m1v = LC2VFU(rgb[1][indx - 1]);
                                vfloat rgb1p1v = LC2VFU(rgb[1][indx + 1]);
                                vfloat temp2v = epsv + vabsf(rgb1m1v - rgb1p1v);
                                vfloat wtlv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(rgb1m1v - LC2VFU(rgb[1][indx - 3])));
                                vfloat wtrv = onev / SQRV(temp2v + vabsf(rgbcv - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(rgb1p1v - LC2VFU(rgb[1][indx + 3])));

                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                vfloat result = (wtuv * rgb1mv1v + wtdv * rgb1pv1v + wtlv * rgb1m1v + wtrv * rgb1p1v) / (wtuv + wtdv + wtlv + wtrv);
                                STC2VFU(rgb[1][indx], result);
                            }

#endif
                            for (; cc < cc1 - 3; cc+=2, indx+=2) {
                                //compute directional weights using image gradients
                                float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - v2) >> 1]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
                                float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + v2) >> 1]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
                                float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx - 2) >> 1]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
                                float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx >> 1] - rgb[c][(indx + 2) >> 1]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));

                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
                            }

                            if (row > -1 && row < height) {
                                int offset = (fc(cfa, row,max(left + 3, 0)) & 1);
                                int col = max(left + 3, 0) + offset;
                                int indx = rr * ts + 3 - (left < 0 ? (left+3) : 0) + offset;
#ifdef __SSE2__
                                for (; col < min(cc1 + left - 3, width) - 7; col+=8, indx+=8) {
                                    STVFU(Gtmp[(row * width + col) >> 1], LC2VFU(rgb[1][indx]));
                                }
#endif
                                for (; col < min(cc1 + left - 3, width); col+=2, indx+=2) {
                                    Gtmp[(row * width + col) >> 1] = rgb[1][indx];
                                }
                            }
                        }

#ifdef __SSE2__
                        vfloat zd25v = F2V(0.25f);
#endif
                        for (int rr = 4; rr < rr1 - 4; rr++) {
                            int cc = 4 + (fc(cfa, rr, 2) & 1);
                            int indx = rr * ts + cc;
                            int c = fc(cfa, rr, cc);
#ifdef __SSE2__
                            for (; cc < cc1 - 10; cc += 8, indx += 8) {
                                vfloat rgb1v = LC2VFU(rgb[1][indx]);
                                vfloat rgbcv = LVFU(rgb[c][indx >> 1]);
                                vfloat rgb1mv4 = LC2VFU(rgb[1][indx - v4]);
                                vfloat rgb1pv4 = LC2VFU(rgb[1][indx + v4]);
                                vfloat temp1v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1pv4 - LVFU(rgb[c][(indx + v4) >> 1]))) +
                                                      vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1v + rgbcv) -
                                                      vabsf(rgb1mv4 - LVFU(rgb[c][(indx - v4) >> 1]) - rgb1pv4 + LVFU(rgb[c][(indx + v4) >> 1])));
                                STVFU(rbhpfv[indx >> 1], temp1v);
                                vfloat rgb1m4 = LC2VFU(rgb[1][indx - 4]);
                                vfloat rgb1p4 = LC2VFU(rgb[1][indx + 4]);
                                vfloat temp2v = vabsf(vabsf((rgb1v - rgbcv) - (rgb1p4 - LVFU(rgb[c][(indx + 4) >> 1]))) +
                                                      vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1v + rgbcv) -
                                                      vabsf(rgb1m4 - LVFU(rgb[c][(indx - 4) >> 1]) - rgb1p4 + LVFU(rgb[c][(indx + 4) >> 1])));
                                STVFU(rbhpfh[indx >> 1], temp2v);

                                //low and high pass 1D filters of G in vertical/horizontal directions
                                rgb1v = vmul2f(rgb1v);
                                vfloat glpfvv = (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
                                vfloat glpfhv = (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
                                rgbcv = vmul2f(rgbcv);
                                STVFU(rblpfv[indx >> 1], zd25v * vabsf(glpfvv - (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
                                STVFU(rblpfh[indx >> 1], zd25v * vabsf(glpfhv - (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                                STVFU(grblpfv[indx >> 1], zd25v * (glpfvv + (rgbcv + LVFU(rgb[c][(indx + v2) >> 1]) + LVFU(rgb[c][(indx - v2) >> 1]))));
                                STVFU(grblpfh[indx >> 1], zd25v * (glpfhv + (rgbcv + LVFU(rgb[c][(indx + 2) >> 1]) + LVFU(rgb[c][(indx - 2) >> 1]))));
                            }
#endif
                            for (; cc < cc1 - 4; cc += 2, indx += 2) {
                                rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])) +
                                                          fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
                                                          fabsf((rgb[1][indx - v4] - rgb[c][(indx - v4) >> 1]) - (rgb[1][indx + v4] - rgb[c][(indx + v4) >> 1])));
                                rbhpfh[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])) +
                                                          fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx] - rgb[c][indx >> 1])) -
                                                          fabsf((rgb[1][indx - 4] - rgb[c][(indx - 4) >> 1]) - (rgb[1][indx + 4] - rgb[c][(indx + 4) >> 1])));

                                //low and high pass 1D filters of G in vertical/horizontal directions
                                float glpfv = (2.f * rgb[1][indx] + rgb[1][indx + v2] + rgb[1][indx - v2]);
                                float glpfh = (2.f * rgb[1][indx] + rgb[1][indx + 2] + rgb[1][indx - 2]);
                                rblpfv[indx >> 1] = 0.25f * fabsf(glpfv - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
                                rblpfh[indx >> 1] = 0.25f * fabsf(glpfh - (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
                                grblpfv[indx >> 1] = 0.25f * (glpfv + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + v2) >> 1] + rgb[c][(indx - v2) >> 1]));
                                grblpfh[indx >> 1] = 0.25f * (glpfh + (2.f * rgb[c][indx >> 1] + rgb[c][(indx + 2) >> 1] + rgb[c][(indx - 2) >> 1]));
                            }
                        }

                        for (int dir = 0; dir < 2; dir++) {
                            for (int k = 0; k < 3; k++) {
                                for (int c = 0; c < 2; c++) {
                                    coeff[dir][k][c] = 0;
                                }
                            }
                        }

#ifdef __SSE2__
                        vfloat zd3v = F2V(0.3f);
                        vfloat zd1v = F2V(0.1f);
                        vfloat zd5v = F2V(0.5f);
#endif

                        // along line segments, find the point along each segment that minimizes the colour variance
                        // averaged over the tile; evaluate for up/down and left/right away from R/B grid point
                        for (int rr = 8; rr < rr1 - 8; rr++) {
                            int cc = 8 + (fc(cfa, rr, 2) & 1);
                            int indx = rr * ts + cc;
                            int c = fc(cfa, rr, cc);
#ifdef __SSE2__
                            vfloat coeff00v = ZEROV;
                            vfloat coeff01v = ZEROV;
                            vfloat coeff02v = ZEROV;
                            vfloat coeff10v = ZEROV;
                            vfloat coeff11v = ZEROV;
                            vfloat coeff12v = ZEROV;
                            for (; cc < cc1 - 14; cc += 8, indx += 8) {
                                //in linear interpolation, colour differences are a quadratic function of interpolation position;
                                //solve for the interpolation position that minimizes colour difference variance over the tile

                                //vertical
                                vfloat temp1 = zd3v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts - 1]));
                                vfloat temp2 = zd3v * (LC2VFU(rgb[1][indx - ts + 1]) - LC2VFU(rgb[1][indx + ts - 1]));
                                vfloat gdiffvv = (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + (temp1 - temp2);
                                vfloat deltgrbv = LVFU(rgb[c][indx >> 1]) - LC2VFU(rgb[1][indx]);

                                vfloat gradwtvv = (LVFU(rbhpfv[indx >> 1]) + zd5v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1]))) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) + LVFU(rblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) + v1]));

                                coeff00v += gradwtvv * deltgrbv * deltgrbv;
                                coeff01v += gradwtvv * gdiffvv * deltgrbv;
                                coeff02v += gradwtvv * gdiffvv * gdiffvv;

                                //horizontal
                                vfloat gdiffhv = (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + (temp1 + temp2);

                                vfloat gradwthv = (LVFU(rbhpfh[indx >> 1]) + zd5v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1]))) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) + LVFU(rblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) + 1]));

                                coeff10v += gradwthv * deltgrbv * deltgrbv;
                                coeff11v += gradwthv * gdiffhv * deltgrbv;
                                coeff12v += gradwthv * gdiffhv * gdiffhv;
                            }

                            coeff[0][0][c>>1] += vhadd(coeff00v);
                            coeff[0][1][c>>1] += vhadd(coeff01v);
                            coeff[0][2][c>>1] += vhadd(coeff02v);
                            coeff[1][0][c>>1] += vhadd(coeff10v);
                            coeff[1][1][c>>1] += vhadd(coeff11v);
                            coeff[1][2][c>>1] += vhadd(coeff12v);

#endif
                            for (; cc < cc1 - 8; cc += 2, indx += 2) {
                                //in linear interpolation, colour differences are a quadratic function of interpolation position;
                                //solve for the interpolation position that minimizes colour difference variance over the tile

                                //vertical
                                float gdiff = (rgb[1][indx + ts] - rgb[1][indx - ts]) + 0.3f * (rgb[1][indx + ts + 1] - rgb[1][indx - ts + 1] + rgb[1][indx + ts - 1] - rgb[1][indx - ts - 1]);
                                float deltgrb = (rgb[c][indx >> 1] - rgb[1][indx]);

                                float gradwt = (rbhpfv[indx >> 1] + 0.5f * (rbhpfv[(indx >> 1) + 1] + rbhpfv[(indx >> 1) - 1])) * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) / (eps + 0.1f * (grblpfv[(indx >> 1) - v1] + grblpfv[(indx >> 1) + v1]) + rblpfv[(indx >> 1) - v1] + rblpfv[(indx >> 1) + v1]);

                                coeff[0][0][c>>1] += gradwt * deltgrb * deltgrb;
                                coeff[0][1][c>>1] += gradwt * gdiff * deltgrb;
                                coeff[0][2][c>>1] += gradwt * gdiff * gdiff;

                                //horizontal
                                gdiff = (rgb[1][indx + 1] - rgb[1][indx - 1]) + 0.3f * (rgb[1][indx + 1 + ts] - rgb[1][indx - 1 + ts] + rgb[1][indx + 1 - ts] - rgb[1][indx - 1 - ts]);

                                gradwt = (rbhpfh[indx >> 1] + 0.5f * (rbhpfh[(indx >> 1) + v1] + rbhpfh[(indx >> 1) - v1])) * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) / (eps + 0.1f * (grblpfh[(indx >> 1) - 1] + grblpfh[(indx >> 1) + 1]) + rblpfh[(indx >> 1) - 1] + rblpfh[(indx >> 1) + 1]);

                                coeff[1][0][c>>1] += gradwt * deltgrb * deltgrb;
                                coeff[1][1][c>>1] += gradwt * gdiff * deltgrb;
                                coeff[1][2][c>>1] += gradwt * gdiff * gdiff;

                                //  In Mathematica,
                                //  f[x_]=Expand[Total[Flatten[
                                //  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
                                //  extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
                            }
                        }

                        for (int dir = 0; dir < 2; dir++) {
                            for (int k = 0; k < 3; k++) {
                                for (int c = 0; c < 2; c++) {
                                    coeff[dir][k][c] *= 0.25f;
                                    if (k == 1) {
                                        coeff[dir][k][c] *= 0.3125f;
                                    } else if (k == 2) {
                                        coeff[dir][k][c] *= SQR(0.3125f);
                                    }
                                }
                            }
                        }

                        for (int c = 0; c < 2; c++) {
                            for (int dir = 0; dir < 2; dir++) { // vert/hor

                                // CAshift[dir][c] are the locations
                                // that minimize colour difference variances;
                                // This is the approximate _optical_ location of the R/B pixels
                                if (coeff[dir][2][c] > eps2) {
                                    CAshift[dir][c] = coeff[dir][1][c] / coeff[dir][2][c];
                                    blockwt[vblock * hblsz + hblock] = coeff[dir][2][c] / (eps + coeff[dir][0][c]) ;
                                } else {
                                    CAshift[dir][c] = 17.0;
                                    blockwt[vblock * hblsz + hblock] = 0;
                                }

                                //data structure = CAshift[vert/hor][colour]
                                //dir : 0=vert, 1=hor

                                //offset gives NW corner of square containing the min; dir : 0=vert, 1=hor
                                if (fabsf(CAshift[dir][c]) < 2.0f) {
                                    blockavethr[dir][c] += CAshift[dir][c];
                                    blocksqavethr[dir][c] += SQR(CAshift[dir][c]);
                                    blockdenomthr[dir][c] += 1;
                                }
                                //evaluate the shifts to the location that minimizes CA within the tile
                                blockshifts[vblock * hblsz + hblock][c][dir] = CAshift[dir][c]; //vert/hor CA shift for R/B
                            }//vert/hor
                        }//colour

                        if (plistener) {
                            progresscounter++;

                            if (progresscounter % 8 == 0) {
#ifdef _OPENMP
                                #pragma omp critical (cadetectpass1)
#endif
                                {
                                    progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
                                    progress = std::min(progress, 1.0);
                                    plistener->setProgress(progress);
                                }
                            }
                        }
                    }
                }
                //end of diagnostic pass
#ifdef _OPENMP
                #pragma omp critical (cadetectpass2)
#endif
                {
                    for (int dir = 0; dir < 2; dir++) {
                        for (int c = 0; c < 2; c++) {
                            blockdenom[dir][c] += blockdenomthr[dir][c];
                            blocksqave[dir][c] += blocksqavethr[dir][c];
                            blockave[dir][c]   += blockavethr[dir][c];
                        }
                    }
                }
#ifdef _OPENMP
                #pragma omp barrier
#endif

#ifdef _OPENMP
                #pragma omp single
#endif
                {
                    for (int dir = 0; dir < 2; dir++)
                        for (int c = 0; c < 2; c++) {
                            if (blockdenom[dir][c]) {
                                blockvar[dir][c] = blocksqave[dir][c] / blockdenom[dir][c] - SQR(blockave[dir][c] / blockdenom[dir][c]);
                            } else {
                                processpasstwo = false;
                                if (settings->verbose) {
                                    std::cout << "blockdenom vanishes" << std::endl;
                                }
                                break;
                            }
                        }

                    //now prepare for CA correction pass
                    //first, fill border blocks of blockshift array
                    if (processpasstwo) {
                        for (int vblock = 1; vblock < vblsz - 1; vblock++) { //left and right sides
                            for (int c = 0; c < 2; c++) {
                                for (int i = 0; i < 2; i++) {
                                    blockshifts[vblock * hblsz][c][i] = blockshifts[(vblock) * hblsz + 2][c][i];
                                    blockshifts[vblock * hblsz + hblsz - 1][c][i] = blockshifts[(vblock) * hblsz + hblsz - 3][c][i];
                                }
                            }
                        }

                        for (int hblock = 0; hblock < hblsz; hblock++) { //top and bottom sides
                            for (int c = 0; c < 2; c++) {
                                for (int i = 0; i < 2; i++) {
                                    blockshifts[hblock][c][i] = blockshifts[2 * hblsz + hblock][c][i];
                                    blockshifts[(vblsz - 1)*hblsz + hblock][c][i] = blockshifts[(vblsz - 3) * hblsz + hblock][c][i];
                                }
                            }
                        }

                        //end of filling border pixels of blockshift array

                        //initialize fit arrays
                        double polymat[2][2][256], shiftmat[2][2][16];

                        for (int i = 0; i < 256; i++) {
                            polymat[0][0][i] = polymat[0][1][i] = polymat[1][0][i] = polymat[1][1][i] = 0;
                        }

                        for (int i = 0; i < 16; i++) {
                            shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[1][0][i] = shiftmat[1][1][i] = 0;
                        }

                        int numblox[2] = {0, 0};

                        for (int vblock = 1; vblock < vblsz - 1; vblock++) {
                            for (int hblock = 1; hblock < hblsz - 1; hblock++) {
                                // block 3x3 median of blockshifts for robustness
                                for (int c = 0; c < 2; c ++) {
                                    float bstemp[2];
                                    for (int dir = 0; dir < 2; dir++) {
                                        //temporary storage for median filter
                                        const std::array<float, 9> p = {
                                            blockshifts[(vblock - 1) * hblsz + hblock - 1][c][dir],
                                            blockshifts[(vblock - 1) * hblsz + hblock][c][dir],
                                            blockshifts[(vblock - 1) * hblsz + hblock + 1][c][dir],
                                            blockshifts[(vblock) * hblsz + hblock - 1][c][dir],
                                            blockshifts[(vblock) * hblsz + hblock][c][dir],
                                            blockshifts[(vblock) * hblsz + hblock + 1][c][dir],
                                            blockshifts[(vblock + 1) * hblsz + hblock - 1][c][dir],
                                            blockshifts[(vblock + 1) * hblsz + hblock][c][dir],
                                            blockshifts[(vblock + 1) * hblsz + hblock + 1][c][dir]
                                        };
                                        bstemp[dir] = median(p);
                                    }

                                    //now prepare coefficient matrix; use only data points within caAutostrength/2 std devs of zero
                                    if (SQR(bstemp[0]) > caAutostrength * blockvar[0][c] || SQR(bstemp[1]) > caAutostrength * blockvar[1][c]) {
                                        continue;
                                    }

                                    numblox[c]++;

                                    for (int dir = 0; dir < 2; dir++) {
                                        double powVblockInit = 1.0;
                                        for (int i = 0; i < polyord; i++) {
                                            double powHblockInit = 1.0;
                                            for (int j = 0; j < polyord; j++) {
                                                double powVblock = powVblockInit;
                                                for (int m = 0; m < polyord; m++) {
                                                    double powHblock = powHblockInit;
                                                    for (int n = 0; n < polyord; n++) {
                                                        polymat[c][dir][numpar * (polyord * i + j) + (polyord * m + n)] += powVblock * powHblock * blockwt[vblock * hblsz + hblock];
                                                        powHblock *= hblock;
                                                    }
                                                    powVblock *= vblock;
                                                }
                                                shiftmat[c][dir][(polyord * i + j)] += powVblockInit * powHblockInit * bstemp[dir] * blockwt[vblock * hblsz + hblock];
                                                powHblockInit *= hblock;
                                            }
                                            powVblockInit *= vblock;
                                        }//monomials
                                    }//dir
                                }//c
                            }//blocks
                        }
                        numblox[1] = min(numblox[0], numblox[1]);

                        //if too few data points, restrict the order of the fit to linear
                        if (numblox[1] < 32) {
                            polyord = 2;
                            numpar = 4;

                            if (numblox[1] < 10) {
                                if (settings->verbose) {
                                    std::cout << "numblox = " << numblox[1] << std::endl;
                                }
                                processpasstwo = false;
                            }
                        }

                        if (processpasstwo) {
                            //fit parameters to blockshifts
                            for (int c = 0; c < 2; c++) {
                                for (int dir = 0; dir < 2; dir++) {
                                    if (!LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir])) {
                                        std::cout << "CA correction pass failed -- can't solve linear equations for colour %d direction " << c << std::endl;
                                        processpasstwo = false;
                                    }
                                }
                            }
                        }
                    }
                    //fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
                }
                //end of initialization for CA correction pass
                //only executed if autoCA is true
            }

            // Main algorithm: Tile loop
            if (processpasstwo) {
                float* grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
                //green interpolated to optical sample points for R/B
                float* gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
#ifdef _OPENMP
                #pragma omp for schedule(dynamic, chunkSize) collapse(2)
#endif
                for (int top = -border; top < height; top += ts - border2) {
                    for (int left = -border; left < width - (W & 1); left += ts - border2) {
                        memset(bufferThr, 0, buffersizePassTwo);
                        float lblockshifts[2][2];
                        const int vblock = ((top + border) / (ts - border2)) + 1;
                        const int hblock = ((left + border) / (ts - border2)) + 1;
                        const int bottom = min(top + ts, height + border);
                        const int right  = min(left + ts, width - (W & 1) + border);
                        const int rr1 = bottom - top;
                        const int cc1 = right - left;

                        const int rrmin = top < 0 ? border : 0;
                        const int rrmax = bottom > height ? height - top : rr1;
                        const int ccmin = left < 0 ? border : 0;
                        const int ccmax = (right > width - (W & 1)) ? width - (W & 1) - left : cc1;

                        // rgb from input CFA data
                        // rgb values should be floating point number between 0 and 1
                        // after white balance multipliers are applied

#ifdef __SSE2__
                        vfloat c65535v = F2V(65535.f);
                        vmask gmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
#endif
                        for (int rr = rrmin; rr < rrmax; rr++) {
                            int row = rr + top;
                            int cc = ccmin;
                            int col = cc + left;
                            int indx = row * width + col;
                            int indx1 = rr * ts + cc;
#ifdef __SSE2__
                            int c = fc(cfa, rr, cc);
                            if (c & 1) {
                                rgb[1][indx1] = rawData[row][col] / 65535.f;
                                indx++;
                                indx1++;
                                cc++;
                                col++;
                                c = fc(cfa, rr, cc);
                            }
                            for (; cc < ccmax - 7; cc += 8, col += 8, indx += 8, indx1 += 8) {
                                vfloat val1v = LVFU(rawData[row][col]) / c65535v;
                                vfloat val2v = LVFU(rawData[row][col + 4]) / c65535v;
                                STVFU(rgb[c][indx1 >> 1], _mm_shuffle_ps(val1v, val2v, _MM_SHUFFLE(2, 0, 2, 0)));
                                vfloat gtmpv = LVFU(Gtmp[indx >> 1]);
                                STVFU(rgb[1][indx1], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(1, 1, 0, 0)), val1v));
                                STVFU(rgb[1][indx1 + 4], vself(gmask, PERMUTEPS(gtmpv, _MM_SHUFFLE(3, 3, 2, 2)), val2v));
                            }
#endif
                            for (; cc < ccmax; cc++, col++, indx++, indx1++) {
                                int c = fc(cfa, rr, cc);
                                rgb[c][indx1 >> ((c & 1) ^ 1)] = rawData[row][col] / 65535.f;

                                if ((c & 1) == 0) {
                                    rgb[1][indx1] = Gtmp[indx >> 1];
                                }
                            }
                        }

                        //fill borders
                        if (rrmin > 0) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = ccmin; cc < ccmax; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][((border2 - rr) * ts + cc) >> ((c & 1) ^ 1)];
                                    rgb[1][rr * ts + cc] = rgb[1][(border2 - rr) * ts + cc];
                                }
                            }
                        }

                        if (rrmax < rr1) {
                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                for (int cc = ccmin; cc < ccmax; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][left + cc]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + left + cc) >> 1];
                                    }
                                }
                            }
                        }

                        if (ccmin > 0) {
                            for (int rr = rrmin; rr < rrmax; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = rgb[c][(rr * ts + border2 - cc) >> ((c & 1) ^ 1)];
                                    rgb[1][rr * ts + cc] = rgb[1][rr * ts + border2 - cc];
                                }
                            }
                        }

                        if (ccmax < cc1) {
                            for (int rr = rrmin; rr < rrmax; rr++) {
                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(top + rr)][(width - cc - 2)]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][rr * ts + ccmax + cc] = Gtmp[((top + rr) * width + (width - cc - 2)) >> 1];
                                    }
                                }
                            }
                        }

                        //also, fill the image corners
                        if (rrmin > 0 && ccmin > 0) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + cc) >> ((c & 1) ^ 1)] = (rawData[border2 - rr][border2 - cc]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][rr * ts + cc] = Gtmp[((border2 - rr) * width + border2 - cc) >> 1];
                                    }
                                }
                            }
                        }

                        if (rrmax < rr1 && ccmax < cc1) {
                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(width - cc - 2)]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][(rrmax + rr)*ts + ccmax + cc] = Gtmp[((height - rr - 2) * width + (width - cc - 2)) >> 1];
                                    }
                                }
                            }
                        }

                        if (rrmin > 0 && ccmax < cc1) {
                            for (int rr = 0; rr < border; rr++) {
                                for (int cc = 0; cc < std::min(border, cc1 - ccmax); cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][(rr * ts + ccmax + cc) >> ((c & 1) ^ 1)] = (rawData[(border2 - rr)][(width - cc - 2)]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][rr * ts + ccmax + cc] = Gtmp[((border2 - rr) * width + (width - cc - 2)) >> 1];
                                    }
                                }
                            }
                        }

                        if (rrmax < rr1 && ccmin > 0) {
                            for (int rr = 0; rr < std::min(border, rr1 - rrmax); rr++) {
                                for (int cc = 0; cc < border; cc++) {
                                    int c = fc(cfa, rr, cc);
                                    rgb[c][((rrmax + rr)*ts + cc) >> ((c & 1) ^ 1)] = (rawData[(height - rr - 2)][(border2 - cc)]) / 65535.f;
                                    if ((c & 1) == 0) {
                                        rgb[1][(rrmax + rr)*ts + cc] = Gtmp[((height - rr - 2) * width + (border2 - cc)) >> 1];
                                    }
                                }
                            }
                        }
                        //end of border fill

                        if (!autoCA || fitParamsIn) {
#ifdef __SSE2__
                            const vfloat onev = F2V(1.f);
                            const vfloat epsv = F2V(eps);
#endif
                            //manual CA correction; use red/blue slider values to set CA shift parameters
                            for (int rr = 3; rr < rr1 - 3; rr++) {
                                int cc = 3 + fc(cfa, rr, 1), c = fc(cfa, rr,cc), indx = rr * ts + cc;
#ifdef __SSE2__
                                for (; cc < cc1 - 10; cc += 8, indx += 8) {
                                    //compute directional weights using image gradients
                                    vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
                                    vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
                                    vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
                                    vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
                                    vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
                                    vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));

                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                    vfloat result = (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv);
                                    STC2VFU(rgb[1][indx], result);
                                }
#endif
                                for (; cc < cc1 - 3; cc += 2, indx += 2) {
                                    //compute directional weights using image gradients
                                    float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
                                    float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
                                    float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
                                    float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));

                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
                                    rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
                                }
                            }
                        }

                        if (!autoCA) {
                            float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
                            float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
                            lblockshifts[0][0] = 2 * vfrac * cared;
                            lblockshifts[0][1] = 2 * hfrac * cared;
                            lblockshifts[1][0] = 2 * vfrac * cablue;
                            lblockshifts[1][1] = 2 * hfrac * cablue;
                        } else {
                            //CA auto correction; use CA diagnostic pass to set shift parameters
                            lblockshifts[0][0] = lblockshifts[0][1] = 0;
                            lblockshifts[1][0] = lblockshifts[1][1] = 0;
                            double powVblock = 1.0;
                            for (int i = 0; i < polyord; i++) {
                                double powHblock = powVblock;
                                for (int j = 0; j < polyord; j++) {
                                    lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
                                    lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
                                    lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
                                    lblockshifts[1][1] += powHblock * fitparams[1][1][polyord * i + j];
                                    powHblock *= hblock;
                                }
                                powVblock *= vblock;
                            }
                            constexpr float bslim = 3.99; //max allowed CA shift
                            lblockshifts[0][0] = LIM(lblockshifts[0][0], -bslim, bslim);
                            lblockshifts[0][1] = LIM(lblockshifts[0][1], -bslim, bslim);
                            lblockshifts[1][0] = LIM(lblockshifts[1][0], -bslim, bslim);
                            lblockshifts[1][1] = LIM(lblockshifts[1][1], -bslim, bslim);
                        }//end of setting CA shift parameters

                        for (int c = 0; c < 3; c += 2) {

                            //some parameters for the bilinear interpolation
                            shiftvfloor[c] = floor((float)lblockshifts[c>>1][0]);
                            shiftvceil[c] = ceil((float)lblockshifts[c>>1][0]);
                            if (lblockshifts[c>>1][0] < 0.f) {
                                std::swap(shiftvfloor[c], shiftvceil[c]);
                            }
                            shiftvfrac[c] = fabs(lblockshifts[c>>1][0] - shiftvfloor[c]);

                            shifthfloor[c] = floor((float)lblockshifts[c>>1][1]);
                            shifthceil[c] = ceil((float)lblockshifts[c>>1][1]);
                            if (lblockshifts[c>>1][1] < 0.f) {
                                std::swap(shifthfloor[c], shifthceil[c]);
                            }
                            shifthfrac[c] = fabs(lblockshifts[c>>1][1] - shifthfloor[c]);

                            GRBdir[0][c] = lblockshifts[c>>1][0] > 0 ? 2 : -2;
                            GRBdir[1][c] = lblockshifts[c>>1][1] > 0 ? 2 : -2;
                        }

                        for (int rr = 4; rr < rr1 - 4; rr++) {
                            int cc = 4 + (fc(cfa, rr, 2) & 1);
                            int c = fc(cfa, rr, cc);
                            int indx = (rr * ts + cc) >> 1;
                            int indxfc = (rr + shiftvfloor[c]) * ts + cc + shifthceil[c];
                            int indxff = (rr + shiftvfloor[c]) * ts + cc + shifthfloor[c];
                            int indxcc = (rr + shiftvceil[c]) * ts + cc + shifthceil[c];
                            int indxcf = (rr + shiftvceil[c]) * ts + cc + shifthfloor[c];
#ifdef __SSE2__
                            vfloat shifthfracv = F2V(shifthfrac[c]);
                            vfloat shiftvfracv = F2V(shiftvfrac[c]);
                            for (; cc < cc1 - 10; cc += 8, indxfc += 8, indxff += 8, indxcc += 8, indxcf += 8, indx += 4) {
                                //perform CA correction using colour ratios or colour differences
                                vfloat Ginthfloorv = vintpf(shifthfracv, LC2VFU(rgb[1][indxfc]), LC2VFU(rgb[1][indxff]));
                                vfloat Ginthceilv = vintpf(shifthfracv, LC2VFU(rgb[1][indxcc]), LC2VFU(rgb[1][indxcf]));
                                //Gint is bilinear interpolation of G at CA shift point
                                vfloat Gintv = vintpf(shiftvfracv, Ginthceilv, Ginthfloorv);

                                //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                                //but first we need to interpolate G-R/G-B to grid points...
                                STVFU(grbdiff[indx], Gintv - LVFU(rgb[c][indx]));
                                STVFU(gshift[indx], Gintv);
                            }

#endif
                            for (; cc < cc1 - 4; cc += 2, indxfc += 2, indxff += 2, indxcc += 2, indxcf += 2, ++indx) {
                                //perform CA correction using colour ratios or colour differences
                                float Ginthfloor = intp(shifthfrac[c], rgb[1][indxfc], rgb[1][indxff]);
                                float Ginthceil = intp(shifthfrac[c], rgb[1][indxcc], rgb[1][indxcf]);
                                //Gint is bilinear interpolation of G at CA shift point
                                float Gint = intp(shiftvfrac[c], Ginthceil, Ginthfloor);

                                //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
                                //but first we need to interpolate G-R/G-B to grid points...
                                grbdiff[indx] = Gint - rgb[c][indx];
                                gshift[indx] = Gint;
                            }
                        }

                        shifthfrac[0] /= 2.f;
                        shifthfrac[2] /= 2.f;
                        shiftvfrac[0] /= 2.f;
                        shiftvfrac[2] /= 2.f;

#ifdef __SSE2__
                        vfloat zd25v = F2V(0.25f);
                        vfloat onev = F2V(1.f);
                        vfloat zd5v = F2V(0.5f);
                        vfloat epsv = F2V(eps);
#endif
                        for (int rr = 8; rr < rr1 - 8; rr++) {
                            int cc = 8 + (fc(cfa, rr, 2) & 1);
                            int c = fc(cfa, rr, cc);
                            int GRBdir0 = GRBdir[0][c];
                            int GRBdir1 = GRBdir[1][c];
#ifdef __SSE2__
                            vfloat shifthfracc = F2V(shifthfrac[c]);
                            vfloat shiftvfracc = F2V(shiftvfrac[c]);
                            for (int indx = rr * ts + cc; cc < cc1 - 14; cc += 8, indx += 8) {
                                //interpolate colour difference from optical R/B locations to grid locations
                                vfloat grbdiffinthfloor = vintpf(shifthfracc, LVFU(grbdiff[(indx - GRBdir1) >> 1]), LVFU(grbdiff[indx >> 1]));
                                vfloat grbdiffinthceil = vintpf(shifthfracc, LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]), LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]));
                                //grbdiffint is bilinear interpolation of G-R/G-B at grid point
                                vfloat grbdiffint = vintpf(shiftvfracc, grbdiffinthceil, grbdiffinthfloor);

                                //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                vfloat cinv = LVFU(rgb[c][indx >> 1]);
                                vfloat rinv = LC2VFU(rgb[1][indx]);
                                vfloat RBint = rinv - grbdiffint;
                                vmask cmask = vmaskf_ge(vabsf(RBint - cinv), zd25v * (RBint + cinv));
                                if (_mm_movemask_ps((vfloat)cmask)) {
                                    // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
                                    //gradient weights using difference from G at CA shift points and G at grid points
                                    vfloat p0 = onev / (epsv + vabsf(rinv - LVFU(gshift[indx >> 1])));
                                    vfloat p1 = onev / (epsv + vabsf(rinv - LVFU(gshift[(indx - GRBdir1) >> 1])));
                                    vfloat p2 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc) >> 1])));
                                    vfloat p3 = onev / (epsv + vabsf(rinv - LVFU(gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])));

                                    grbdiffint = vself(cmask, (p0 * LVFU(grbdiff[indx >> 1]) + p1 * LVFU(grbdiff[(indx - GRBdir1) >> 1]) +
                                                  p2 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc) >> 1]) + p3 * LVFU(grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1])) / (p0 + p1 + p2 + p3), grbdiffint);

                                }
                                vfloat grbdiffold = rinv - cinv;
                                RBint = rinv - grbdiffint;
                                RBint = vself(vmaskf_gt(vabsf(grbdiffold), vabsf(grbdiffint)), RBint, cinv);
                                RBint = vself(vmaskf_lt(grbdiffold * grbdiffint, ZEROV), rinv - zd5v * (grbdiffold + grbdiffint), RBint);
                                STVFU(rgb[c][indx >> 1], RBint);
                            }
#endif
                            for (int c = fc(cfa, rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
                                float grbdiffold = rgb[1][indx] - rgb[c][indx >> 1];

                                //interpolate colour difference from optical R/B locations to grid locations
                                float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir1) >> 1], grbdiff[indx >> 1]);
                                float grbdiffinthceil = intp(shifthfrac[c], grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1], grbdiff[((rr - GRBdir0) * ts + cc) >> 1]);
                                //grbdiffint is bilinear interpolation of G-R/G-B at grid point
                                float grbdiffint = intp(shiftvfrac[c], grbdiffinthceil, grbdiffinthfloor);

                                //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                float RBint = rgb[1][indx] - grbdiffint;

                                if (fabsf(RBint - rgb[c][indx >> 1]) < 0.25f * (RBint + rgb[c][indx >> 1])) {
                                    if (fabsf(grbdiffold) > fabsf(grbdiffint)) {
                                        rgb[c][indx >> 1] = RBint;
                                    }
                                } else {

                                    //gradient weights using difference from G at CA shift points and G at grid points
                                    float p0 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[indx >> 1]));
                                    float p1 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[(indx - GRBdir1) >> 1]));
                                    float p2 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc) >> 1]));
                                    float p3 = 1.f / (eps + fabsf(rgb[1][indx] - gshift[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]));

                                    grbdiffint = (p0 * grbdiff[indx >> 1] + p1 * grbdiff[(indx - GRBdir1) >> 1] +
                                                  p2 * grbdiff[((rr - GRBdir0) * ts + cc) >> 1] + p3 * grbdiff[((rr - GRBdir0) * ts + cc - GRBdir1) >> 1]) / (p0 + p1 + p2 + p3) ;

                                    //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
                                    if (fabsf(grbdiffold) > fabsf(grbdiffint)) {
                                        rgb[c][indx >> 1] = rgb[1][indx] - grbdiffint;
                                    }
                                }

                                //if colour difference interpolation overshot the correction, just desaturate
                                if (grbdiffold * grbdiffint < 0) {
                                    rgb[c][indx >> 1] = rgb[1][indx] - 0.5f * (grbdiffold + grbdiffint);
                                }
                            }
                        }

                        // copy CA corrected results to temporary image matrix
                        for (int rr = border; rr < rr1 - border; rr++) {
                            int c = fc(cfa, rr + top, left + border + (fc(cfa, rr + top, 2) & 1));
                            int row = rr + top;
                            int cc = border + (fc(cfa, rr, 2) & 1);
                            int indx = (row * width + cc + left) >> 1;
                            int indx1 = (rr * ts + cc) >> 1;
#ifdef __SSE2__
                            for (; indx < (row * width + cc1 - border - 7 + left) >> 1; indx+=4, indx1 += 4) {
                                STVFU(RawDataTmp[indx], c65535v * LVFU(rgb[c][indx1]));
                            }
#endif
                            for (; indx < (row * width + cc1 - border + left) >> 1; indx++, indx1++) {
                                RawDataTmp[indx] = 65535.f * rgb[c][indx1];
                            }
                        }

                        if (plistener) {
                            progresscounter++;

                            if (progresscounter % 8 == 0)
#ifdef _OPENMP
                                #pragma omp critical (cacorrect)
#endif
                            {
                                progress += 4.0 * SQR(ts - border2) / (iterations * height * width);
                                progress = std::min(progress, 1.0);
                                plistener->setProgress(progress);
                            }
                        }
                    }
                }

                // copy temporary image matrix back to image matrix
#ifdef _OPENMP
                #pragma omp for
#endif

                for (int row = cb; row < height - cb; row++) {
                    int col = cb + (fc(cfa, row, 0) & 1);
                    int indx = (row * width + col) >> 1;
#ifdef __SSE2__
                    for (; col < width - 7 - cb; col += 8, indx += 4) {
                        const vfloat val = vmaxf(LVFU(RawDataTmp[indx]), ZEROV);
                        STC2VFU(rawData[row][col], val);
                    }
#endif
                    for (; col < width - cb; col += 2, indx++) {
                        rawData[row][col] = std::max(0.f, RawDataTmp[indx]);
                    }
                }

            }
            // clean up
            free(bufferThr);
        }
        if (avoidColourshift) {
            // to avoid or at least reduce the colour shift caused by raw ca correction we compute the per pixel difference factors
            // of red and blue channel and apply a gaussian blur to them.
            // Then we apply the resulting factors per pixel on the result of raw ca correction

#ifdef _OPENMP
            #pragma omp parallel
#endif
            {
#ifdef __SSE2__
                const vfloat onev = F2V(1.f);
                const vfloat twov = F2V(2.f);
                const vfloat zd5v = F2V(0.5f);
#endif
#ifdef _OPENMP
                #pragma omp for
#endif
                for (int i = 0; i < H - 2 * cb; ++i) {
                    const int firstCol = fc(cfa, i, 0) & 1;
                    const int colour = fc(cfa, i, firstCol);
                    const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
                    int j = firstCol;
#ifdef __SSE2__
                    for (; j < W - 7 - 2 * cb; j += 8) {
                        const vfloat newvals = LC2VFU(rawData[i + cb][j + cb]);
                        const vfloat oldvals = LVFU((*oldraw)[i][j / 2]);
                        vfloat factors = oldvals / newvals;
                        factors = vself(vmaskf_le(newvals, onev), onev, factors);
                        factors = vself(vmaskf_le(oldvals, onev), onev, factors);
                        STVFU((*nonGreen)[i/2][j/2], vclampf(factors, zd5v, twov));
                    }
#endif
                    for (; j < W - 2 * cb; j += 2) {
                        (*nonGreen)[i/2][j/2] = (rawData[i + cb][j + cb] <= 1.f || (*oldraw)[i][j / 2] <= 1.f) ? 1.f : rtengine::LIM((*oldraw)[i][j / 2] / rawData[i + cb][j + cb], 0.5f, 2.f);
                    }
                }

#ifdef _OPENMP
                #pragma omp single
#endif
                {
                    if (H % 2) {
                        // odd height => factors are not set in last row => use values of preceding row
                        for (int j = 0; j < (W + 1 - 2 * cb) / 2; ++j) {
                            (*redFactor)[(H - 2 * cb + 1) / 2 - 1][j] = (*redFactor)[(H - 2 * cb + 1) / 2 - 2][j];
                            (*blueFactor)[(H - 2 * cb + 1) / 2 - 1][j] = (*blueFactor)[(H - 2 * cb + 1) / 2 - 2][j];
                        }
                    }

                    if (W % 2) {
                        // odd width => factors for one channel are not set in last column => use value of preceding column
                        const int ngRow = 1 - (fc(cfa, 0, 0) & 1);
                        const int ngCol = fc(cfa, ngRow, 0) & 1;
                        const int colour = fc(cfa, ngRow, ngCol);
                        const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
                        for (int i = 0; i < (H + 1 - 2 * cb) / 2; ++i) {
                            (*nonGreen)[i][(W - 2 * cb + 1) / 2 - 1] = (*nonGreen)[i][(W - 2* cb + 1) / 2 - 2];
                        }
                    }
                }

                // blur correction factors
                gaussianBlur(*redFactor, *redFactor, (W + 1 - 2 * cb) / 2, (H + 1 - 2 * cb) / 2, 30.0);
                gaussianBlur(*blueFactor, *blueFactor, (W + 1 - 2 * cb) / 2, (H + 1 - 2 * cb) / 2, 30.0);

                // apply correction factors to avoid (reduce) colour shift
#ifdef _OPENMP
                #pragma omp for
#endif
                for (int i = 0; i < H - 2 * cb; ++i) {
                    const int firstCol = fc(cfa, i, 0) & 1;
                    const int colour = fc(cfa, i, firstCol);
                    const array2D<float>* nonGreen = colour == 0 ? redFactor : blueFactor;
                    for (int j = firstCol; j < W - 2 * cb; j += 2) {
                        rawData[i + cb][j + cb] *= (*nonGreen)[i / 2][j / 2];
                    }
                }
            }
        }
    }

    if (autoCA && fitParamsTransfer && fitParamsOut) {
        // store calculated parameters
        int index = 0;
        for (int c = 0; c < 2; ++c) {
            for (int d = 0; d < 2; ++d) {
                for (int e = 0; e < 16; ++e) {
                    fitParamsTransfer[index++] = fitparams[c][d][e];
                }
            }
        }
    }

    if (freeBuffer) {
        free(buffer);
        buffer = nullptr;
    }

    if (avoidColourshift) {
        delete oldraw;
        delete redFactor;
        delete blueFactor;
    }

    if (plistener) {
        plistener->setProgress(1.0);
    }
    return buffer;
}