Speedup for ca-correction of pixelshift files, also fixed a memory leak

2018-05-20 13:59:50 +02:00
parent 68fabf0be5
commit ac78dd311e
3 changed files with 83 additions and 83 deletions
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -111,7 +111,7 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
 using namespace std;
 using namespace rtengine;

-void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
+float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
 {
 // multithreaded and vectorized by Ingo Weyrich
    constexpr int ts = 128;
@@ -124,7 +124,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
        for(int j = 0; j < 2; j++)
            if(FC(i, j) == 3) {
                printf("CA correction supports only RGB Colour filter arrays\n");
-                return;
+                return buffer;
            }

    volatile double progress = 0.0;
@@ -135,19 +135,6 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

    // local variables
    const int width = W + (W & 1), height = H;
-
-    //temporary array to store simple interpolation of G
-    if (!buffer) {
-        buffer = (float (*)) malloc ((height * width) / 2 * sizeof (float) + (height * width) * sizeof(float) / 2);
-    }
-    float *Gtmp = buffer;
-    float *RawDataTmp = buffer + (height * width) / 2;
-
-    float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
-
-    // Because we can't break parallel processing, we need a switch do handle the errors
-    bool processpasstwo = true;
-
    constexpr int border = 8;
    constexpr int border2 = 16;

@@ -156,13 +143,27 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1);
    const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1);

+    //temporary array to store simple interpolation of G
+    if (!buffer) {
+        buffer = static_cast<float*>(malloc ((height * width + vblsz * hblsz * (2 * 2 + 1)) * sizeof(float)));
+    }
+    float *Gtmp = buffer;
+    float *RawDataTmp = buffer + (height * width) / 2;
+
    //block CA shift values and weight assigned to block
-    float* const blockwt = static_cast<float*>(calloc(vblsz * hblsz * (2 * 2 + 1), sizeof(float)));
+    float *const blockwt = buffer + (height * width);
+    memset(blockwt, 0, vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
    float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);

+    float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
+
+    // Because we can't break parallel processing, we need a switch do handle the errors
+    bool processpasstwo = true;
+
    double fitparams[2][2][16];
    const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
    if(autoCA && fitParamsSet) {
+        // use stored parameters
        int index = 0;
        for(int c = 0; c < 2; ++c) {
            for(int d = 0; d < 2; ++d) {
@@ -186,22 +187,18 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

        int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];

-        //local quadratic fit to shift data within a tile
-        float   coeff[2][3][2];
-        //measured CA shift parameters for a tile
-        float   CAshift[2][2];
        //polynomial fit coefficients
        //residual CA shift amount within a plaquette
        float   shifthfrac[3], shiftvfrac[3];
-        //per thread data for evaluation of block CA shift variance
-        float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};

        // assign working space
        constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
-        char *bufferThr = (char *) malloc(buffersize);
-        char *data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+        constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
+        char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);

-        // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
+        char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
+
+        // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache

        //rgb data in a tile
        float* rgb[3];
@@ -209,25 +206,29 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
        rgb[1]         = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
        rgb[2]         = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);

-        //high pass filter for R/B in vertical direction
-        float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
-        //high pass filter for R/B in horizontal direction
-        float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
-        //low pass filter for R/B in horizontal direction
-        float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
-        //low pass filter for R/B in vertical direction
-        float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
-        //low pass filter for colour differences in horizontal direction
-        float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
-        //low pass filter for colour differences in vertical direction
-        float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
-        float *grbdiff = rbhpfh; // there is no overlap in buffer usage => share
-        //green interpolated to optical sample points for R/B
-        float *gshift  = rbhpfv; // there is no overlap in buffer usage => share
-
-
        if (autoCA && !fitParamsSet) {
+            //high pass filter for R/B in vertical direction
+            float *rbhpfh  = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
+            //high pass filter for R/B in horizontal direction
+            float *rbhpfv  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
+            //low pass filter for R/B in horizontal direction
+            float *rblpfh  = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
+            //low pass filter for R/B in vertical direction
+            float *rblpfv  = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
+            //low pass filter for colour differences in horizontal direction
+            float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
+            //low pass filter for colour differences in vertical direction
+            float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
            // Main algorithm: Tile loop calculating correction parameters per tile
+
+            //local quadratic fit to shift data within a tile
+            float coeff[2][3][2];
+            //measured CA shift parameters for a tile
+            float CAshift[2][2];
+
+            //per thread data for evaluation of block CA shift variance
+            float   blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
+
            #pragma omp for collapse(2) schedule(dynamic) nowait
            for (int top = -border ; top < height; top += ts - border2)
                for (int left = -border; left < width - (W & 1); left += ts - border2) {
@@ -763,11 +764,14 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const

        // Main algorithm: Tile loop
        if(processpasstwo) {
+            float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
+            //green interpolated to optical sample points for R/B
+            float *gshift  = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
            #pragma omp for schedule(dynamic) collapse(2) nowait

            for (int top = -border; top < height; top += ts - border2)
                for (int left = -border; left < width - (W & 1); left += ts - border2) {
-                    memset(bufferThr, 0, buffersize);
+                    memset(bufferThr, 0, buffersizePassTwo);
                    float lblockshifts[2][2];
                    const int vblock = ((top + border) / (ts - border2)) + 1;
                    const int hblock = ((left + border) / (ts - border2)) + 1;
@@ -913,25 +917,42 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                    //end of border fill
                    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-                    if (!autoCA) {
+                    if (!autoCA || fitParamsIn) {
+#ifdef __SSE2__
+                        const vfloat onev = F2V(1.f);
+                        const vfloat epsv = F2V(eps);
+#endif
+
                        //manual CA correction; use red/blue slider values to set CA shift parameters
-                        for (int rr = 3; rr < rr1 - 3; rr++)
-                            for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
-                                int c = FC(rr, cc);
-
-                                if (c != 1) {
-                                    //compute directional weights using image gradients
-                                    float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                    float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                    float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
-                                    float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
-
-                                    //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                                    rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
-                                }
+                        for (int rr = 3; rr < rr1 - 3; rr++) {
+                            int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
+#ifdef __SSE2__
+                            for (; cc < cc1 - 10; cc += 8, indx += 8) {
+                                //compute directional weights using image gradients
+                                vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
+                                vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
+                                vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
+                                vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
+                                vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
+                                vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));

+                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
                            }
+#endif
+                            for (; cc < cc1 - 3; cc += 2, indx += 2) {
+                                //compute directional weights using image gradients
+                                float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
+                                float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
+                                float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
+                                float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));

+                                //store in rgb array the interpolated G value at R/B grid points using directional weighted average
+                                rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
+                            }
+                        }
+                    }
+                    if (!autoCA) {
                        float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
                        float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
                        lblockshifts[0][0] = 2 * vfrac * cared;
@@ -940,32 +961,12 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
                        lblockshifts[1][1] = 2 * hfrac * cablue;
                    } else {
                        //CA auto correction; use CA diagnostic pass to set shift parameters
-                        if (fitParamsIn) {
-                            for (int rr = 3; rr < rr1 - 3; rr++) {
-                                for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
-                                    int c = FC(rr, cc);
-
-                                    if (c != 1) {
-                                        //compute directional weights using image gradients
-                                        float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
-                                        float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
-                                        float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
-                                        float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
-
-                                        //store in rgb array the interpolated G value at R/B grid points using directional weighted average
-                                        rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
-                                    }
-                                }
-                            }
-                        }
-
                        lblockshifts[0][0] = lblockshifts[0][1] = 0;
                        lblockshifts[1][0] = lblockshifts[1][1] = 0;
                        double powVblock = 1.0;
                        for (int i = 0; i < polyord; i++) {
                            double powHblock = powVblock;
                            for (int j = 0; j < polyord; j++) {
-                                //printf("i= %d j= %d polycoeff= %f \n",i,j,fitparams[0][0][polyord*i+j]);
                                lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
                                lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
                                lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
@@ -1187,6 +1188,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
    }

    if(autoCA && fitParamsTransfer && fitParamsOut) {
+        // store calculated parameters
        int index = 0;
        for(int c = 0; c < 2; ++c) {
            for(int d = 0; d < 2; ++d) {
@@ -1201,9 +1203,8 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
        free(buffer);
    }

-    free(blockwt);
-
    if(plistener) {
        plistener->setProgress(1.0);
    }
+    return buffer;
 }
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -2017,8 +2017,7 @@ void RawImageSource::preprocess  (const RAWParams &raw, const LensProfParams &le
        if(numFrames == 4) {
            StopWatch Stop1("ps ca correction");
            double fitParams[64];
-            float *buffer = nullptr;
-            CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, buffer, false);
+            float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
            for(int i = 1; i < 3; ++i) {
                CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
            }
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -245,7 +245,7 @@ protected:
    inline  void interpolate_row_rb     (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
    inline  void interpolate_row_rb_mul_pp (float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);

-    void CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float *buffer = nullptr, bool freeBuffer = true);
+    float* CA_correct_RT  (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float * buffer = nullptr, bool freeBuffer = true);
    void ddct8x8s(int isgn, float a[8][8]);
    void processRawWhitepoint (float expos, float preser, array2D<float> &rawData);  // exposure before interpolation