locallab: speedup and a bit of cleanup for local sharpening

2017-03-26 20:43:46 +02:00
parent 0e2502a2d2
commit 67fe37f760
1 changed files with 83 additions and 170 deletions
--- a/rtengine/iplocallab.cc
+++ b/rtengine/iplocallab.cc
@@ -31,6 +31,7 @@
 #include "iccmatrices.h"
 #include "color.h"
 #include "rt_math.h"
+#include "jaggedarray.h"
 #ifdef _DEBUG
 #include "mytime.h"
 #endif
@@ -1886,7 +1887,6 @@ void ImProcFunctions::Contrast_Local (int call, float ave, LabImage * bufcontori
 // contrast - perhaps for 4 areas   if need
 // I tried shmap adaptaed to Lab, but no real gain and artifacts
    const float localtype = lumaref; // always spot area
-    //  const float localtype = ave; // always spot area
    const float ach = (float)lp.trans / 100.f;
    float reducac;

@@ -1933,7 +1933,6 @@ void ImProcFunctions::Contrast_Local (int call, float ave, LabImage * bufcontori
    float minco = +10000.f;

    if (call <= 3) {
-std::cout << lp.sens << " " << lp.qualmet << std::endl;
 #ifdef _OPENMP
        #pragma omp parallel if (multiThread)
 #endif
@@ -1949,11 +1948,8 @@ std::cout << lp.sens << " " << lp.qualmet << std::endl;
            #pragma omp for schedule(dynamic,16)
 #endif

-            for (int y = 0; y < transformed->H; y++)
-            {
-
+            for (int y = 0; y < transformed->H; y++) {
                const int loy = cy + y;
-
                const bool isZone0 = loy > lp.yc + lp.ly || loy < lp.yc - lp.lyT; // whole line is zone 0 => we can skip a lot of processing

                if(isZone0) { // outside selection and outside transition zone => no effect, keep original values
@@ -1978,14 +1974,12 @@ std::cout << lp.sens << " " << lp.qualmet << std::endl;
 #endif

                for (int x = 0; x < transformed->W; x++) {
-                    int lox = cx + x;
+                    const int lox = cx + x;

                    float rL;

                    if (lox >= (lp.xc - lp.lxL) && lox < (lp.xc + lp.lx) && (rL = original->L[y][x]) > 3.2768f) {
                        // rL > 3.2768f to avoid crash with very low gamut in rare cases ex : L=0.01 a=0.5 b=-0.9
-                        int begx = lp.xc - lp.lxL;
-                        int begy = lp.yc - lp.lyT;
                        int zone = 0;

                        float localFactor = 1.f;
@@ -2008,6 +2002,8 @@ std::cout << lp.sens << " " << lp.qualmet << std::endl;

                        float cli = 1.f;

+                        const int begx = lp.xc - lp.lxL;
+                        const int begy = lp.yc - lp.lyT;
                        if (lp.curvact) {

                            cli = (buflightc[loy - begy][lox - begx]);
@@ -2119,7 +2115,6 @@ std::cout << lp.sens << " " << lp.qualmet << std::endl;

                            if (rchro < kcr) {
                                fach *= SQR(rchro) / SQR(kcr);
-//                                fach *= (1.f / (kcr * kcr)) * rchro * rchro;
                            }
                        }

@@ -2212,7 +2207,6 @@ std::cout << lp.sens << " " << lp.qualmet << std::endl;
                    }
                }
            }
-
        }
    }
 }
@@ -2543,7 +2537,7 @@ void ImProcFunctions::InverseSharp_Local (int sp, float **loctemp, const float h

 void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const float hueplus, const float huemoins, const float hueref, const float dhue, const float chromaref, const float lumaref, const local_params & lp, LabImage * original, LabImage * transformed, int cx, int cy)
 {
-    // BENCHFUN
+     BENCHFUN
    const float localtype = lumaref; // always spot area
    const float ach = (float)lp.trans / 100.f;
    float reducac;
@@ -2573,6 +2567,7 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
    const float ahu = 1.f / (2.8f * lp.senssha - 280.f);
    const float bhu = 1.f - ahu * 2.8f * lp.senssha;

+    const bool detectHue = lp.senssha < 20.f && lp.qualmet == 1;
 #ifdef _OPENMP
    #pragma omp parallel if (multiThread)
 #endif
@@ -2588,50 +2583,63 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
 #endif

        for (int y = 0; y < transformed->H; y++) {
+
+            const int loy = cy + y;
+            const bool isZone0 = loy > lp.yc + lp.ly || loy < lp.yc - lp.lyT; // whole line is zone 0 => we can skip a lot of processing
+
+            if(isZone0) { // outside selection and outside transition zone => no effect, keep original values
+                for (int x = 0; x < transformed->W; x++) {
+                    transformed->L[y][x] = original->L[y][x];
+                }
+                continue;
+            }
+
 #ifdef __SSE2__
            int i = 0;

-            for (; i < transformed->W - 3; i += 4) {
-                vfloat av = LVFU (original->a[y][i]);
-                vfloat bv = LVFU (original->b[y][i]);
-                STVF (atan2Buffer[i], xatan2f (bv, av));
-                STVF (sqrtBuffer[i], _mm_sqrt_ps (SQRV (bv) + SQRV (av)) / c327d68v);
-            }
+            if(detectHue) {
+                for (; i < transformed->W - 3; i += 4) {
+                    vfloat av = LVFU (original->a[y][i]);
+                    vfloat bv = LVFU (original->b[y][i]);
+                    STVF (atan2Buffer[i], xatan2f (bv, av));
+                    STVF (sqrtBuffer[i], _mm_sqrt_ps (SQRV (bv) + SQRV (av)) / c327d68v);
+                }

-            for (; i < transformed->W; i++) {
-                atan2Buffer[i] = xatan2f (original->b[y][i], original->a[y][i]);
-                sqrtBuffer[i] = sqrt (SQR (original->b[y][i]) + SQR (original->a[y][i])) / 327.68f;
+                for (; i < transformed->W; i++) {
+                    atan2Buffer[i] = xatan2f (original->b[y][i], original->a[y][i]);
+                    sqrtBuffer[i] = sqrt (SQR (original->b[y][i]) + SQR (original->a[y][i])) / 327.68f;
+                }
+            } else {
+                for (; i < transformed->W - 3; i += 4) {
+                    vfloat av = LVFU (original->a[y][i]);
+                    vfloat bv = LVFU (original->b[y][i]);
+                    STVF (sqrtBuffer[i], _mm_sqrt_ps (SQRV (bv) + SQRV (av)) / c327d68v);
+                }
+                for (; i < transformed->W; i++) {
+                    sqrtBuffer[i] = sqrt (SQR (original->b[y][i]) + SQR (original->a[y][i])) / 327.68f;
+                }
            }

 #endif

-            int loy = cy + y;
-
            for (int x = 0; x < transformed->W; x++) {
                int lox = cx + x;
-#ifdef __SSE2__
-                float rhue = atan2Buffer[x];
-                float rchro = sqrtBuffer[x];
-#else
-                float rhue = xatan2f (original->b[y][x], original->a[y][x]);
-                float rchro = sqrt (SQR (original->b[y][x]) + SQR (original->a[y][x])) / 327.68f;
-#endif
-                int zone;
+                int zone = 0;
                float localFactor = 1.f;
                calcTransition (lox, loy, ach, lp, zone, localFactor);
+                if(zone == 0) { // outside selection and outside transition zone => no effect, keep original values
+                    transformed->L[y][x] = original->L[y][x];
+                    continue;
+                }
+#ifdef __SSE2__
+                float rchro = sqrtBuffer[x];
+#else
+                float rchro = sqrt (SQR (original->b[y][x]) + SQR (original->a[y][x])) / 327.68f;
+#endif
                //prepare shape detection
-                float khu = 0.f;
                float kch = 1.f;
-                bool kzon = false;
                float fach = 1.f;
                float deltachro = fabs (rchro - chromaref);
-                float deltahue = fabs (rhue - hueref);
-
-                if (deltahue > rtengine::RT_PI) {
-                    deltahue = - (deltahue - 2.f * rtengine::RT_PI);
-                }
-
-                float deltaE = 20.f * deltahue + deltachro; //pseudo deltaE between 0 and 280

                //kch to modulate action with chroma
                if (deltachro < 160.f * SQR (lp.senssha / 100.f)) {
@@ -2641,15 +2649,24 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                    float ak = 1.f / (ck - 160.f);
                    float bk = -160.f * ak;
                    kch = ak * deltachro + bk;
+                    if (lp.senssha < 40.f ) {
+                        kch = pow_F (kch, pa * lp.senssha + pb);   //increase under 40
+                    }
                }

-                if (lp.senssha < 40.f ) {
-                    kch = pow (kch, pa * lp.senssha + pb);   //increase under 40
-                }
-
-
                // algo with detection of hue ==> artifacts for noisy images  ==> denoise before
-                if (lp.senssha < 20.f) { //to try...
+                if (detectHue) { //to try...
+#ifdef __SSE2__
+                    float rhue = atan2Buffer[x];
+#else
+                    float rhue = xatan2f (original->b[y][x], original->a[y][x]);
+#endif
+                    float khu = 0.f;
+                    float deltahue = fabs (rhue - hueref);
+
+                    if (deltahue > rtengine::RT_PI) {
+                        deltahue = - (deltahue - 2.f * rtengine::RT_PI);
+                    }
                    //hue detection
                    if ((hueref + dhue) < rtengine::RT_PI && rhue < hueplus && rhue > huemoins) { //transition are good
                        if (rhue >= hueplus - delhu )  {
@@ -2661,7 +2678,6 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                        }


-                        kzon = true;
                    } else if ((hueref + dhue) >= rtengine::RT_PI && (rhue > huemoins  || rhue < hueplus )) {
                        if (rhue >= hueplus - delhu  && rhue < hueplus)  {
                            khu  = apl * rhue + bpl;
@@ -2671,7 +2687,6 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                            khu = 1.f;
                        }

-                        kzon = true;
                    }

                    if ((hueref - dhue) > -rtengine::RT_PI && rhue < hueplus && rhue > huemoins ) {
@@ -2683,7 +2698,6 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                            khu = 1.f;
                        }

-                        kzon = true;
                    } else if ((hueref - dhue) <= -rtengine::RT_PI && (rhue > huemoins  || rhue < hueplus )) {
                        if (rhue >= hueplus - delhu  && rhue < hueplus)  {
                            khu  = apl * rhue + bpl;
@@ -2693,9 +2707,10 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                            khu = 1.f;
                        }

-                        kzon = true;
                    }

+                    float deltaE = 20.f * deltahue + deltachro; //pseudo deltaE between 0 and 280
+
                    if (deltaE <  2.8f * lp.senssha) {
                        fach = khu;
                    } else {
@@ -2709,33 +2724,12 @@ void ImProcFunctions::Sharp_Local (int call, int sp, float **loctemp, const floa
                        fach *= (1.f / (kcr * kcr)) * rchro * rchro;
                    }

-                    if (lp.qualmet == 1) {
-                    } else {
-                        fach = 1.f;
-                    }
-
-                    //fach = khu ;
-
-                } else {
-                    /*
-                        float kcr = 8.f;
-                        if(lp.senssha > 30.f){
-                        if (rchro < kcr) {
-                            fach *= (1.f / (kcr)) * rchro;
-
-                        }
-                        }
-                        */
                }

                int begx = int (lp.xc - lp.lxL);
                int begy = int (lp.yc - lp.lyT);

                switch (zone) {
-                    case 0: { // outside selection and outside transition zone => no effect, keep original values
-                        transformed->L[y][x] = original->L[y][x];
-                        break;
-                    }

                    case 1: { // inside transition zone
                        float factorx = localFactor;
@@ -4831,70 +4825,36 @@ void ImProcFunctions::Lab_Local (int call, int sp, float** shbuffer, LabImage *

 //end cbdl
        if (!lp.invshar && lp.shrad > 0.42 && call < 3  && lp.sharpena) { //interior ellipse for sharpening, call = 1 and 2 only with Dcrop and simpleprocess
-
-            int GW = original->W;
-            int GH = original->H;
-            float **bufsh;//buffer por square zone
-            float **loctemp;
-            float **hbuffer;
-            int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
-            int bfw = int (lp.lx + lp.lxL) + del;
+            int bfh = call == 2 ? int (lp.ly + lp.lyT) + del : original->H; //bfw bfh real size of square zone
+            int bfw = call == 2 ? int (lp.lx + lp.lxL) + del : original->W;
+            const JaggedArray<float> loctemp (bfw, bfh);

            if (call == 2) { //call from simpleprocess
-                bufsh   = new float*[bfh];
-
-                for (int i = 0; i < bfh; i++) {
-                    bufsh[i] = new float[bfw];
-                }
+                const JaggedArray<float> bufsh (bfw, bfh, true);
+                const JaggedArray<float> hbuffer (bfw, bfh);

+                int yStart = lp.yc - lp.lyT - cy;
+                int yEnd = lp.yc + lp.ly - cy;
+                int xStart = lp.xc - lp.lxL - cx;
+                int xEnd = lp.xc + lp.lx - cx;
+                int begy = lp.yc - lp.lyT;
+                int begx = lp.xc - lp.lxL;
 #ifdef _OPENMP
-                #pragma omp parallel for
+                #pragma omp parallel for schedule(dynamic,16)
 #endif

-                for (int ir = 0; ir < bfh; ir++) //fill with 0
-                    for (int jr = 0; jr < bfw; jr++) {
-                        bufsh[ir][jr] = 0.f;
+                for (int y = yStart; y < yEnd ; y++) {
+                    int loy = cy + y;
+                    for (int x = xStart, lox = cx + x; x < xEnd; x++, lox++) {
+                        bufsh[loy - begy][lox - begx] = original->L[y][x];//fill square buffer with datas
                    }
-
-
-#ifdef _OPENMP
-                #pragma omp parallel for
-#endif
-
-                for (int y = 0; y < transformed->H ; y++) //{
-                    for (int x = 0; x < transformed->W; x++) {
-                        int lox = cx + x;
-                        int loy = cy + y;
-                        int begx = int (lp.xc - lp.lxL);
-                        int begy = int (lp.yc - lp.lyT);
-
-                        if (lox >= (lp.xc - lp.lxL) && lox < (lp.xc + lp.lx) && loy >= (lp.yc - lp.lyT) && loy < (lp.yc + lp.ly)) {
-                            bufsh[loy - begy][lox - begx] = original->L[y][x];//fill square buffer with datas
-                        }
-                    }
-
-                loctemp = new float*[bfh];//allocate temp
-
-                for (int i = 0; i < bfh; i++) {
-                    loctemp[i] = new float[bfw];
-                }
-
-                hbuffer = new float*[bfh];//allocate buffer for sharp
-
-                for (int i = 0; i < bfh; i++) {
-                    hbuffer[i] = new float[bfw];
                }

                //sharpen only square area instaed of all image
                ImProcFunctions::deconvsharpeningloc (bufsh, hbuffer, bfw, bfh, loctemp, params->locallab.shardamping, (double)params->locallab.sharradius / 100., params->locallab.shariter, params->locallab.sharamount);
            } else { //call from dcrop.cc
-                loctemp = new float*[GH];//allocate temp

-                for (int i = 0; i < GH; i++) {
-                    loctemp[i] = new float[GW];
-                }
-
-                ImProcFunctions::deconvsharpeningloc (original->L, shbuffer, GW, GH, loctemp, params->locallab.shardamping, (double)params->locallab.sharradius / 100., params->locallab.shariter, params->locallab.sharamount);
+                ImProcFunctions::deconvsharpeningloc (original->L, shbuffer, bfw, bfh, loctemp, params->locallab.shardamping, (double)params->locallab.sharradius / 100., params->locallab.shariter, params->locallab.sharamount);

            }

@@ -4912,50 +4872,10 @@ void ImProcFunctions::Lab_Local (int call, int sp, float** shbuffer, LabImage *
            //sharpen ellipse and transition
            Sharp_Local (call, sp, loctemp, hueplus, huemoins, hueref, dhue, chromaref, lumaref, lp, original, transformed, cx, cy);

-            //cleann all
-            if (call == 2  && !lp.invshar) {
-                for (int i = 0; i < bfh; i++) {
-                    delete [] loctemp[i];
-                }
-
-                delete [] loctemp;
-
-                for (int i = 0; i < bfh; i++) {
-                    delete [] bufsh[i];
-                }
-
-                delete [] bufsh;
-
-                for (int i = 0; i < bfh; i++) {
-                    delete [] hbuffer[i];
-                }
-
-                delete [] hbuffer;
-            } else {
-                for (int i = 0; i < GH; i++) {
-                    delete [] loctemp[i];
-                }
-
-                delete [] loctemp;
-
-            }
-
-            /*            for (int i = 0; i < GH; i++) {
-                            delete [] hbuffer[i];
-                        }
-
-                        delete [] hbuffer;
-            */
-
        } else if (lp.invshar && lp.shrad > 0.42 && call < 3 && lp.sharpena) {
            int GW = original->W;
            int GH = original->H;
-
-            float **loctemp = new float*[GH];
-
-            for (int i = 0; i < GH; i++) {
-                loctemp[i] = new float[GW];
-            }
+            const JaggedArray<float> loctemp (GW, GH);

            ImProcFunctions::deconvsharpeningloc (original->L, shbuffer, GW, GH, loctemp, params->locallab.shardamping, (double)params->locallab.sharradius / 100., params->locallab.shariter, params->locallab.sharamount);

@@ -4971,13 +4891,6 @@ void ImProcFunctions::Lab_Local (int call, int sp, float** shbuffer, LabImage *
            }

            InverseSharp_Local (sp, loctemp, hueplus, huemoins, hueref, dhue, chromaref, lumaref, lp, original, transformed, cx, cy);
-
-            for (int i = 0; i < GH; i++) {
-                delete [] loctemp[i];
-            }
-
-            delete [] loctemp;
-
        }

        //      }