SSE2 code for BadpixelsLab()

2018-02-16 13:35:10 +01:00
parent 63f14dda7e
commit ad0e05f846
2 changed files with 43 additions and 32 deletions
--- a/rtengine/PF_correct_RT.cc
+++ b/rtengine/PF_correct_RT.cc
@@ -34,15 +34,11 @@
 #include "jaggedarray.h"
 #define BENCHMARK
 #include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 using namespace std;
 namespace rtengine
 {
 extern const Settings* settings;
 void ImProcFunctions::PF_correct_RT(LabImage * src, double radius, int thresh)
 {
@@ -1243,50 +1239,68 @@ void ImProcFunctions::BadpixelsLab(LabImage * src, double radius, int thresh, in
                    float atot = 0.f;
                    float btot = 0.f;
                    float norm = 0.f;
                    float wt;
                    for (int i1 = max(0, i - halfwin + 1); i1 < min(height, i + halfwin); i1++)
                        for (int j1 = 0; j1 < j + halfwin; j1++) {
-                            wt = badpix[i1 * width + j1];
+                            float wt = badpix[i1 * width + j1];
                            atot += wt * src->a[i1][j1];
                            btot += wt * src->b[i1][j1];
                            norm += wt;
                        }
-                    if(norm > 0.f) {
+                    if(SQR(atot) + SQR(btot) < chrom * SQR(norm)) {
-                        const float a = atot / norm;
+                        src->a[i][j] = atot / norm;
-                        const float b = btot / norm;
+                        src->b[i][j] = btot / norm;
                        if(SQR(a) + SQR(b) < chrom) {
                            src->a[i][j] = a;
                            src->b[i][j] = b;
                        }
                    }
                }
            }
-            for(; j < width - halfwin; j++) { // this loop is the hot spot. Maybe worth to vectorize
+#ifdef __SSE2__
            vfloat chromv = F2V(chrom);
            vfloat threshfactorv = F2V(threshfactor);
            for(; j < width - halfwin - 3; j+=4) {
                vmask selMask = vmaskf_lt(LVFU(badpix[i * width + j]), threshfactorv);
                if (_mm_movemask_ps((vfloat)selMask)) {
                    vfloat atotv = ZEROV;
                    vfloat btotv = ZEROV;
                    vfloat normv = ZEROV;
                    for (int i1 = max(0, i - halfwin + 1); i1 < min(height, i + halfwin); i1++)
                        for (int j1 = j - halfwin + 1; j1 < j + halfwin; j1++) {
                            vfloat wtv = LVFU(badpix[i1 * width + j1]);
                            atotv += wtv * LVFU(src->a[i1][j1]);
                            btotv += wtv * LVFU(src->b[i1][j1]);
                            normv += wtv;
                        }
                    selMask = vandm(selMask, vmaskf_lt(SQRV(atotv) + SQR(btotv), chromv * SQRV(normv)));
                    if(_mm_movemask_ps((vfloat)selMask)) {
                        vfloat aOrig = LVFU(src->a[i][j]);
                        vfloat bOrig = LVFU(src->b[i][j]);
                        STVFU(src->a[i][j], vself(selMask, atotv / normv, aOrig));
                        STVFU(src->b[i][j], vself(selMask, btotv / normv, bOrig));
                    }
                }
            }
 #endif
            for(; j < width - halfwin; j++) {
                if (badpix[i * width + j] < threshfactor) {
                    float atot = 0.f;
                    float btot = 0.f;
                    float norm = 0.f;
                    float wt;
                    for (int i1 = max(0, i - halfwin + 1); i1 < min(height, i + halfwin); i1++)
                        for (int j1 = j - halfwin + 1; j1 < j + halfwin; j1++) {
-                            wt = badpix[i1 * width + j1];
+                            float wt = badpix[i1 * width + j1];
                            atot += wt * src->a[i1][j1];
                            btot += wt * src->b[i1][j1];
                            norm += wt;
                        }
-                    if(norm > 0.f) {
+                    if(SQR(atot) + SQR(btot) < chrom * SQR(norm)) {
-                        const float a = atot / norm;
+                        src->a[i][j] = atot / norm;
-                        const float b = btot / norm;
+                        src->b[i][j] = btot / norm;
                        if(SQR(a) + SQR(b) < chrom) {
                            src->a[i][j] = a;
                            src->b[i][j] = b;
                        }
                    }
                }
            }
@@ -1297,23 +1311,18 @@ void ImProcFunctions::BadpixelsLab(LabImage * src, double radius, int thresh, in
                    float atot = 0.f;
                    float btot = 0.f;
                    float norm = 0.f;
                    float wt;
                    for (int i1 = max(0, i - halfwin + 1); i1 < min(height, i + halfwin); i1++)
                        for (int j1 = j - halfwin + 1; j1 < width; j1++) {
-                            wt = badpix[i1 * width + j1];
+                            float wt = badpix[i1 * width + j1];
                            atot += wt * src->a[i1][j1];
                            btot += wt * src->b[i1][j1];
                            norm += wt;
                        }
-                    if(norm > 0.f) {
+                    if(SQR(atot) + SQR(btot) < chrom * SQR(norm)) {
-                        const float a = atot / norm;
+                        src->a[i][j] = atot / norm;
-                        const float b = btot / norm;
+                        src->b[i][j] = btot / norm;
                        if(SQR(a) + SQR(b) < chrom) {
                            src->a[i][j] = a;
                            src->b[i][j] = b;
                        }
                    }
                }
            }
--- a/rtgui/dirpyrequalizer.cc
+++ b/rtgui/dirpyrequalizer.cc
@@ -196,6 +196,7 @@ void DirPyrEqualizer::read (const ProcParams* pp, const ParamsEdited* pedited)
    */
    gamutlabConn.block (true);
    gamutlab->set_active (pp->dirpyrequalizer.gamutlab);
    gamutlab->set_sensitive (pp->dirpyrequalizer.skinprotect != 0);
    gamutlabConn.block (false);
    lastgamutlab = pp->dirpyrequalizer.gamutlab;
@@ -339,6 +340,7 @@ void DirPyrEqualizer::adjusterChanged (Adjuster* a, double newval)
                                            Glib::ustring::format(std::fixed, std::setprecision(2), threshold->getValue()))
                                   );
        } else if (a == skinprotect) {
            gamutlab->set_sensitive (skinprotect->getValue() != 0);
            listener->panelChanged (EvDirPyrEqualizerSkin,
                                    Glib::ustring::compose("%1",
                                            Glib::ustring::format(std::fixed, std::setprecision(2), skinprotect->getValue()))