Merge pull request #4268 from Beep6581/rgbroc_speedup

Rgbroc speedup
2017-12-31 14:43:04 +01:00 · 2017-12-31 14:43:04 +01:00 · 4c3e7b8efa
commit 4c3e7b8efa
parent 7efedf390c 3dcce23c24
4 changed files with 412 additions and 272 deletions
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@ -309,6 +309,38 @@ public:

 #if defined( __SSE2__ ) && defined( __x86_64__ )

+
+    // Looks up four table positions at once and blends the two neighbouring
+    // entries of each via vintpf (presumably a linear interpolation - confirm
+    // against vintpf's definition).
+    // NOTE: This function requires LUTs which clip only at the lower bound.
+    vfloat cb(vfloat indexv) const
+    {
+        static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
+
+        // Clamp to [0, maxIndexFloat] and convert to integer values (truncating).
+        // Extract out of the SSE register because all lookup operations use
+        // regular addresses.
+        vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv));
+        vint indexes = _mm_cvttps_epi32(clampedIndexes);
+        int indexArray[4];
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
+
+        // Load data from the table. This reads more than necessary (a full vector
+        // starting at each index, although only its first two lanes are used), but
+        // there do not seem to be more granular operations (though we could try
+        // non-SSE).
+        // Cast to int for convenience in the next operation (partial transpose).
+        vint values[4];
+        for (int i = 0; i < 4; ++i) {
+            values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
+        }
+
+        // Partial 4x4 transpose operation. We want two new vectors, the first consisting
+        // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
+        __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
+        __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
+        vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
+        vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
+
+        // The blend weight is taken from the index clamped at the lower bound only,
+        // so for indices above maxIndexFloat the weight is not limited to 1.
+        vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
+        return vintpf(diff, upper, lower);
+    }
+
    // NOTE: This version requires LUTs which clip at upper and lower bounds
    // (which is the default).
    vfloat operator[](vfloat indexv) const
@ -340,6 +372,37 @@ public:
        vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
        return vintpf(diff, upper, lower);
    }
+
+    // Looks up four table positions at once and blends the two neighbouring
+    // entries of each via vintpf (presumably a linear interpolation - confirm
+    // against vintpf's definition).
+    // NOTE: This version requires LUTs which do not clip at upper and lower bounds
+    vfloat operator()(vfloat indexv) const
+    {
+        static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
+
+        // Clamp to [0, maxsf] and convert to integer values (truncating).
+        // Extract out of the SSE register because all lookup operations use
+        // regular addresses.
+        vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxsf), indexv));
+        vint indexes = _mm_cvttps_epi32(clampedIndexes);
+        int indexArray[4];
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
+
+        // Load data from the table. This reads more than necessary (a full vector
+        // starting at each index, although only its first two lanes are used), but
+        // there do not seem to be more granular operations (though we could try
+        // non-SSE).
+        // Cast to int for convenience in the next operation (partial transpose).
+        vint values[4];
+        for (int i = 0; i < 4; ++i) {
+            values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
+        }
+
+        // Partial 4x4 transpose operation. We want two new vectors, the first consisting
+        // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
+        __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
+        __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
+        vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
+        vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
+
+        // The blend weight uses the unclamped index, so it is negative for indices
+        // below zero and is not limited to 1 for indices above maxsf.
+        vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
+        return vintpf(diff, upper, lower);
+    }
 #ifdef __SSE4_1__
    template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
    vfloat operator[](vint idxv ) const
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@ -1783,6 +1783,71 @@ void Color::Lab2XYZ(vfloat L, vfloat a, vfloat b, vfloat &x, vfloat &y, vfloat &
 }
 #endif // __SSE2__

+// Converts `width` RGB pixels (separate R, G and B arrays) to Lab, writing the
+// results to the L, a and b arrays. `wp` is the 3x3 matrix applied to each RGB
+// triple to obtain XYZ. With SSE2, pixels are processed four at a time and the
+// remaining (width % 4) pixels are handled by the scalar loop at the end.
+void Color::RGB2Lab(float *R, float *G, float *B, float *L, float *a, float *b, const float wp[3][3], int width)
+{
+
+#ifdef __SSE2__
+    // Pre-divide the X and Z rows of the matrix by the D50 white point so that the
+    // vector loop needs no divisions (the original author estimates this reduces
+    // the number of divisions by width/2 - 6).
+    float wpn[3][3];
+    for(int i = 0; i < 3; ++i) {
+        wpn[0][i] = wp[0][i] / Color::D50x;
+        wpn[1][i] = wp[1][i];
+        wpn[2][i] = wp[2][i] / Color::D50z;
+    }
+
+    vfloat maxvalfv = F2V(MAXVALF);
+    vfloat c116v = F2V(116.f);
+    vfloat c5242d88v = F2V(5242.88f);
+    vfloat c500v = F2V(500.f);
+    vfloat c200v = F2V(200.f);
+#endif
+    int i = 0;
+#ifdef __SSE2__
+    for(;i < width - 3; i+=4) {
+        // Unaligned loads: the input rows are not assumed to be aligned.
+        const vfloat rv = LVFU(R[i]);
+        const vfloat gv = LVFU(G[i]);
+        const vfloat bv = LVFU(B[i]);
+        const vfloat xv = F2V(wpn[0][0]) * rv + F2V(wpn[0][1]) * gv + F2V(wpn[0][2]) * bv;
+        const vfloat yv = F2V(wpn[1][0]) * rv + F2V(wpn[1][1]) * gv + F2V(wpn[1][2]) * bv;
+        const vfloat zv = F2V(wpn[2][0]) * rv + F2V(wpn[2][1]) * gv + F2V(wpn[2][2]) * bv;
+
+        // Non-zero movemask means at least one of x, y, z exceeds MAXVALF in at
+        // least one of the four pixels.
+        vmask maxMask = vmaskf_gt(vmaxf(xv, vmaxf(yv, zv)), maxvalfv);
+        if (_mm_movemask_ps((vfloat)maxMask)) {
+            // take slower code path for all 4 pixels if one of the values is > MAXVALF. Still faster than non SSE2 version
+            for(int k = 0; k < 4; ++k) {
+                // Lane access via subscript on the vector type (compiler vector extension).
+                float x = xv[k];
+                float y = yv[k];
+                float z = zv[k];
+                // Use the cachef table up to 65535, the explicit cube root above it.
+                float fx = (x <= 65535.f ? cachef[x] : (327.68f * xcbrtf(x / MAXVALF)));
+                float fy = (y <= 65535.f ? cachef[y] : (327.68f * xcbrtf(y / MAXVALF)));
+                float fz = (z <= 65535.f ? cachef[z] : (327.68f * xcbrtf(z / MAXVALF)));
+
+                L[i + k] = (116.f *  fy - 5242.88f); //5242.88=16.0*327.68;
+                a[i + k] = (500.f * (fx - fy) );
+                b[i + k] = (200.f * (fy - fz) );
+            }
+        } else {
+            // All values are within the table range: vectorised table lookups.
+            const vfloat fx = cachef[xv];
+            const vfloat fy = cachef[yv];
+            const vfloat fz = cachef[zv];
+
+            STVFU(L[i], c116v *  fy - c5242d88v); //5242.88=16.0*327.68;
+            STVFU(a[i], c500v * (fx - fy));
+            STVFU(b[i], c200v * (fy - fz));
+        }
+    }
+#endif
+    // Scalar loop: the tail after the vector loop, or every pixel without SSE2.
+    // It uses the original matrix wp (not wpn) and delegates to XYZ2Lab.
+    // NOTE(review): XYZ2Lab presumably applies the D50 normalisation itself - confirm.
+    for(;i < width; ++i) {
+        const float rv = R[i];
+        const float gv = G[i];
+        const float bv = B[i];
+        float x = wp[0][0] * rv + wp[0][1] * gv + wp[0][2] * bv;
+        float y = wp[1][0] * rv + wp[1][1] * gv + wp[1][2] * bv;
+        float z = wp[2][0] * rv + wp[2][1] * gv + wp[2][2] * bv;
+        XYZ2Lab(x, y, z, L[i], a[i], b[i]);
+    }
+}
+
 void Color::XYZ2Lab(float X, float Y, float Z, float &L, float &a, float &b)
 {

--- a/rtengine/color.h
+++ b/rtengine/color.h
@ -475,7 +475,7 @@ public:
    * @param b channel [-42000 ; +42000] ; can be more than 42000 (return value)
    */
    static void XYZ2Lab(float x, float y, float z, float &L, float &a, float &b);
-
+
+    /**
+    * @brief Convert a row of RGB values to Lab
+    * @param R red channel, array of 'width' values
+    * @param G green channel, array of 'width' values
+    * @param B blue channel, array of 'width' values
+    * @param L 'L' channel, array of 'width' values (return value)
+    * @param a channel, array of 'width' values (return value)
+    * @param b channel, array of 'width' values (return value)
+    * @param wp 3x3 matrix applied to each RGB triple to obtain XYZ before the Lab conversion
+    * @param width number of pixels to convert
+    */
+    static void RGB2Lab(float *R, float *G, float *B, float *L, float *a, float *b, const float wp[3][3], int width);

    /**
    * @brief Convert Lab in Yuv
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@ -48,6 +48,227 @@
 #undef CLIPD
 #define CLIPD(a) ((a)>0.0f?((a)<1.0f?(a):1.0f):0.0f)

+namespace {
+
+using namespace rtengine;
+// begin of helper function for rgbProc()
+void shadowToneCurve(const LUTf &shtonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize) {
+    // Multiplies every pixel of the tile by the factor that shtonecurve yields
+    // for the pixel's weighted sum 0.299 R + 0.587 G + 0.114 B.
+    // The tile buffers are addressed as [tileRow * tileSize + tileCol].
+#ifdef __SSE2__
+    const vfloat weightRv = F2V(0.299f);
+    const vfloat weightGv = F2V(0.587f);
+    const vfloat weightBv = F2V(0.114f);
+#endif
+
+    for (int row = istart, ti = 0; row < tH; ++row, ++ti) {
+        const int rowStart = ti * tileSize;
+        int col = jstart;
+        int tj = 0;
+#ifdef __SSE2__
+        // four pixels per iteration
+        for (; col < tW - 3; col += 4, tj += 4) {
+            const int pos = rowStart + tj;
+            const vfloat redv = LVF(rtemp[pos]);
+            const vfloat greenv = LVF(gtemp[pos]);
+            const vfloat bluev = LVF(btemp[pos]);
+
+            //shadow tone curve
+            const vfloat lumav = weightRv * redv + weightGv * greenv + weightBv * bluev;
+            const vfloat scalev = shtonecurve(lumav);
+            STVF(rtemp[pos], redv * scalev);
+            STVF(gtemp[pos], greenv * scalev);
+            STVF(btemp[pos], bluev * scalev);
+        }
+#endif
+        // remaining pixels of the row (all of them without SSE2)
+        for (; col < tW; ++col, ++tj) {
+            const int pos = rowStart + tj;
+            const float red = rtemp[pos];
+            const float green = gtemp[pos];
+            const float blue = btemp[pos];
+
+            //shadow tone curve
+            const float luma = (0.299f * red + 0.587f * green + 0.114f * blue);
+            const float scale = shtonecurve[luma];
+            rtemp[pos] = red * scale;
+            gtemp[pos] = green * scale;
+            btemp[pos] = blue * scale;
+        }
+    }
+}
+
+void highlightToneCurve(const LUTf &hltonecurve, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize, float exp_scale, float comp, float hlrange) {
+    // Scales every pixel of the tile by the mean of the per-channel highlight
+    // tone factors. The tile buffers are addressed as [tileRow * tileSize + tileCol].
+
+    // Scalar treatment of the pixel at buffer position 'pos'. Channel values below
+    // MAXVALF are looked up in hltonecurve, all others are computed by
+    // CurveFactory::hlcurve.
+    //TODO: proper treatment of out-of-gamut colors
+    //float tonefactor = hltonecurve[(0.299f*r+0.587f*g+0.114f*b)];
+    const auto scalePixel = [&](int pos) {
+        const float red = rtemp[pos];
+        const float green = gtemp[pos];
+        const float blue = btemp[pos];
+        const auto factorR = red < MAXVALF ? hltonecurve[red] : CurveFactory::hlcurve (exp_scale, comp, hlrange, red);
+        const auto factorG = green < MAXVALF ? hltonecurve[green] : CurveFactory::hlcurve (exp_scale, comp, hlrange, green);
+        const auto factorB = blue < MAXVALF ? hltonecurve[blue] : CurveFactory::hlcurve (exp_scale, comp, hlrange, blue);
+        const float tonefactor = (factorR + factorG + factorB) / 3.0;
+
+        // note: tonefactor includes exposure scaling, that is here exposure slider and highlight compression takes place
+        rtemp[pos] = red * tonefactor;
+        gtemp[pos] = green * tonefactor;
+        btemp[pos] = blue * tonefactor;
+    };
+
+#ifdef __SSE2__
+    const vfloat divisorv = F2V(3.f);
+    const vfloat limitv = F2V(MAXVALF);
+#endif
+
+    for (int row = istart, ti = 0; row < tH; ++row, ++ti) {
+        const int rowStart = ti * tileSize;
+        int col = jstart;
+        int tj = 0;
+#ifdef __SSE2__
+        for (; col < tW - 3; col += 4, tj += 4) {
+            const int pos = rowStart + tj;
+            const vfloat redv = LVF(rtemp[pos]);
+            const vfloat greenv = LVF(gtemp[pos]);
+            const vfloat bluev = LVF(btemp[pos]);
+
+            const vmask atLimit = vmaskf_ge(vmaxf(redv, vmaxf(greenv, bluev)), limitv);
+            if (_mm_movemask_ps((vfloat)atLimit)) {
+                // at least one channel value reaches MAXVALF: handle these four pixels one by one
+                for (int k = 0; k < 4; ++k) {
+                    scalePixel(pos + k);
+                }
+            } else {
+                const vfloat tonefactorv = (hltonecurve.cb(redv) + hltonecurve.cb(greenv) + hltonecurve.cb(bluev)) / divisorv;
+                // note: tonefactor includes exposure scaling, that is here exposure slider and highlight compression takes place
+                STVF(rtemp[pos], redv * tonefactorv);
+                STVF(gtemp[pos], greenv * tonefactorv);
+                STVF(btemp[pos], bluev * tonefactorv);
+            }
+        }
+#endif
+        // remaining pixels of the row (all of them without SSE2)
+        for (; col < tW; ++col, ++tj) {
+            scalePixel(rowStart + tj);
+        }
+    }
+}
+
+void proPhotoBlue(float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize) {
+    // this is a hack to avoid the blue=>black bug (Issue 2141)
+
+    // Pixels whose red or green value is exactly zero get their HSV saturation
+    // multiplied by 0.99; all other pixels are left untouched.
+    const auto fixPixel = [&](int pos) {
+        float red = rtemp[pos];
+        float green = gtemp[pos];
+
+        if (red == 0.0f || green == 0.0f) {
+            float blue = btemp[pos];
+            float hue, sat, val;
+            Color::rgb2hsv (red, green, blue, hue, sat, val);
+            sat *= 0.99f;
+            Color::hsv2rgb (hue, sat, val, rtemp[pos], gtemp[pos], btemp[pos]);
+        }
+    };
+
+    for (int row = istart, ti = 0; row < tH; ++row, ++ti) {
+        const int rowStart = ti * tileSize;
+        int col = jstart;
+        int tj = 0;
+#ifdef __SSE2__
+        // The vector code only detects whether any of four pixels needs the fix;
+        // the fix itself is always done by the scalar code.
+        for (; col < tW - 3; col += 4, tj += 4) {
+            const int pos = rowStart + tj;
+            const vfloat redv = LVF(rtemp[pos]);
+            const vfloat greenv = LVF(gtemp[pos]);
+            const vmask anyZero = vorm(vmaskf_eq(redv, ZEROV), vmaskf_eq(greenv, ZEROV));
+            if (!_mm_movemask_ps((vfloat)anyZero)) {
+                continue;
+            }
+            for (int k = 0; k < 4; ++k) {
+                fixPixel(pos + k);
+            }
+        }
+#endif
+        // remaining pixels of the row (all of them without SSE2)
+        for (; col < tW; ++col, ++tj) {
+            fixPixel(rowStart + tj);
+        }
+    }
+}
+
+void customToneCurve(const ToneCurve &customToneCurve, ToneCurveParams::TcMode curveMode, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize, PerceptualToneCurveState ptcApplyState) {
+    // Applies the user tone curve to the tile in place. curveMode selects the
+    // concrete curve class customToneCurve is cast to; for any other mode the
+    // tile is left unchanged. The tile buffers are addressed as
+    // [tileRow * tileSize + tileCol].
+    const int rows = tH - istart;
+    const int cols = tW - jstart;
+
+    switch (curveMode) {
+        case ToneCurveParams::TcMode::STD: { // Standard
+            const StandardToneCurve& curve = static_cast<const StandardToneCurve&> (customToneCurve);
+
+            // one batch call per tile row
+            for (int ti = 0; ti < rows; ++ti) {
+                const int rowStart = ti * tileSize;
+                curve.BatchApply (0, cols, &rtemp[rowStart], &gtemp[rowStart], &btemp[rowStart]);
+            }
+
+            break;
+        }
+
+        case ToneCurveParams::TcMode::FILMLIKE: { // Adobe like
+            const AdobeToneCurve& curve = static_cast<const AdobeToneCurve&> (customToneCurve);
+
+            // this mode does not clip the values before applying the curve
+            for (int ti = 0; ti < rows; ++ti) {
+                for (int tj = 0; tj < cols; ++tj) {
+                    const int pos = ti * tileSize + tj;
+                    curve.Apply (rtemp[pos], gtemp[pos], btemp[pos]);
+                }
+            }
+
+            break;
+        }
+
+        case ToneCurveParams::TcMode::SATANDVALBLENDING: { // apply the curve on the saturation and value channels
+            const SatAndValueBlendingToneCurve& curve = static_cast<const SatAndValueBlendingToneCurve&> (customToneCurve);
+
+            for (int ti = 0; ti < rows; ++ti) {
+                for (int tj = 0; tj < cols; ++tj) {
+                    const int pos = ti * tileSize + tj;
+                    rtemp[pos] = CLIP<float> (rtemp[pos]);
+                    gtemp[pos] = CLIP<float> (gtemp[pos]);
+                    btemp[pos] = CLIP<float> (btemp[pos]);
+                    curve.Apply (rtemp[pos], gtemp[pos], btemp[pos]);
+                }
+            }
+
+            break;
+        }
+
+        case ToneCurveParams::TcMode::WEIGHTEDSTD: { // apply the curve to the rgb channels, weighted
+            const WeightedStdToneCurve& curve = static_cast<const WeightedStdToneCurve&> (customToneCurve);
+
+            for (int ti = 0; ti < rows; ++ti) {
+                for (int tj = 0; tj < cols; ++tj) {
+                    const int pos = ti * tileSize + tj;
+                    rtemp[pos] = CLIP<float> (rtemp[pos]);
+                    gtemp[pos] = CLIP<float> (gtemp[pos]);
+                    btemp[pos] = CLIP<float> (btemp[pos]);
+                    curve.Apply (rtemp[pos], gtemp[pos], btemp[pos]);
+                }
+            }
+
+            break;
+        }
+
+        case ToneCurveParams::TcMode::LUMINANCE: { // apply the curve to the luminance channel
+            const LuminanceToneCurve& curve = static_cast<const LuminanceToneCurve&> (customToneCurve);
+
+            for (int ti = 0; ti < rows; ++ti) {
+                for (int tj = 0; tj < cols; ++tj) {
+                    const int pos = ti * tileSize + tj;
+                    rtemp[pos] = CLIP<float> (rtemp[pos]);
+                    gtemp[pos] = CLIP<float> (gtemp[pos]);
+                    btemp[pos] = CLIP<float> (btemp[pos]);
+                    curve.Apply (rtemp[pos], gtemp[pos], btemp[pos]);
+                }
+            }
+
+            break;
+        }
+
+        case ToneCurveParams::TcMode::PERCEPTUAL: { // apply curve while keeping color appearance constant
+            const PerceptualToneCurve& curve = static_cast<const PerceptualToneCurve&> (customToneCurve);
+
+            for (int ti = 0; ti < rows; ++ti) {
+                for (int tj = 0; tj < cols; ++tj) {
+                    const int pos = ti * tileSize + tj;
+                    rtemp[pos] = CLIP<float> (rtemp[pos]);
+                    gtemp[pos] = CLIP<float> (gtemp[pos]);
+                    btemp[pos] = CLIP<float> (btemp[pos]);
+                    curve.Apply (rtemp[pos], gtemp[pos], btemp[pos], ptcApplyState);
+                }
+            }
+
+            break;
+        }
+
+        default:
+            // any other mode: nothing to apply
+            break;
+    }
+}
+
+void fillEditFloat(float *editIFloatTmpR, float *editIFloatTmpG, float *editIFloatTmpB, float *rtemp, float *gtemp, float *btemp, int istart, int tH, int jstart, int tW, int tileSize) {
+    // Fills the three edit buffers with the tile's values passed through
+    // Color::gamma2curve and divided by 65535. Source and destination use the
+    // same [tileRow * tileSize + tileCol] addressing.
+    const int rows = tH - istart;
+    const int cols = tW - jstart;
+
+    for (int ti = 0; ti < rows; ++ti) {
+        const int rowStart = ti * tileSize;
+
+        for (int tj = 0; tj < cols; ++tj) {
+            const int pos = rowStart + tj;
+            editIFloatTmpR[pos] = Color::gamma2curve[rtemp[pos]] / 65535.f;
+            editIFloatTmpG[pos] = Color::gamma2curve[gtemp[pos]] / 65535.f;
+            editIFloatTmpB[pos] = Color::gamma2curve[btemp[pos]] / 65535.f;
+        }
+    }
+}
+// end of helper function for rgbProc()
+
+}
+
 namespace rtengine
 {

@ -3332,8 +3553,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
    float chMixBG = float (params->chmixer.blue[1]);
    float chMixBB = float (params->chmixer.blue[2]);

-    int shHighlights = params->sh.highlights;
-    int shShadows = params->sh.shadows;
+    // Shadows/highlights amounts converted from percent to a fraction. These must
+    // be float: stored in an int the quotient is truncated, so every setting
+    // below 100 would become 0 and the factor formulas further down would have
+    // no effect.
+    float shHighlights = params->sh.highlights / 100.f;
+    float shShadows = params->sh.shadows / 100.f;
    bool blackwhite = params->blackwhite.enabled;
    bool complem = params->blackwhite.enabledcc;
    float bwr = float (params->blackwhite.mixerRed);
@ -3501,15 +3722,13 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                            float g = gtemp[ti * TS + tj];
                            float b = btemp[ti * TS + tj];

-                            double mapval = 1.0 + shmap->map[i][j];
-                            double factor = 1.0;
+                            float mapval = 1.f + shmap->map[i][j];
+                            float factor = 1.f;

-                            if (processSH) {
-                                if (mapval > h_th) {
-                                    factor = (h_th + (100.0 - shHighlights) * (mapval - h_th) / 100.0) / mapval;
-                                } else if (mapval < s_th) {
-                                    factor = (s_th - (100.0 - shShadows) * (s_th - mapval) / 100.0) / mapval;
-                                }
+                            if (mapval > h_th) {
+                                factor = (1.f - shHighlights) + shHighlights * h_th / mapval;
+                            } else if (mapval < s_th) {
+                                factor = (s_th - (1.f - shShadows) * (s_th - mapval)) / mapval;
                            }

                            rtemp[ti * TS + tj] = factor * r;
@ -3519,41 +3738,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                    }
                }

-                for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                    for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-
-                        float r = rtemp[ti * TS + tj];
-                        float g = gtemp[ti * TS + tj];
-                        float b = btemp[ti * TS + tj];
-
-                        //TODO: proper treatment of out-of-gamut colors
-                        //float tonefactor = hltonecurve[(0.299f*r+0.587f*g+0.114f*b)];
-                        float tonefactor = ((r < MAXVALF ? hltonecurve[r] : CurveFactory::hlcurve (exp_scale, comp, hlrange, r) ) +
-                                            (g < MAXVALF ? hltonecurve[g] : CurveFactory::hlcurve (exp_scale, comp, hlrange, g) ) +
-                                            (b < MAXVALF ? hltonecurve[b] : CurveFactory::hlcurve (exp_scale, comp, hlrange, b) ) ) / 3.0;
-
-                        // note: tonefactor includes exposure scaling, that is here exposure slider and highlight compression takes place
-                        rtemp[ti * TS + tj] = r * tonefactor;
-                        gtemp[ti * TS + tj] = g * tonefactor;
-                        btemp[ti * TS + tj] = b * tonefactor;
-                    }
-                }
-
-                for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                    for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-
-                        float r = rtemp[ti * TS + tj];
-                        float g = gtemp[ti * TS + tj];
-                        float b = btemp[ti * TS + tj];
-
-                        //shadow tone curve
-                        float Y = (0.299f * r + 0.587f * g + 0.114f * b);
-                        float tonefactor = shtonecurve[Y];
-                        rtemp[ti * TS + tj] = rtemp[ti * TS + tj] * tonefactor;
-                        gtemp[ti * TS + tj] = gtemp[ti * TS + tj] * tonefactor;
-                        btemp[ti * TS + tj] = btemp[ti * TS + tj] * tonefactor;
-                    }
-                }
+                highlightToneCurve(hltonecurve, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS, exp_scale, comp, hlrange);
+                shadowToneCurve(shtonecurve, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS);

                if (dcpProf) {
                    dcpProf->step2ApplyTile (rtemp, gtemp, btemp, tW - jstart, tH - istart, TS, asIn);
@ -3561,22 +3747,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer

                for (int i = istart, ti = 0; i < tH; i++, ti++) {
                    for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                        float r = rtemp[ti * TS + tj];
-                        float g = gtemp[ti * TS + tj];
-                        float b = btemp[ti * TS + tj];
-
-                        // clip out of gamut colors, without distorting color too bad
-                        if (r < 0) {
-                            r = 0;
-                        }
-
-                        if (g < 0) {
-                            g = 0;
-                        }
-
-                        if (b < 0) {
-                            b = 0;
-                        }
+                        // clip out of gamut colors, without distorting colour too bad
+                        float r = std::max(rtemp[ti * TS + tj], 0.f);
+                        float g = std::max(gtemp[ti * TS + tj], 0.f);
+                        float b = std::max(btemp[ti * TS + tj], 0.f);

                        if (r > 65535 || g > 65535 || b > 65535) {
                            filmlike_clip (&r, &g, &b);
@ -3588,149 +3762,53 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                    }
                }

-                for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                    for (int j = jstart, tj = 0; j < tW; j++, tj++) {
+                if (histToneCurveThr) {
+                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
+                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {

-                        //brightness/contrast
-                        rtemp[ti * TS + tj] = tonecurve[ rtemp[ti * TS + tj] ];
-                        gtemp[ti * TS + tj] = tonecurve[ gtemp[ti * TS + tj] ];
-                        btemp[ti * TS + tj] = tonecurve[ btemp[ti * TS + tj] ];
+                            //brightness/contrast
+                            rtemp[ti * TS + tj] = tonecurve[ rtemp[ti * TS + tj] ];
+                            gtemp[ti * TS + tj] = tonecurve[ gtemp[ti * TS + tj] ];
+                            btemp[ti * TS + tj] = tonecurve[ btemp[ti * TS + tj] ];

-                        if (histToneCurveThr) {
                            int y = CLIP<int> (lumimulf[0] * Color::gamma2curve[rtemp[ti * TS + tj]] + lumimulf[1] * Color::gamma2curve[gtemp[ti * TS + tj]] + lumimulf[2] * Color::gamma2curve[btemp[ti * TS + tj]]);
                            histToneCurveThr[y >> histToneCurveCompression]++;
                        }
                    }
+                } else {
+                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
+                        int j = jstart, tj = 0;
+#ifdef __SSE2__
+                        for (; j < tW - 3; j+=4, tj+=4) {
+                            //brightness/contrast
+                            STVF(rtemp[ti * TS + tj], tonecurve(LVF(rtemp[ti * TS + tj])));
+                            STVF(gtemp[ti * TS + tj], tonecurve(LVF(gtemp[ti * TS + tj])));
+                            STVF(btemp[ti * TS + tj], tonecurve(LVF(btemp[ti * TS + tj])));
+                        }
+#endif
+                        for (; j < tW; j++, tj++) {
+                            //brightness/contrast
+                            rtemp[ti * TS + tj] = tonecurve[rtemp[ti * TS + tj]];
+                            gtemp[ti * TS + tj] = tonecurve[gtemp[ti * TS + tj]];
+                            btemp[ti * TS + tj] = tonecurve[btemp[ti * TS + tj]];
+                        }
+                    }
                }

                if (editID == EUID_ToneCurve1) {  // filling the pipette buffer
-                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            editIFloatTmpR[ti * TS + tj] = Color::gamma2curve[rtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpG[ti * TS + tj] = Color::gamma2curve[gtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpB[ti * TS + tj] = Color::gamma2curve[btemp[ti * TS + tj]] / 65535.f;
-                        }
-                    }
+                    fillEditFloat(editIFloatTmpR, editIFloatTmpG, editIFloatTmpB, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS);
                }

                if (hasToneCurve1) {
-                    if (curveMode == ToneCurveParams::TcMode::STD) { // Standard
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
-                            userToneCurve.BatchApply (
-                                    0, tW - jstart,
-                                    &rtemp[ti * TS], &gtemp[ti * TS], &btemp[ti * TS]);
-                        }
-                    } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const AdobeToneCurve& userToneCurve = static_cast<const AdobeToneCurve&> (customToneCurve1);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode == ToneCurveParams::TcMode::SATANDVALBLENDING) { // apply the curve on the saturation and value channels
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const SatAndValueBlendingToneCurve& userToneCurve = static_cast<const SatAndValueBlendingToneCurve&> (customToneCurve1);
-                                rtemp[ti * TS + tj] = CLIP<float> (rtemp[ti * TS + tj]);
-                                gtemp[ti * TS + tj] = CLIP<float> (gtemp[ti * TS + tj]);
-                                btemp[ti * TS + tj] = CLIP<float> (btemp[ti * TS + tj]);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode == ToneCurveParams::TcMode::WEIGHTEDSTD) { // apply the curve to the rgb channels, weighted
-                        const WeightedStdToneCurve& userToneCurve = static_cast<const WeightedStdToneCurve&> (customToneCurve1);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                rtemp[ti * TS + tj] = CLIP<float> (rtemp[ti * TS + tj]);
-                                gtemp[ti * TS + tj] = CLIP<float> (gtemp[ti * TS + tj]);
-                                btemp[ti * TS + tj] = CLIP<float> (btemp[ti * TS + tj]);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode == ToneCurveParams::TcMode::LUMINANCE) { // apply the curve to the luminance channel
-                        const LuminanceToneCurve& userToneCurve = static_cast<const LuminanceToneCurve&> (customToneCurve1);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                rtemp[ti * TS + tj] = CLIP<float> (rtemp[ti * TS + tj]);
-                                gtemp[ti * TS + tj] = CLIP<float> (gtemp[ti * TS + tj]);
-                                btemp[ti * TS + tj] = CLIP<float> (btemp[ti * TS + tj]);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode == ToneCurveParams::TcMode::PERCEPTUAL) { // apply curve while keeping color appearance constant
-                        const PerceptualToneCurve& userToneCurve = static_cast<const PerceptualToneCurve&> (customToneCurve1);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                rtemp[ti * TS + tj] = CLIP<float> (rtemp[ti * TS + tj]);
-                                gtemp[ti * TS + tj] = CLIP<float> (gtemp[ti * TS + tj]);
-                                btemp[ti * TS + tj] = CLIP<float> (btemp[ti * TS + tj]);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj], ptc1ApplyState);
-                            }
-                        }
-                    }
+                    customToneCurve(customToneCurve1, curveMode, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS, ptc1ApplyState);
                }

                if (editID == EUID_ToneCurve2) {  // filling the pipette buffer
-                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            editIFloatTmpR[ti * TS + tj] = Color::gamma2curve[rtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpG[ti * TS + tj] = Color::gamma2curve[gtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpB[ti * TS + tj] = Color::gamma2curve[btemp[ti * TS + tj]] / 65535.f;
-                        }
-                    }
+                    fillEditFloat(editIFloatTmpR, editIFloatTmpG, editIFloatTmpB, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS);
                }

                if (hasToneCurve2) {
-                    if (curveMode2 == ToneCurveParams::TcMode::STD) { // Standard
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve2);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode2 == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const AdobeToneCurve& userToneCurve = static_cast<const AdobeToneCurve&> (customToneCurve2);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode2 == ToneCurveParams::TcMode::SATANDVALBLENDING) { // apply the curve on the saturation and value channels
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                const SatAndValueBlendingToneCurve& userToneCurve = static_cast<const SatAndValueBlendingToneCurve&> (customToneCurve2);
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode2 == ToneCurveParams::TcMode::WEIGHTEDSTD) { // apply the curve to the rgb channels, weighted
-                        const WeightedStdToneCurve& userToneCurve = static_cast<const WeightedStdToneCurve&> (customToneCurve2);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode2 == ToneCurveParams::TcMode::LUMINANCE) { // apply the curve to the luminance channel
-                        const LuminanceToneCurve& userToneCurve = static_cast<const LuminanceToneCurve&> (customToneCurve2);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    } else if (curveMode2 == ToneCurveParams::TcMode::PERCEPTUAL) { // apply curve while keeping color appearance constant
-                        const PerceptualToneCurve& userToneCurve = static_cast<const PerceptualToneCurve&> (customToneCurve2);
-
-                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                                userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj], ptc2ApplyState);
-                            }
-                        }
-                    }
+                    customToneCurve(customToneCurve2, curveMode2, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS, ptc2ApplyState);
                }

                if (editID == EUID_RGB_R) {
@ -3942,20 +4020,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                }

                if (isProPhoto) { // this is a hack to avoid the blue=>black bug (Issue 2141)
-                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            float r = rtemp[ti * TS + tj];
-                            float g = gtemp[ti * TS + tj];
-
-                            if (r == 0.0f || g == 0.0f) {
-                                float b = btemp[ti * TS + tj];
-                                float h, s, v;
-                                Color::rgb2hsv (r, g, b, h, s, v);
-                                s *= 0.99f;
-                                Color::hsv2rgb (h, s, v, rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
-                            }
-                        }
-                    }
+                    proPhotoBlue(rtemp, gtemp, btemp, istart, tH, jstart, tW, TS);
                }

                if (hasColorToning && !blackwhite) {
@ -4159,13 +4224,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer

                // filling the pipette buffer
                if (editID == EUID_BlackWhiteBeforeCurve) {
-                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-                            editIFloatTmpR[ti * TS + tj] = Color::gamma2curve[rtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpG[ti * TS + tj] = Color::gamma2curve[gtemp[ti * TS + tj]] / 65535.f;
-                            editIFloatTmpB[ti * TS + tj] = Color::gamma2curve[btemp[ti * TS + tj]] / 65535.f;
-                        }
-                    }
+                    fillEditFloat(editIFloatTmpR, editIFloatTmpG, editIFloatTmpB, rtemp, gtemp, btemp, istart, tH, jstart, tW, TS);
                } else if (editID == EUID_BlackWhiteLuminance) {
                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
@ -4458,53 +4517,24 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer


                if (!blackwhite) {
+                    if (editImgFloat || editWhatever) {
+                        for (int i = istart, ti = 0; i < tH; i++, ti++) {
+                            for (int j = jstart, tj = 0; j < tW; j++, tj++) {
+
+                                // filling the pipette buffer by the content of the temp pipette buffers
+                                if (editImgFloat) {
+                                    editImgFloat->r (i, j) = editIFloatTmpR[ti * TS + tj];
+                                    editImgFloat->g (i, j) = editIFloatTmpG[ti * TS + tj];
+                                    editImgFloat->b (i, j) = editIFloatTmpB[ti * TS + tj];
+                                } else if (editWhatever) {
+                                    editWhatever->v (i, j) = editWhateverTmp[ti * TS + tj];
+                                }
+                            }
+                        }
+                    }
                    // ready, fill lab
                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
-                        for (int j = jstart, tj = 0; j < tW; j++, tj++) {
-
-                            // filling the pipette buffer by the content of the temp pipette buffers
-                            if (editImgFloat) {
-                                editImgFloat->r (i, j) = editIFloatTmpR[ti * TS + tj];
-                                editImgFloat->g (i, j) = editIFloatTmpG[ti * TS + tj];
-                                editImgFloat->b (i, j) = editIFloatTmpB[ti * TS + tj];
-                            } else if (editWhatever) {
-                                editWhatever->v (i, j) = editWhateverTmp[ti * TS + tj];
-                            }
-
-                            float r = rtemp[ti * TS + tj];
-                            float g = gtemp[ti * TS + tj];
-                            float b = btemp[ti * TS + tj];
-
-                            float x = toxyz[0][0] * r + toxyz[0][1] * g + toxyz[0][2] * b;
-                            float y = toxyz[1][0] * r + toxyz[1][1] * g + toxyz[1][2] * b;
-                            float z = toxyz[2][0] * r + toxyz[2][1] * g + toxyz[2][2] * b;
-
-                            float fx, fy, fz;
-
-                            fx = (x < 65535.0f ? Color::cachef[x] : 327.68f * std::cbrt (x / MAXVALF));
-                            fy = (y < 65535.0f ? Color::cachef[y] : 327.68f * std::cbrt (y / MAXVALF));
-                            fz = (z < 65535.0f ? Color::cachef[z] : 327.68f * std::cbrt (z / MAXVALF));
-
-                            lab->L[i][j] = (116.0f *  fy - 5242.88f); //5242.88=16.0*327.68;
-                            lab->a[i][j] = (500.0f * (fx - fy) );
-                            lab->b[i][j] = (200.0f * (fy - fz) );
-
-                            //test for color accuracy
-                            /*
-                            float fy = (0.00862069 * lab->L[i][j])/327.68 + 0.137932; // (L+16)/116
-                            float fx = (0.002 * lab->a[i][j])/327.68 + fy;
-                            float fz = fy - (0.005 * lab->b[i][j])/327.68;
-
-                            float x_ = 65535*Lab2xyz(fx)*Color::D50x;
-                            float y_ = 65535*Lab2xyz(fy);
-                            float z_ = 65535*Lab2xyz(fz)*Color::D50z;
-
-                            int R,G,B;
-                            xyz2srgb(x_,y_,z_,R,G,B);
-                            r=(float)R; g=(float)G; b=(float)B;
-                            float xxx=1;
-                            */
-                        }
+                        Color::RGB2Lab(&rtemp[ti * TS], &gtemp[ti * TS], &btemp[ti * TS], &(lab->L[i][jstart]), &(lab->a[i][jstart]), &(lab->b[i][jstart]), toxyz, tW - jstart);
                    }
                } else { // black & white
                    // Auto channel mixer needs whole image, so we now copy to tmpImage and close the tiled processing
@ -4916,25 +4946,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
 #endif

        for (int i = 0; i < tH; i++) {
-            for (int j = 0; j < tW; j++) {
-                float r = tmpImage->r (i, j);
-                float g = tmpImage->g (i, j);
-                float b = tmpImage->b (i, j);
-
-                float x = toxyz[0][0] * r + toxyz[0][1] * g + toxyz[0][2] * b;
-                float y = toxyz[1][0] * r + toxyz[1][1] * g + toxyz[1][2] * b;
-                float z = toxyz[2][0] * r + toxyz[2][1] * g + toxyz[2][2] * b;
-
-                float fx, fy, fz;
-
-                fx = (x < MAXVALF ? Color::cachef[x] : 327.68f * std::cbrt (x / MAXVALF));
-                fy = (y < MAXVALF ? Color::cachef[y] : 327.68f * std::cbrt (y / MAXVALF));
-                fz = (z < MAXVALF ? Color::cachef[z] : 327.68f * std::cbrt (z / MAXVALF));
-
-                lab->L[i][j] = 116.0f *  fy - 5242.88f; //5242.88=16.0*327.68;
-                lab->a[i][j] = 500.0f * (fx - fy);
-                lab->b[i][j] = 200.0f * (fy - fz);
-            }
+            Color::RGB2Lab(tmpImage->r(i), tmpImage->g(i), tmpImage->b(i), lab->L[i], lab->a[i], lab->b[i], toxyz, tW);
        }