Merge branch 'master' into gtk3

2016-05-06 18:51:21 +02:00 · 2016-05-06 18:51:21 +02:00 · 0074a5d429
commit 0074a5d429
parent ab0783e6fe 31b2589b9b
2 changed files with 34 additions and 24 deletions
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -56,7 +56,7 @@ bool loadFile(
            img_src.convertColorSpace(img_float.get(), icm, curr_wb);
        }

-        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 4);
+        AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValue

        std::size_t index = 0;

@@ -81,9 +81,9 @@ bool loadFile(
 vfloat getClutValue(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
 {
 #ifdef __SSE4_1__
-    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(*reinterpret_cast<const __m128i*>(clut_image.data + index)));
+    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index))));
 #else
-    return _mm_cvtpu16_ps(*reinterpret_cast<const __m64*>(clut_image.data + index));
+    return _mm_set_ps(clut_image.data[index + 3], clut_image.data[index + 2], clut_image.data[index + 1], clut_image.data[index]);
 #endif
 }
 #endif
@@ -205,8 +205,8 @@ void rtengine::HaldCLUT::getRGB(
        out_rgbx[2] = intp<float>(strength, out_rgbx[2], *b);
 #else
        const vfloat v_in = _mm_set_ps(0.0f, *b, *g, *r);
-        const vfloat v_tmp = v_in * _mm_load_ps1(&flevel_minus_one);
-        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp)));
+        const vfloat v_tmp = v_in * F2V(flevel_minus_one);
+        const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(F2V(flevel_minus_two), v_tmp)));

        size_t index = color * 4;

--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3209,10 +3209,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
    bool clutAndWorkingProfilesAreSame = false;
    TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work;
 #ifdef __SSE2__
-    vfloat v_work2xyz[3][3];
-    vfloat v_xyz2clut[3][3];
-    vfloat v_clut2xyz[3][3];
-    vfloat v_xyz2work[3][3];
+    vfloat v_work2xyz[3][3] ALIGNED16;
+    vfloat v_xyz2clut[3][3] ALIGNED16;
+    vfloat v_clut2xyz[3][3] ALIGNED16;
+    vfloat v_xyz2work[3][3] ALIGNED16;
 #endif

    if ( params->filmSimulation.enabled && !params->filmSimulation.clutFilename.empty() ) {
@@ -3227,6 +3227,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working );
                clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() );
 #ifdef __SSE2__
+
                for (int i = 0; i < 3; ++i) {
                    for (int j = 0; j < 3; ++j) {
                        v_work2xyz[i][j] = F2V(work2xyz[i][j]);
@@ -3235,6 +3236,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                        v_clut2xyz[i][j] = F2V(clut2xyz[i][j]);
                    }
                }
+
 #endif
            }
        }
@@ -3452,6 +3454,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
            editWhateverTmp = (float(*))data;
        }

+        float out_rgbx[4 * TS] ALIGNED16; // Line buffer for CLUT

 #ifdef _OPENMP
        #pragma omp for schedule(dynamic) collapse(2)
@@ -4352,8 +4355,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer

                // Film Simulations
                if (hald_clut) {
-                    float out_rgbx[4 * TS] ALIGNED16;
-

                    for (int i = istart, ti = 0; i < tH; i++, ti++) {
                        if (!clutAndWorkingProfilesAreSame) {
@@ -4361,10 +4362,11 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                            int j = jstart;
                            int tj = 0;
 #ifdef __SSE2__
+
                            for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);

                                vfloat x;
                                vfloat y;
@@ -4372,11 +4374,13 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
                                Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);

-                                STVF(rtemp[ti * TS + tj], sourceR);
-                                STVF(gtemp[ti * TS + tj], sourceG);
-                                STVF(btemp[ti * TS + tj], sourceB);
+                                STVFU(rtemp[ti * TS + tj], sourceR);
+                                STVFU(gtemp[ti * TS + tj], sourceG);
+                                STVFU(btemp[ti * TS + tj], sourceB);
                            }
+
 #endif
+
                            for (; j < tW; j++, tj++) {
                                float &sourceR = rtemp[ti * TS + tj];
                                float &sourceG = gtemp[ti * TS + tj];
@@ -4425,10 +4429,11 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                            int j = jstart;
                            int tj = 0;
 #ifdef __SSE2__
+
                            for (; j < tW - 3; j += 4, tj += 4) {
-                                vfloat sourceR = LVF(rtemp[ti * TS + tj]);
-                                vfloat sourceG = LVF(gtemp[ti * TS + tj]);
-                                vfloat sourceB = LVF(btemp[ti * TS + tj]);
+                                vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
+                                vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
+                                vfloat sourceB = LVFU(btemp[ti * TS + tj]);

                                vfloat x;
                                vfloat y;
@@ -4436,11 +4441,13 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
                                Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work);

-                                STVF(rtemp[ti * TS + tj], sourceR);
-                                STVF(gtemp[ti * TS + tj], sourceG);
-                                STVF(btemp[ti * TS + tj], sourceB);
+                                STVFU(rtemp[ti * TS + tj], sourceR);
+                                STVFU(gtemp[ti * TS + tj], sourceG);
+                                STVFU(btemp[ti * TS + tj], sourceB);
                            }
+
 #endif
+
                            for (; j < tW; j++, tj++) {
                                float &sourceR = rtemp[ti * TS + tj];
                                float &sourceG = gtemp[ti * TS + tj];
@@ -7185,6 +7192,7 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
            wipv[i][j] = F2V(wiprof[i][j]);
        }
    }
+
 #endif

 #ifdef _OPENMP
@@ -7194,9 +7202,10 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
    for(int i = 0; i < H; i++) {
        int j = 0;
 #ifdef __SSE2__
+
        for(; j < W - 3; j += 4) {
            vfloat X, Y, Z;
-            vfloat R,G,B;
+            vfloat R, G, B;
            Color::Lab2XYZ(LVFU(src.L[i][j]), LVFU(src.a[i][j]), LVFU(src.b[i][j]), X, Y, Z);
            Color::xyz2rgb(X, Y, Z, R, G, B, wipv);
            STVFU(dst.r(i, j), R);
@@ -7205,6 +7214,7 @@ SSEFUNCTION void ImProcFunctions::lab2rgb(const LabImage &src, Imagefloat &dst,
        }

 #endif
+
        for(; j < W; j++) {
            float X, Y, Z;
            Color::Lab2XYZ(src.L[i][j], src.a[i][j], src.b[i][j], X, Y, Z);