From 3ccfb9b2031111ee72666b81c3579069c17e0949 Mon Sep 17 00:00:00 2001 From: gatoatigrado Date: Mon, 25 Dec 2017 00:41:43 -0500 Subject: [PATCH 1/5] Use AlignedBuffer helper class in rgbProc, use SSE in standard tone curve application. --- rtengine/alignedbuffer.h | 4 +++ rtengine/curves.h | 53 ++++++++++++++++++++++++++++++++++++++++ rtengine/improcfun.cc | 30 ++++++++++------------- 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/rtengine/alignedbuffer.h b/rtengine/alignedbuffer.h index dd9d7b278..560f0884f 100644 --- a/rtengine/alignedbuffer.h +++ b/rtengine/alignedbuffer.h @@ -21,6 +21,10 @@ #include #include +inline size_t padToAlignment(size_t size, size_t align = 16) { + return align * ((size + align - 1) / align); +} + // Aligned buffer that should be faster template class AlignedBuffer { diff --git a/rtengine/curves.h b/rtengine/curves.h index c616c94da..e443d430c 100644 --- a/rtengine/curves.h +++ b/rtengine/curves.h @@ -800,6 +800,13 @@ class StandardToneCurve : public ToneCurve { public: void Apply(float& r, float& g, float& b) const; + + // Applies the tone curve to `r`, `g`, `b` arrays, starting at `r[start]` + // and ending at `r[end]` (and respectively for `b` and `g`). Uses SSE + // and requires that `r`, `g`, and `b` pointers have the same alignment. + void BatchApply( + const size_t start, const size_t end, + float *r, float *g, float *b) const; }; class AdobeToneCurve : public ToneCurve @@ -874,6 +881,52 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const g = lutToneCurve[g]; b = lutToneCurve[b]; } +inline void StandardToneCurve::BatchApply( + const size_t start, const size_t end, + float *r, float *g, float *b) const { + assert (lutToneCurve); + + // All pointers must have the same alignment for SSE usage. In the loop body below, + // we will only check `r`, assuming that the same result would hold for `g` and `b`. + assert (reinterpret_cast(r) % 16 == reinterpret_cast(g) % 16); + assert (reinterpret_cast(g) % 16 == reinterpret_cast(b) % 16); + + size_t i = start; + while (true) { + if (i >= end) { + // If we get to the end before getting to an aligned address, just return. + // (Or, for non-SSE mode, if we get to the end.) + return; +#if defined( __SSE2__ ) && defined( __x86_64__ ) + } else if (reinterpret_cast(&r[i]) % 16 == 0) { + // Otherwise, we get to the first aligned address; go to the SSE part. + break; +#endif + } + r[i] = lutToneCurve[r[i]]; + g[i] = lutToneCurve[g[i]]; + b[i] = lutToneCurve[b[i]]; + i++; + } + +#if defined( __SSE2__ ) && defined( __x86_64__ ) + for (; i + 3 < end; i += 4) { + __m128i r_val = _mm_cvtps_epi32(LVF(r[i])); + __m128i g_val = _mm_cvtps_epi32(LVF(g[i])); + __m128i b_val = _mm_cvtps_epi32(LVF(b[i])); + STVF(r[i], lutToneCurve[r_val]); + STVF(g[i], lutToneCurve[g_val]); + STVF(b[i], lutToneCurve[b_val]); + } + + // Remainder in non-SSE. + for (; i < end; ++i) { + r[i] = lutToneCurve[r[i]]; + g[i] = lutToneCurve[g[i]]; + b[i] = lutToneCurve[b[i]]; + } +#endif +} // Tone curve according to Adobe's reference implementation // values in 0xffff space diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 1d38f6be1..082799e62 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -23,6 +23,7 @@ #include #endif +#include "alignedbuffer.h" #include "rtengine.h" #include "improcfun.h" #include "curves.h" @@ -3409,31 +3410,28 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer #pragma omp parallel if (multiThread) #endif { - char *buffer; + size_t perChannelSizeBytes = padToAlignment(sizeof (float) * TS * TS + 4 * 64); + AlignedBuffer buffer(3 * perChannelSizeBytes); char *editIFloatBuffer = nullptr; char *editWhateverBuffer = nullptr; - buffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63); - char *data; - data = (char*) ( ( uintptr_t (buffer) + uintptr_t (63)) / 64 * 64); - - float *rtemp = (float (*))data; - float *gtemp = (float (*)) ((char*)rtemp + sizeof (float) * TS * TS + 4 * 64); - float *btemp = (float (*)) ((char*)gtemp + sizeof (float) * TS * TS + 8 * 64); + float *rtemp = buffer.data; + float *gtemp = &rtemp[perChannelSizeBytes / sizeof(float)]; + float *btemp = >emp[perChannelSizeBytes / sizeof(float)]; int istart; int jstart; int tW; int tH; // zero out the buffers - memset(buffer, 0, 3 * sizeof (float) * TS * TS + 20 * 64 + 63); + memset(rtemp, 0, 3 * perChannelSizeBytes); // Allocating buffer for the PipetteBuffer float *editIFloatTmpR = nullptr, *editIFloatTmpG = nullptr, *editIFloatTmpB = nullptr, *editWhateverTmp = nullptr; if (editImgFloat) { editIFloatBuffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63); - data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64); + char *data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64); editIFloatTmpR = (float (*))data; editIFloatTmpG = (float (*)) ((char*)editIFloatTmpR + sizeof (float) * TS * TS + 4 * 64); @@ -3442,7 +3440,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer if (editWhatever) { editWhateverBuffer = (char *) malloc (sizeof (float) * TS * TS + 20 * 64 + 63); - data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64); + char *data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64); editWhateverTmp = (float (*))data; } @@ -3618,10 +3616,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer if (hasToneCurve1) { if (curveMode == ToneCurveParams::TcMode::STD) { // Standard for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { - const StandardToneCurve& userToneCurve = static_cast (customToneCurve1); - userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]); - } + const StandardToneCurve& userToneCurve = static_cast (customToneCurve1); + userToneCurve.BatchApply ( + 0, tW - jstart, + &rtemp[ti * TS], >emp[ti * TS], &btemp[ti * TS]); } } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like for (int i = istart, ti = 0; i < tH; i++, ti++) { @@ -4529,8 +4527,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - free (buffer); - if (editIFloatBuffer) { free (editIFloatBuffer); } From ebc92e1c350b028ae16a113051b8c3e46d4da0f1 Mon Sep 17 00:00:00 2001 From: gatoatigrado Date: Mon, 25 Dec 2017 14:55:14 -0500 Subject: [PATCH 2/5] New SSE interpolating routine for LUT. --- rtengine/LUT.h | 94 +++++++++++++++++------------------------------ rtengine/curves.h | 7 ++-- 2 files changed, 38 insertions(+), 63 deletions(-) diff --git a/rtengine/LUT.h b/rtengine/LUT.h index 2701c4ffc..4f245634e 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -95,6 +95,8 @@ protected: // list of variables ordered to improve cache speed unsigned int maxs; float maxsf; + // possibly-more-correct value for sse routine (see unit test for details) + float maxIndexFloat; T * data; unsigned int clip; unsigned int size; @@ -129,6 +131,7 @@ public: upperBound = size - 1; maxs = size - 2; maxsf = (float)maxs; + maxIndexFloat = ((float)upperBound) - 1e-5; #if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = F2V( maxs ); sizeiv = _mm_set1_epi32( (int)(size - 1) ); @@ -158,6 +161,7 @@ public: upperBound = size - 1; maxs = size - 2; maxsf = (float)maxs; + maxIndexFloat = ((float)upperBound) - 1e-5; #if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = F2V( maxs ); sizeiv = _mm_set1_epi32( (int)(size - 1) ); @@ -228,6 +232,7 @@ public: this->upperBound = rhs.upperBound; this->maxs = this->size - 2; this->maxsf = (float)this->maxs; + this->maxIndexFloat = ((float)this->upperBound) - 1e-5; #if defined( __SSE2__ ) && defined( __x86_64__ ) this->maxsv = F2V( this->size - 2); this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); @@ -293,72 +298,37 @@ public: } #if defined( __SSE2__ ) && defined( __x86_64__ ) -/* - vfloat operator[](vfloat indexv ) const + vfloat operator[](vfloat indexv) const { -// printf("don't use this operator. It's not ready for production"); - return _mm_setzero_ps(); + static_assert(std::is_same::value, "This method only works for float LUTs"); - // convert floats to ints - vint idxv = _mm_cvttps_epi32( indexv ); - vfloat tempv, resultv, p1v, p2v; - vmask maxmask = vmaskf_gt(indexv, maxsv); - idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv))); - vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps()); - idxv = _mm_castps_si128(vself(minmask, _mm_setzero_ps(), _mm_castsi128_ps(idxv))); - // access the LUT 4 times and shuffle the values into p1v and p2v + // Clamp and convert to integer values. Extract out of SSE register because all + // lookup operations use regular addresses. + vfloat clampedIndexes = _mm_max_ps( + _mm_setzero_ps(), + _mm_min_ps(_mm_set1_ps(maxIndexFloat), indexv)); + vint indexes = _mm_cvttps_epi32(clampedIndexes); + int indexArray[4]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); - int idx; + // Load data from the table. This reads more than necessary, but there don't seem + // to exist more granular operations (though we could try non-SSE). + // Cast to int for convenience in the next operation (partial transpose). + vint values[4]; + for (int i = 0; i < 4; ++i) { + values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); + } - // get 4th value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3))); - tempv = LVFU(data[idx]); - p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0)); - p2v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1)); - // now p1v is 3 3 3 3 - // p2v is 3 3 3 3 + // Partial 4x4 transpose operation. We want two new vectors, the first consisting + // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. + __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); + __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); + vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); + vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); - // get 3rd value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2))); - tempv = LVFU(data[idx]); - p1v = _mm_move_ss( p1v, tempv); - tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1)); - p2v = _mm_move_ss( p2v, tempv); - // now p1v is 3 3 3 2 - // p2v is 3 3 3 2 - - // get 2nd value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1))); - tempv = LVFU(data[idx]); - p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0)); - p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(1, 0, 1, 0)); - // now p1v is 3 2 3 2 - // now p2v is 3 2 3 2 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 3 1 - tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1)); - p2v = _mm_move_ss( p2v, tempv); - // now p1v is 3 2 3 1 - - // get 1st value - idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(0, 0, 0, 0))); - tempv = LVFU(data[idx]); - p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0)); - // now p1v is 3 2 1 1 - p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(3, 2, 0, 0)); - // now p2v is 3 2 1 1 - p1v = _mm_move_ss( p1v, tempv ); - // now p1v is 3 2 1 0 - tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1)); - p2v = _mm_move_ss( p2v, tempv); - // now p2v is 3 2 1 0 - - vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv ); - diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv); - resultv = p1v + p2v * diffv; - return resultv ; + vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes); + return (_mm_set1_ps(1.0f) - diff) * lower + (diff * upper); } -*/ #ifdef __SSE4_1__ template::value>::type> vfloat operator[](vint idxv ) const @@ -456,6 +426,8 @@ public: } idx = 0; + // Note: Maybe this should be 'idx > maxsf'? See unit test where a LUT with + // values [10, 11, 12, 13] gets looked up at 2.5 and returns 12.5. } else if (index > maxsf) { if (clip & LUT_CLIP_ABOVE) { return data[upperBound]; @@ -543,6 +515,7 @@ public: maxs = 0; maxsf = 0.f; clip = 0; + maxIndexFloat = ((float)upperBound) - 1e-5; } // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor) @@ -652,6 +625,7 @@ public: upperBound = size - 1; maxs = size - 2; maxsf = (float)maxs; + maxIndexFloat = ((float)upperBound) - 1e-5; #if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = F2V( size - 2); sizeiv = _mm_set1_epi32( (int)(size - 1) ); diff --git a/rtengine/curves.h b/rtengine/curves.h index e443d430c..15ab96624 100644 --- a/rtengine/curves.h +++ b/rtengine/curves.h @@ -881,6 +881,7 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const g = lutToneCurve[g]; b = lutToneCurve[b]; } + inline void StandardToneCurve::BatchApply( const size_t start, const size_t end, float *r, float *g, float *b) const { @@ -911,9 +912,9 @@ inline void StandardToneCurve::BatchApply( #if defined( __SSE2__ ) && defined( __x86_64__ ) for (; i + 3 < end; i += 4) { - __m128i r_val = _mm_cvtps_epi32(LVF(r[i])); - __m128i g_val = _mm_cvtps_epi32(LVF(g[i])); - __m128i b_val = _mm_cvtps_epi32(LVF(b[i])); + __m128 r_val = LVF(r[i]); + __m128 g_val = LVF(g[i]); + __m128 b_val = LVF(b[i]); STVF(r[i], lutToneCurve[r_val]); STVF(g[i], lutToneCurve[g_val]); STVF(b[i], lutToneCurve[b_val]); From 88ebaf618a223f3bb5ccddf15bbb252a26851046 Mon Sep 17 00:00:00 2001 From: gatoatigrado Date: Mon, 25 Dec 2017 18:13:05 -0500 Subject: [PATCH 3/5] Code review changes --- rtengine/LUT.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/rtengine/LUT.h b/rtengine/LUT.h index 4f245634e..9a700bc17 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -95,7 +95,8 @@ protected: // list of variables ordered to improve cache speed unsigned int maxs; float maxsf; - // possibly-more-correct value for sse routine (see unit test for details) + // For the SSE routine operator[](vfloat), we just clip float lookup values + // to just below the max value. float maxIndexFloat; T * data; unsigned int clip; @@ -125,7 +126,10 @@ public: #endif dirty = true; clip = flags; - data = new T[s]; + // Add a few extra elements so [](vfloat) won't access out-of-bounds memory. + // The routine would still produce the right answer, but might cause issues + // with address/heap checking programs. + data = new T[s + 3]; owner = 1; size = s; upperBound = size - 1; @@ -155,7 +159,8 @@ public: dirty = true; // Assumption! clip = flags; - data = new T[s]; + // See comment in constructor. + data = new T[s + 3]; owner = 1; size = s; upperBound = size - 1; @@ -222,7 +227,8 @@ public: } if (this->data == nullptr) { - this->data = new T[rhs.size]; + // See comment in constructor. + this->data = new T[rhs.size + 3]; } this->clip = rhs.clip; @@ -327,7 +333,7 @@ public: vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes); - return (_mm_set1_ps(1.0f) - diff) * lower + (diff * upper); + return vintpf(diff, upper, lower); } #ifdef __SSE4_1__ template::value>::type> @@ -426,9 +432,7 @@ public: } idx = 0; - // Note: Maybe this should be 'idx > maxsf'? See unit test where a LUT with - // values [10, 11, 12, 13] gets looked up at 2.5 and returns 12.5. - } else if (index > maxsf) { + } else if (idx > maxs) { if (clip & LUT_CLIP_ABOVE) { return data[upperBound]; } From 59e31ed36f179dbb3faff72181e920d892c8f513 Mon Sep 17 00:00:00 2001 From: gatoatigrado Date: Mon, 25 Dec 2017 22:46:05 -0500 Subject: [PATCH 4/5] Code review changes --- rtengine/LUT.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rtengine/LUT.h b/rtengine/LUT.h index 9a700bc17..b58144a6f 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -310,9 +310,7 @@ public: // Clamp and convert to integer values. Extract out of SSE register because all // lookup operations use regular addresses. - vfloat clampedIndexes = _mm_max_ps( - _mm_setzero_ps(), - _mm_min_ps(_mm_set1_ps(maxIndexFloat), indexv)); + vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv)); vint indexes = _mm_cvttps_epi32(clampedIndexes); int indexArray[4]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); From 6dab5742dd6541b5def11589e826c598791e2d14 Mon Sep 17 00:00:00 2001 From: gatoatigrado Date: Mon, 25 Dec 2017 22:50:40 -0500 Subject: [PATCH 5/5] Add comment and assertions --- rtengine/LUT.h | 7 +++++++ rtengine/curves.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/rtengine/LUT.h b/rtengine/LUT.h index b58144a6f..29147df8b 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -200,6 +200,10 @@ public: clip = flags; } + int getClip() const { + return clip; + } + /** @brief Get the number of element in the LUT (i.e. dimension of the array) * For a LUT(500), it will return 500 * @return number of element in the array @@ -304,6 +308,9 @@ public: } #if defined( __SSE2__ ) && defined( __x86_64__ ) + + // NOTE: This version requires LUTs which clip at upper and lower bounds + // (which is the default). vfloat operator[](vfloat indexv) const { static_assert(std::is_same::value, "This method only works for float LUTs"); diff --git a/rtengine/curves.h b/rtengine/curves.h index 15ab96624..e8b65c33d 100644 --- a/rtengine/curves.h +++ b/rtengine/curves.h @@ -886,6 +886,8 @@ inline void StandardToneCurve::BatchApply( const size_t start, const size_t end, float *r, float *g, float *b) const { assert (lutToneCurve); + assert (lutToneCurve.getClip() & LUT_CLIP_BELOW); + assert (lutToneCurve.getClip() & LUT_CLIP_ABOVE); // All pointers must have the same alignment for SSE usage. In the loop body below, // we will only check `r`, assuming that the same result would hold for `g` and `b`.