diff --git a/rtengine/alignedbuffer.h b/rtengine/alignedbuffer.h index dd9d7b278..560f0884f 100644 --- a/rtengine/alignedbuffer.h +++ b/rtengine/alignedbuffer.h @@ -21,6 +21,10 @@ #include #include +inline size_t padToAlignment(size_t size, size_t align = 16) { + return align * ((size + align - 1) / align); +} + // Aligned buffer that should be faster template class AlignedBuffer { diff --git a/rtengine/curves.h b/rtengine/curves.h index c616c94da..e443d430c 100644 --- a/rtengine/curves.h +++ b/rtengine/curves.h @@ -800,6 +800,13 @@ class StandardToneCurve : public ToneCurve { public: void Apply(float& r, float& g, float& b) const; + + // Applies the tone curve to `r`, `g`, `b` arrays, starting at `r[start]` + // and ending at `r[end]` (and respectively for `b` and `g`). Uses SSE + // and requires that `r`, `g`, and `b` pointers have the same alignment. + void BatchApply( + const size_t start, const size_t end, + float *r, float *g, float *b) const; }; class AdobeToneCurve : public ToneCurve @@ -874,6 +881,52 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const g = lutToneCurve[g]; b = lutToneCurve[b]; } +inline void StandardToneCurve::BatchApply( + const size_t start, const size_t end, + float *r, float *g, float *b) const { + assert (lutToneCurve); + + // All pointers must have the same alignment for SSE usage. In the loop body below, + // we will only check `r`, assuming that the same result would hold for `g` and `b`. + assert (reinterpret_cast(r) % 16 == reinterpret_cast(g) % 16); + assert (reinterpret_cast(g) % 16 == reinterpret_cast(b) % 16); + + size_t i = start; + while (true) { + if (i >= end) { + // If we get to the end before getting to an aligned address, just return. + // (Or, for non-SSE mode, if we get to the end.) + return; +#if defined( __SSE2__ ) && defined( __x86_64__ ) + } else if (reinterpret_cast(&r[i]) % 16 == 0) { + // Otherwise, we get to the first aligned address; go to the SSE part. + break; +#endif + } + r[i] = lutToneCurve[r[i]]; + g[i] = lutToneCurve[g[i]]; + b[i] = lutToneCurve[b[i]]; + i++; + } + +#if defined( __SSE2__ ) && defined( __x86_64__ ) + for (; i + 3 < end; i += 4) { + __m128i r_val = _mm_cvtps_epi32(LVF(r[i])); + __m128i g_val = _mm_cvtps_epi32(LVF(g[i])); + __m128i b_val = _mm_cvtps_epi32(LVF(b[i])); + STVF(r[i], lutToneCurve[r_val]); + STVF(g[i], lutToneCurve[g_val]); + STVF(b[i], lutToneCurve[b_val]); + } + + // Remainder in non-SSE. + for (; i < end; ++i) { + r[i] = lutToneCurve[r[i]]; + g[i] = lutToneCurve[g[i]]; + b[i] = lutToneCurve[b[i]]; + } +#endif +} // Tone curve according to Adobe's reference implementation // values in 0xffff space diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 1d38f6be1..082799e62 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -23,6 +23,7 @@ #include #endif +#include "alignedbuffer.h" #include "rtengine.h" #include "improcfun.h" #include "curves.h" @@ -3409,31 +3410,28 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer #pragma omp parallel if (multiThread) #endif { - char *buffer; + size_t perChannelSizeBytes = padToAlignment(sizeof (float) * TS * TS + 4 * 64); + AlignedBuffer buffer(3 * perChannelSizeBytes); char *editIFloatBuffer = nullptr; char *editWhateverBuffer = nullptr; - buffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63); - char *data; - data = (char*) ( ( uintptr_t (buffer) + uintptr_t (63)) / 64 * 64); - - float *rtemp = (float (*))data; - float *gtemp = (float (*)) ((char*)rtemp + sizeof (float) * TS * TS + 4 * 64); - float *btemp = (float (*)) ((char*)gtemp + sizeof (float) * TS * TS + 8 * 64); + float *rtemp = buffer.data; + float *gtemp = &rtemp[perChannelSizeBytes / sizeof(float)]; + float *btemp = >emp[perChannelSizeBytes / sizeof(float)]; int istart; int jstart; int tW; int tH; // zero out the buffers - memset(buffer, 0, 3 * sizeof (float) * TS * TS + 20 * 64 + 63); + memset(rtemp, 0, 3 * perChannelSizeBytes); // Allocating buffer for the PipetteBuffer float *editIFloatTmpR = nullptr, *editIFloatTmpG = nullptr, *editIFloatTmpB = nullptr, *editWhateverTmp = nullptr; if (editImgFloat) { editIFloatBuffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63); - data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64); + char *data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64); editIFloatTmpR = (float (*))data; editIFloatTmpG = (float (*)) ((char*)editIFloatTmpR + sizeof (float) * TS * TS + 4 * 64); @@ -3442,7 +3440,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer if (editWhatever) { editWhateverBuffer = (char *) malloc (sizeof (float) * TS * TS + 20 * 64 + 63); - data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64); + char *data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64); editWhateverTmp = (float (*))data; } @@ -3618,10 +3616,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer if (hasToneCurve1) { if (curveMode == ToneCurveParams::TcMode::STD) { // Standard for (int i = istart, ti = 0; i < tH; i++, ti++) { - for (int j = jstart, tj = 0; j < tW; j++, tj++) { - const StandardToneCurve& userToneCurve = static_cast (customToneCurve1); - userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]); - } + const StandardToneCurve& userToneCurve = static_cast (customToneCurve1); + userToneCurve.BatchApply ( + 0, tW - jstart, + &rtemp[ti * TS], >emp[ti * TS], &btemp[ti * TS]); } } else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like for (int i = istart, ti = 0; i < tH; i++, ti++) { @@ -4529,8 +4527,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - free (buffer); - if (editIFloatBuffer) { free (editIFloatBuffer); }