Merge pull request #4253 from gatoatigrado/use-sse-in-standard-tone-curve
Use AlignedBuffer helper class in rgbProc, use SSE in standard tone c…
This commit is contained in:
111
rtengine/LUT.h
111
rtengine/LUT.h
@@ -95,6 +95,9 @@ protected:
|
||||
// list of variables ordered to improve cache speed
|
||||
unsigned int maxs;
|
||||
float maxsf;
|
||||
// For the SSE routine operator[](vfloat), we just clip float lookup values
|
||||
// to just below the max value.
|
||||
float maxIndexFloat;
|
||||
T * data;
|
||||
unsigned int clip;
|
||||
unsigned int size;
|
||||
@@ -123,12 +126,16 @@ public:
|
||||
#endif
|
||||
dirty = true;
|
||||
clip = flags;
|
||||
data = new T[s];
|
||||
// Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
|
||||
// The routine would still produce the right answer, but might cause issues
|
||||
// with address/heap checking programs.
|
||||
data = new T[s + 3];
|
||||
owner = 1;
|
||||
size = s;
|
||||
upperBound = size - 1;
|
||||
maxs = size - 2;
|
||||
maxsf = (float)maxs;
|
||||
maxIndexFloat = ((float)upperBound) - 1e-5;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
maxsv = F2V( maxs );
|
||||
sizeiv = _mm_set1_epi32( (int)(size - 1) );
|
||||
@@ -152,12 +159,14 @@ public:
|
||||
|
||||
dirty = true; // Assumption!
|
||||
clip = flags;
|
||||
data = new T[s];
|
||||
// See comment in constructor.
|
||||
data = new T[s + 3];
|
||||
owner = 1;
|
||||
size = s;
|
||||
upperBound = size - 1;
|
||||
maxs = size - 2;
|
||||
maxsf = (float)maxs;
|
||||
maxIndexFloat = ((float)upperBound) - 1e-5;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
maxsv = F2V( maxs );
|
||||
sizeiv = _mm_set1_epi32( (int)(size - 1) );
|
||||
@@ -191,6 +200,10 @@ public:
|
||||
clip = flags;
|
||||
}
|
||||
|
||||
int getClip() const {
|
||||
return clip;
|
||||
}
|
||||
|
||||
/** @brief Get the number of element in the LUT (i.e. dimension of the array)
|
||||
* For a LUT(500), it will return 500
|
||||
* @return number of element in the array
|
||||
@@ -218,7 +231,8 @@ public:
|
||||
}
|
||||
|
||||
if (this->data == nullptr) {
|
||||
this->data = new T[rhs.size];
|
||||
// See comment in constructor.
|
||||
this->data = new T[rhs.size + 3];
|
||||
}
|
||||
|
||||
this->clip = rhs.clip;
|
||||
@@ -228,6 +242,7 @@ public:
|
||||
this->upperBound = rhs.upperBound;
|
||||
this->maxs = this->size - 2;
|
||||
this->maxsf = (float)this->maxs;
|
||||
this->maxIndexFloat = ((float)this->upperBound) - 1e-5;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
this->maxsv = F2V( this->size - 2);
|
||||
this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) );
|
||||
@@ -293,72 +308,38 @@ public:
|
||||
}
|
||||
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
/*
|
||||
vfloat operator[](vfloat indexv ) const
|
||||
|
||||
// NOTE: This version requires LUTs which clip at upper and lower bounds
|
||||
// (which is the default).
|
||||
vfloat operator[](vfloat indexv) const
|
||||
{
|
||||
// printf("don't use this operator. It's not ready for production");
|
||||
return _mm_setzero_ps();
|
||||
static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
|
||||
|
||||
// convert floats to ints
|
||||
vint idxv = _mm_cvttps_epi32( indexv );
|
||||
vfloat tempv, resultv, p1v, p2v;
|
||||
vmask maxmask = vmaskf_gt(indexv, maxsv);
|
||||
idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
|
||||
vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
|
||||
idxv = _mm_castps_si128(vself(minmask, _mm_setzero_ps(), _mm_castsi128_ps(idxv)));
|
||||
// access the LUT 4 times and shuffle the values into p1v and p2v
|
||||
// Clamp and convert to integer values. Extract out of SSE register because all
|
||||
// lookup operations use regular addresses.
|
||||
vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv));
|
||||
vint indexes = _mm_cvttps_epi32(clampedIndexes);
|
||||
int indexArray[4];
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
|
||||
|
||||
int idx;
|
||||
// Load data from the table. This reads more than necessary, but there don't seem
|
||||
// to exist more granular operations (though we could try non-SSE).
|
||||
// Cast to int for convenience in the next operation (partial transpose).
|
||||
vint values[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
|
||||
}
|
||||
|
||||
// get 4th value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
tempv = LVFU(data[idx]);
|
||||
p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
p2v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
// now p1v is 3 3 3 3
|
||||
// p2v is 3 3 3 3
|
||||
// Partial 4x4 transpose operation. We want two new vectors, the first consisting
|
||||
// of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
|
||||
__m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
|
||||
__m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
|
||||
vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
|
||||
vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
|
||||
|
||||
// get 3rd value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
|
||||
tempv = LVFU(data[idx]);
|
||||
p1v = _mm_move_ss( p1v, tempv);
|
||||
tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
p2v = _mm_move_ss( p2v, tempv);
|
||||
// now p1v is 3 3 3 2
|
||||
// p2v is 3 3 3 2
|
||||
|
||||
// get 2nd value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
tempv = LVFU(data[idx]);
|
||||
p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
// now p1v is 3 2 3 2
|
||||
// now p2v is 3 2 3 2
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 3 1
|
||||
tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
p2v = _mm_move_ss( p2v, tempv);
|
||||
// now p1v is 3 2 3 1
|
||||
|
||||
// get 1st value
|
||||
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(0, 0, 0, 0)));
|
||||
tempv = LVFU(data[idx]);
|
||||
p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0));
|
||||
// now p1v is 3 2 1 1
|
||||
p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(3, 2, 0, 0));
|
||||
// now p2v is 3 2 1 1
|
||||
p1v = _mm_move_ss( p1v, tempv );
|
||||
// now p1v is 3 2 1 0
|
||||
tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
p2v = _mm_move_ss( p2v, tempv);
|
||||
// now p2v is 3 2 1 0
|
||||
|
||||
vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
|
||||
diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
|
||||
resultv = p1v + p2v * diffv;
|
||||
return resultv ;
|
||||
vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
|
||||
return vintpf(diff, upper, lower);
|
||||
}
|
||||
*/
|
||||
#ifdef __SSE4_1__
|
||||
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
|
||||
vfloat operator[](vint idxv ) const
|
||||
@@ -456,7 +437,7 @@ public:
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
} else if (index > maxsf) {
|
||||
} else if (idx > maxs) {
|
||||
if (clip & LUT_CLIP_ABOVE) {
|
||||
return data[upperBound];
|
||||
}
|
||||
@@ -543,6 +524,7 @@ public:
|
||||
maxs = 0;
|
||||
maxsf = 0.f;
|
||||
clip = 0;
|
||||
maxIndexFloat = ((float)upperBound) - 1e-5;
|
||||
}
|
||||
|
||||
// create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
|
||||
@@ -652,6 +634,7 @@ public:
|
||||
upperBound = size - 1;
|
||||
maxs = size - 2;
|
||||
maxsf = (float)maxs;
|
||||
maxIndexFloat = ((float)upperBound) - 1e-5;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
maxsv = F2V( size - 2);
|
||||
sizeiv = _mm_set1_epi32( (int)(size - 1) );
|
||||
|
||||
@@ -21,6 +21,10 @@
|
||||
#include <cstdlib>
|
||||
#include <utility>
|
||||
|
||||
// Rounds `size` up to the nearest multiple of `align` (default 16).
//
// @param size   value to round up (e.g. a byte count)
// @param align  required multiple; 0 means "no alignment" and returns `size`
//               unchanged instead of dividing by zero
// @return the smallest multiple of `align` that is >= `size`
//
// The remainder form avoids the intermediate `size + align - 1`, which can
// wrap around even when the rounded result itself is representable.
inline size_t padToAlignment(size_t size, size_t align = 16) {
    if (align == 0) {
        return size;
    }

    const size_t remainder = size % align;
    return remainder == 0 ? size : size + (align - remainder);
}
|
||||
|
||||
// Aligned buffer that should be faster
|
||||
template <class T> class AlignedBuffer
|
||||
{
|
||||
|
||||
@@ -800,6 +800,13 @@ class StandardToneCurve : public ToneCurve
|
||||
{
|
||||
public:
|
||||
void Apply(float& r, float& g, float& b) const;
|
||||
|
||||
// Applies the tone curve to `r`, `g`, `b` arrays, starting at `r[start]`
|
||||
// and ending just before `r[end]` (and respectively for `g` and `b`). Uses SSE
|
||||
// and requires that `r`, `g`, and `b` pointers have the same alignment.
|
||||
void BatchApply(
|
||||
const size_t start, const size_t end,
|
||||
float *r, float *g, float *b) const;
|
||||
};
|
||||
|
||||
class AdobeToneCurve : public ToneCurve
|
||||
@@ -875,6 +882,55 @@ inline void StandardToneCurve::Apply (float& r, float& g, float& b) const
|
||||
b = lutToneCurve[b];
|
||||
}
|
||||
|
||||
inline void StandardToneCurve::BatchApply(
|
||||
const size_t start, const size_t end,
|
||||
float *r, float *g, float *b) const {
|
||||
assert (lutToneCurve);
|
||||
assert (lutToneCurve.getClip() & LUT_CLIP_BELOW);
|
||||
assert (lutToneCurve.getClip() & LUT_CLIP_ABOVE);
|
||||
|
||||
// All pointers must have the same alignment for SSE usage. In the loop body below,
|
||||
// we will only check `r`, assuming that the same result would hold for `g` and `b`.
|
||||
assert (reinterpret_cast<uintptr_t>(r) % 16 == reinterpret_cast<uintptr_t>(g) % 16);
|
||||
assert (reinterpret_cast<uintptr_t>(g) % 16 == reinterpret_cast<uintptr_t>(b) % 16);
|
||||
|
||||
size_t i = start;
|
||||
while (true) {
|
||||
if (i >= end) {
|
||||
// If we get to the end before getting to an aligned address, just return.
|
||||
// (Or, for non-SSE mode, if we get to the end.)
|
||||
return;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
} else if (reinterpret_cast<uintptr_t>(&r[i]) % 16 == 0) {
|
||||
// Otherwise, we get to the first aligned address; go to the SSE part.
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
r[i] = lutToneCurve[r[i]];
|
||||
g[i] = lutToneCurve[g[i]];
|
||||
b[i] = lutToneCurve[b[i]];
|
||||
i++;
|
||||
}
|
||||
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
for (; i + 3 < end; i += 4) {
|
||||
__m128 r_val = LVF(r[i]);
|
||||
__m128 g_val = LVF(g[i]);
|
||||
__m128 b_val = LVF(b[i]);
|
||||
STVF(r[i], lutToneCurve[r_val]);
|
||||
STVF(g[i], lutToneCurve[g_val]);
|
||||
STVF(b[i], lutToneCurve[b_val]);
|
||||
}
|
||||
|
||||
// Remainder in non-SSE.
|
||||
for (; i < end; ++i) {
|
||||
r[i] = lutToneCurve[r[i]];
|
||||
g[i] = lutToneCurve[g[i]];
|
||||
b[i] = lutToneCurve[b[i]];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Tone curve according to Adobe's reference implementation
|
||||
// values in 0xffff space
|
||||
// inlined to make sure there will be no cache flush when used
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "alignedbuffer.h"
|
||||
#include "rtengine.h"
|
||||
#include "improcfun.h"
|
||||
#include "curves.h"
|
||||
@@ -3409,31 +3410,28 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
|
||||
#pragma omp parallel if (multiThread)
|
||||
#endif
|
||||
{
|
||||
char *buffer;
|
||||
size_t perChannelSizeBytes = padToAlignment(sizeof (float) * TS * TS + 4 * 64);
|
||||
AlignedBuffer<float> buffer(3 * perChannelSizeBytes);
|
||||
char *editIFloatBuffer = nullptr;
|
||||
char *editWhateverBuffer = nullptr;
|
||||
|
||||
buffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
|
||||
char *data;
|
||||
data = (char*) ( ( uintptr_t (buffer) + uintptr_t (63)) / 64 * 64);
|
||||
|
||||
float *rtemp = (float (*))data;
|
||||
float *gtemp = (float (*)) ((char*)rtemp + sizeof (float) * TS * TS + 4 * 64);
|
||||
float *btemp = (float (*)) ((char*)gtemp + sizeof (float) * TS * TS + 8 * 64);
|
||||
float *rtemp = buffer.data;
|
||||
float *gtemp = &rtemp[perChannelSizeBytes / sizeof(float)];
|
||||
float *btemp = &gtemp[perChannelSizeBytes / sizeof(float)];
|
||||
int istart;
|
||||
int jstart;
|
||||
int tW;
|
||||
int tH;
|
||||
|
||||
// zero out the buffers
|
||||
memset(buffer, 0, 3 * sizeof (float) * TS * TS + 20 * 64 + 63);
|
||||
memset(rtemp, 0, 3 * perChannelSizeBytes);
|
||||
|
||||
// Allocating buffer for the PipetteBuffer
|
||||
float *editIFloatTmpR = nullptr, *editIFloatTmpG = nullptr, *editIFloatTmpB = nullptr, *editWhateverTmp = nullptr;
|
||||
|
||||
if (editImgFloat) {
|
||||
editIFloatBuffer = (char *) malloc (3 * sizeof (float) * TS * TS + 20 * 64 + 63);
|
||||
data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
|
||||
char *data = (char*) ( ( uintptr_t (editIFloatBuffer) + uintptr_t (63)) / 64 * 64);
|
||||
|
||||
editIFloatTmpR = (float (*))data;
|
||||
editIFloatTmpG = (float (*)) ((char*)editIFloatTmpR + sizeof (float) * TS * TS + 4 * 64);
|
||||
@@ -3442,7 +3440,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
|
||||
|
||||
if (editWhatever) {
|
||||
editWhateverBuffer = (char *) malloc (sizeof (float) * TS * TS + 20 * 64 + 63);
|
||||
data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
|
||||
char *data = (char*) ( ( uintptr_t (editWhateverBuffer) + uintptr_t (63)) / 64 * 64);
|
||||
|
||||
editWhateverTmp = (float (*))data;
|
||||
}
|
||||
@@ -3618,10 +3616,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
|
||||
if (hasToneCurve1) {
|
||||
if (curveMode == ToneCurveParams::TcMode::STD) { // Standard
|
||||
for (int i = istart, ti = 0; i < tH; i++, ti++) {
|
||||
for (int j = jstart, tj = 0; j < tW; j++, tj++) {
|
||||
const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
|
||||
userToneCurve.Apply (rtemp[ti * TS + tj], gtemp[ti * TS + tj], btemp[ti * TS + tj]);
|
||||
}
|
||||
const StandardToneCurve& userToneCurve = static_cast<const StandardToneCurve&> (customToneCurve1);
|
||||
userToneCurve.BatchApply (
|
||||
0, tW - jstart,
|
||||
&rtemp[ti * TS], &gtemp[ti * TS], &btemp[ti * TS]);
|
||||
}
|
||||
} else if (curveMode == ToneCurveParams::TcMode::FILMLIKE) { // Adobe like
|
||||
for (int i = istart, ti = 0; i < tH; i++, ti++) {
|
||||
@@ -4529,8 +4527,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
|
||||
}
|
||||
}
|
||||
|
||||
free (buffer);
|
||||
|
||||
if (editIFloatBuffer) {
|
||||
free (editIFloatBuffer);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user