Shadows/Highlights Sharp mask, speedup using SSE4 for native builds

This commit is contained in:
heckflosse 2016-02-11 00:44:26 +01:00
parent a91d2c3dba
commit 7655fce8f9
2 changed files with 76 additions and 36 deletions

View File

@ -71,9 +71,7 @@
#include <glibmm.h> #include <glibmm.h>
#include <fstream> #include <fstream>
#endif #endif
#ifdef __SSE2__ #include "opthelper.h"
#include "sleefsseavx.c"
#endif
#include <assert.h> #include <assert.h>
#include "rt_math.h" #include "rt_math.h"
@ -91,10 +89,9 @@ protected:
private: private:
unsigned int owner; unsigned int owner;
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
__m128 maxsv __attribute__ ((aligned (16))); vfloat maxsv __attribute__ ((aligned (16)));
__m128 sizev __attribute__ ((aligned (16))); vfloat sizev __attribute__ ((aligned (16)));
__m128i maxsiv __attribute__ ((aligned (16))); vint sizeiv __attribute__ ((aligned (16)));
__m128i sizeiv __attribute__ ((aligned (16)));
#endif #endif
public: public:
/// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed... /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
@ -120,10 +117,9 @@ public:
maxs = size - 2; maxs = size - 2;
maxsf = (float)maxs; maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
maxsv = _mm_set1_ps( maxs ); maxsv = F2V( maxs );
maxsiv = _mm_cvttps_epi32( maxsv );
sizeiv = _mm_set1_epi32( (int)(size - 1) ); sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = _mm_set1_ps( size - 1 ); sizev = F2V( size - 1 );
#endif #endif
} }
void operator ()(int s, int flags = 0xfffffff) void operator ()(int s, int flags = 0xfffffff)
@ -150,10 +146,9 @@ public:
maxs = size - 2; maxs = size - 2;
maxsf = (float)maxs; maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
maxsv = _mm_set1_ps( maxs ); maxsv = F2V( maxs );
maxsiv = _mm_cvttps_epi32( maxsv );
sizeiv = _mm_set1_epi32( (int)(size - 1) ); sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = _mm_set1_ps( size - 1 ); sizev = F2V( size - 1 );
#endif #endif
} }
@ -167,11 +162,11 @@ public:
assert (s > 0); assert (s > 0);
if (source == NULL) { if (!source) {
printf("source is NULL!\n"); printf("source is NULL!\n");
} }
assert (source != NULL); assert (source != nullptr);
#endif #endif
dirty = false; // Assumption dirty = false; // Assumption
clip = flags; clip = flags;
@ -182,10 +177,9 @@ public:
maxs = size - 2; maxs = size - 2;
maxsf = (float)maxs; maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
maxsv = _mm_set1_ps( size - 2); maxsv = F2V( size - 2);
maxsiv = _mm_cvttps_epi32( maxsv );
sizeiv = _mm_set1_epi32( (int)(size - 1) ); sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = _mm_set1_ps( size - 1 ); sizev = F2V( size - 1 );
#endif #endif
for (int i = 0; i < s; i++) { for (int i = 0; i < s; i++) {
@ -195,7 +189,7 @@ public:
LUT() LUT()
{ {
data = NULL; data = nullptr;
reset(); reset();
} }
@ -237,10 +231,10 @@ public:
if (this != &rhs) { if (this != &rhs) {
if (rhs.size > this->size) { if (rhs.size > this->size) {
delete [] this->data; delete [] this->data;
this->data = NULL; this->data = nullptr;
} }
if (this->data == NULL) { if (this->data == nullptr) {
this->data = new T[rhs.size]; this->data = new T[rhs.size];
} }
@ -252,10 +246,9 @@ public:
this->maxs = this->size - 2; this->maxs = this->size - 2;
this->maxsf = (float)this->maxs; this->maxsf = (float)this->maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
this->maxsv = _mm_set1_ps( this->size - 2); this->maxsv = F2V( this->size - 2);
this->maxsiv = _mm_cvttps_epi32( this->maxsv );
this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) );
this->sizev = _mm_set1_ps( this->size - 1 ); this->sizev = F2V( this->size - 1 );
#endif #endif
} }
@ -268,14 +261,15 @@ public:
} }
#if defined( __SSE2__ ) && defined( __x86_64__ ) #if defined( __SSE2__ ) && defined( __x86_64__ )
__m128 operator[](__m128 indexv ) const /*
vfloat operator[](vfloat indexv ) const
{ {
// printf("don't use this operator. It's not ready for production"); // printf("don't use this operator. It's not ready for production");
return _mm_setzero_ps(); return _mm_setzero_ps();
// convert floats to ints // convert floats to ints
__m128i idxv = _mm_cvttps_epi32( indexv ); vint idxv = _mm_cvttps_epi32( indexv );
__m128 tempv, resultv, p1v, p2v; vfloat tempv, resultv, p1v, p2v;
vmask maxmask = vmaskf_gt(indexv, maxsv); vmask maxmask = vmaskf_gt(indexv, maxsv);
idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv))); idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps()); vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
@ -327,15 +321,55 @@ public:
p2v = _mm_move_ss( p2v, tempv); p2v = _mm_move_ss( p2v, tempv);
// now p2v is 3 2 1 0 // now p2v is 3 2 1 0
__m128 diffv = indexv - _mm_cvtepi32_ps ( idxv ); vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv); diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
resultv = p1v + p2v * diffv; resultv = p1v + p2v * diffv;
return resultv ; return resultv ;
} }
*/
__m128 operator[](__m128i idxv ) const #ifdef __SSE4_1__
// SSE4.1 variant of the integer-vector lookup (compiled only under
// #ifdef __SSE4_1__): clamps each of the four 32-bit indices in idxv to
// [0, sizeiv] and returns the four corresponding LUT entries, with lane i
// of the result holding data[idxv lane i]. No interpolation is performed.
// sizeiv is filled with (size - 1) by the visible setup code.
// NOTE(review): entries are read with _mm_load_ss, so this presumably
// expects T to be float — confirm against the instantiations in use.
vfloat operator[](vint idxv ) const
{ {
__m128 tempv, p1v; vfloat tempv, p1v;
// clamp directly in the integer domain: max(0, min(idx, size - 1)) per lane
idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
// access the LUT 4 times and shuffle the values into p1v
int idx;
// get 4th value
idx = _mm_extract_epi32(idxv, 3);
tempv = _mm_load_ss(&data[idx]);
// broadcast the loaded scalar to all four lanes
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3
// get 3rd value
idx = _mm_extract_epi32(idxv, 2);
tempv = _mm_load_ss(&data[idx]);
// _mm_move_ss replaces only the lowest lane of p1v with tempv's lowest lane
p1v = _mm_move_ss( p1v, tempv);
// now p1v is 3 3 3 2
// get 2nd value
idx = _mm_extract_epi32(idxv, 1);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1
// get 1st value
idx = _mm_cvtsi128_si32(idxv);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0
return p1v;
}
#else
vfloat operator[](vint idxv ) const
{
vfloat tempv, p1v;
tempv = _mm_cvtepi32_ps(idxv); tempv = _mm_cvtepi32_ps(idxv);
tempv = _mm_min_ps( tempv, sizev ); tempv = _mm_min_ps( tempv, sizev );
idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) )); idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) ));
@ -346,7 +380,7 @@ public:
// get 4th value // get 4th value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3))); idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
tempv = _mm_load_ss(&data[idx]); tempv = _mm_load_ss(&data[idx]);
p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0)); p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3 // now p1v is 3 3 3 3
// get 3rd value // get 3rd value
@ -358,7 +392,7 @@ public:
// get 2nd value // get 2nd value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1))); idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
tempv = _mm_load_ss(&data[idx]); tempv = _mm_load_ss(&data[idx]);
p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0)); p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2 // now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv ); p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1 // now p1v is 3 2 3 1
@ -366,13 +400,14 @@ public:
// get 1st value // get 1st value
idx = _mm_cvtsi128_si32 (idxv); idx = _mm_cvtsi128_si32 (idxv);
tempv = _mm_load_ss(&data[idx]); tempv = _mm_load_ss(&data[idx]);
p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0)); p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1 // now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv ); p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0 // now p1v is 3 2 1 0
return p1v; return p1v;
} }
#endif
#endif #endif
// use with float indices // use with float indices
@ -465,7 +500,7 @@ public:
} }
dirty = true; dirty = true;
data = NULL; data = nullptr;
owner = 1; owner = 1;
size = 0; size = 0;
upperBound = 0; upperBound = 0;
@ -484,7 +519,7 @@ class HueLUT : public LUTf
{ {
public: public:
HueLUT() : LUTf() {} HueLUT() : LUTf() {}
HueLUT(bool createArray) : LUTf() explicit HueLUT(bool createArray) : LUTf()
{ {
if (createArray) { if (createArray) {
this->operator () (501, LUT_CLIP_BELOW | LUT_CLIP_ABOVE); this->operator () (501, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);

View File

@ -39,6 +39,11 @@ typedef __m128i vint2;
#define STVFU(x,y) _mm_storeu_ps(&x,y) #define STVFU(x,y) _mm_storeu_ps(&x,y)
#endif #endif
// PERMUTEPS(a, mask): rearranges the four floats of the single vector `a`
// according to the immediate `mask` (e.g. built with _MM_SHUFFLE).
// On x86_64 builds with AVX enabled it maps to _mm_permute_ps; otherwise it
// falls back to _mm_shuffle_ps with `a` passed as both source operands.
// NOTE: the fallback expands `a` twice, so do not pass an expression with
// side effects as the first argument.
#if defined(__x86_64__) && defined(__AVX__)
#define PERMUTEPS(a,mask) _mm_permute_ps(a,mask)
#else
#define PERMUTEPS(a,mask) _mm_shuffle_ps(a,a,mask)
#endif
static INLINE vfloat LC2VFU(float &a) static INLINE vfloat LC2VFU(float &a)
{ {