diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index da1691748..0f1fc5759 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -26,17 +26,20 @@ typedef __m128i vint2; #define LVFU(x) _mm_loadu_ps(&x) #define STVF(x,y) _mm_store_ps(&x,y) #define STVFU(x,y) _mm_storeu_ps(&x,y) +#define LVI(x) _mm_load_si128((__m128i*)&x) #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the aligned functions to unaligned functions for WIN32 builds #define LVF(x) _mm_loadu_ps((float*)&x) #define LVFU(x) _mm_loadu_ps(&x) #define STVF(x,y) _mm_storeu_ps(&x,y) #define STVFU(x,y) _mm_storeu_ps(&x,y) +#define LVI(x) _mm_loadu_si128((__m128i*)&x) #endif #else #define LVF(x) _mm_load_ps((float*)&x) #define LVFU(x) _mm_loadu_ps(&x) #define STVF(x,y) _mm_store_ps(&x,y) #define STVFU(x,y) _mm_storeu_ps(&x,y) +#define LVI(x) _mm_load_si128((__m128i*)&x) #endif #if defined(__x86_64__) && defined(__AVX__) diff --git a/rtgui/histogrampanel.cc b/rtgui/histogrampanel.cc index e5d3d560e..9c8163c8e 100644 --- a/rtgui/histogrampanel.cc +++ b/rtgui/histogrampanel.cc @@ -25,8 +25,7 @@ #include "rtimage.h" #include "../rtengine/improccoordinator.h" #include "../rtengine/color.h" - - +#include "../rtengine/opthelper.h" using namespace rtengine; extern Glib::ustring argv0; @@ -852,7 +851,7 @@ void HistogramArea::update (LUTu &histRed, LUTu &histGreen, LUTu &histBlue, LUTu g_idle_add (histupdateUI, haih); } -void HistogramArea::renderHistogram () +SSEFUNCTION void HistogramArea::renderHistogram () { if (!is_realized ()) { @@ -878,7 +877,7 @@ void HistogramArea::renderHistogram () // make double copies of LUT, one for faster access, another one to scale down the raw histos LUTu rhchanged(256), ghchanged(256), bhchanged(256); - unsigned int lhisttemp[256], chisttemp[256], rhtemp[256], ghtemp[256], bhtemp[256]; + unsigned int lhisttemp[256] ALIGNED16 {0}, chisttemp[256] ALIGNED16 {0}, rhtemp[256] ALIGNED16 {0}, ghtemp[256] ALIGNED16 {0}, bhtemp[256] ALIGNED16 {0}; const int scale = (rawMode ? 8 : 1); for(int i = 0; i < 256; i++) { @@ -937,31 +936,48 @@ void HistogramArea::renderHistogram () if (!fullMode) { int area = 0; - if(!rawMode) - for (int i = 0; i < fullhistheight; i++) { - for (int j = 0; j < 256; j++) - if ((needLuma && lhisttemp[j] > i) || (needChroma && chisttemp[j] > i) || (needRed && rhtemp[j] > i) || (needGreen && ghtemp[j] > i) || (needBlue && bhtemp[j] > i)) { - area++; - } +#ifdef __SSE2__ + vint onev = _mm_set1_epi32(1); + vint iv = (vint)ZEROV; +#endif - if ((double)area / (256 * (i + 1)) < 0.3) { - realhistheight = i; - break; - } - } - else - for (int i = 0; i < fullhistheight; i++) { - for (int j = 0; j < 256; j++) - if ((needRed && rhtemp[j] > i) || (needGreen && ghtemp[j] > i) || (needBlue && bhtemp[j] > i)) { - area++; - } + for (int i = 0; i < fullhistheight; i++) { +#ifdef __SSE2__ + vint areatempv = (vint)ZEROV; + + for (int j = 0; j < 256; j += 4) { + vmask mask1v = _mm_cmpgt_epi32(LVI(lhisttemp[j]), iv); + vmask mask2v = _mm_cmpgt_epi32(LVI(rhtemp[j]), iv); + vmask mask3v = _mm_cmpgt_epi32(LVI(ghtemp[j]), iv); + vmask mask4v = _mm_cmpgt_epi32(LVI(bhtemp[j]), iv); + mask1v = _mm_or_si128(mask1v, mask2v); + mask3v = _mm_or_si128(mask3v, mask4v); + mask2v = _mm_cmpgt_epi32(LVI(chisttemp[j]), iv); + mask1v = _mm_or_si128(mask1v, mask3v); + mask1v = _mm_or_si128(mask1v, mask2v); + areatempv = _mm_add_epi32(areatempv, _mm_and_si128(mask1v, onev)); - if ((double)area / (256 * (i + 1)) < 0.3) { - realhistheight = i; - break; - } } + areatempv = _mm_add_epi32(areatempv, (vint)_mm_movehl_ps((vfloat)areatempv, (vfloat)areatempv)); + areatempv = _mm_add_epi32(areatempv, _mm_shuffle_epi32(areatempv, 1)); + area += _mm_cvtsi128_si32(areatempv); + iv = _mm_add_epi32(iv, onev); + +#else + + for (int j = 0; j < 256; j++) + if (lhisttemp[j] > i || rhtemp[j] > i || ghtemp[j] > i || bhtemp[j] > i || chisttemp[j] > i) { + area++; + } + +#endif + + if ((double)area / (256 * (i + 1)) < 0.3) { + realhistheight = i; + break; + } + } } if (realhistheight < winh - 2) {