Speedup for histogram panel

This commit is contained in:
heckflosse 2016-05-03 20:12:04 +02:00
parent 2dd2f5ca17
commit aa5072fa0a
2 changed files with 44 additions and 25 deletions

View File

@ -26,17 +26,20 @@ typedef __m128i vint2;
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)
#else // There is a bug in gcc 4.7.x when using OpenMP together with aligned memory and -O3; the aligned load/store functions also need to be mapped to their unaligned counterparts for WIN32 builds.
#define LVF(x) _mm_loadu_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_storeu_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_loadu_si128((__m128i*)&x)
#endif
#else
#define LVF(x) _mm_load_ps((float*)&x)
#define LVFU(x) _mm_loadu_ps(&x)
#define STVF(x,y) _mm_store_ps(&x,y)
#define STVFU(x,y) _mm_storeu_ps(&x,y)
#define LVI(x) _mm_load_si128((__m128i*)&x)
#endif
#if defined(__x86_64__) && defined(__AVX__)

View File

@ -25,8 +25,7 @@
#include "rtimage.h"
#include "../rtengine/improccoordinator.h"
#include "../rtengine/color.h"
#include "../rtengine/opthelper.h"
using namespace rtengine;
extern Glib::ustring argv0;
@ -852,7 +851,7 @@ void HistogramArea::update (LUTu &histRed, LUTu &histGreen, LUTu &histBlue, LUTu
g_idle_add (histupdateUI, haih);
}
void HistogramArea::renderHistogram ()
SSEFUNCTION void HistogramArea::renderHistogram ()
{
if (!is_realized ()) {
@ -878,7 +877,7 @@ void HistogramArea::renderHistogram ()
// make double copies of LUT, one for faster access, another one to scale down the raw histos
LUTu rhchanged(256), ghchanged(256), bhchanged(256);
unsigned int lhisttemp[256], chisttemp[256], rhtemp[256], ghtemp[256], bhtemp[256];
unsigned int lhisttemp[256] ALIGNED16 {0}, chisttemp[256] ALIGNED16 {0}, rhtemp[256] ALIGNED16 {0}, ghtemp[256] ALIGNED16 {0}, bhtemp[256] ALIGNED16 {0};
const int scale = (rawMode ? 8 : 1);
for(int i = 0; i < 256; i++) {
@ -937,31 +936,48 @@ void HistogramArea::renderHistogram ()
if (!fullMode) {
int area = 0;
if(!rawMode)
#ifdef __SSE2__
vint onev = _mm_set1_epi32(1);
vint iv = (vint)ZEROV;
#endif
for (int i = 0; i < fullhistheight; i++) {
#ifdef __SSE2__
vint areatempv = (vint)ZEROV;
for (int j = 0; j < 256; j += 4) {
vmask mask1v = _mm_cmpgt_epi32(LVI(lhisttemp[j]), iv);
vmask mask2v = _mm_cmpgt_epi32(LVI(rhtemp[j]), iv);
vmask mask3v = _mm_cmpgt_epi32(LVI(ghtemp[j]), iv);
vmask mask4v = _mm_cmpgt_epi32(LVI(bhtemp[j]), iv);
mask1v = _mm_or_si128(mask1v, mask2v);
mask3v = _mm_or_si128(mask3v, mask4v);
mask2v = _mm_cmpgt_epi32(LVI(chisttemp[j]), iv);
mask1v = _mm_or_si128(mask1v, mask3v);
mask1v = _mm_or_si128(mask1v, mask2v);
areatempv = _mm_add_epi32(areatempv, _mm_and_si128(mask1v, onev));
}
areatempv = _mm_add_epi32(areatempv, (vint)_mm_movehl_ps((vfloat)areatempv, (vfloat)areatempv));
areatempv = _mm_add_epi32(areatempv, _mm_shuffle_epi32(areatempv, 1));
area += _mm_cvtsi128_si32(areatempv);
iv = _mm_add_epi32(iv, onev);
#else
for (int j = 0; j < 256; j++)
if ((needLuma && lhisttemp[j] > i) || (needChroma && chisttemp[j] > i) || (needRed && rhtemp[j] > i) || (needGreen && ghtemp[j] > i) || (needBlue && bhtemp[j] > i)) {
if (lhisttemp[j] > i || rhtemp[j] > i || ghtemp[j] > i || bhtemp[j] > i || chisttemp[j] > i) {
area++;
}
#endif
if ((double)area / (256 * (i + 1)) < 0.3) {
realhistheight = i;
break;
}
}
else
for (int i = 0; i < fullhistheight; i++) {
for (int j = 0; j < 256; j++)
if ((needRed && rhtemp[j] > i) || (needGreen && ghtemp[j] > i) || (needBlue && bhtemp[j] > i)) {
area++;
}
if ((double)area / (256 * (i + 1)) < 0.3) {
realhistheight = i;
break;
}
}
}
if (realhistheight < winh - 2) {