Merge branch 'master' into gtk3
This commit is contained in:
@@ -197,7 +197,7 @@ void RawImageSource::CA_correct_RT(const double cared, const double cablue, cons
|
||||
|
||||
// assign working space
|
||||
constexpr int buffersize = 3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 63;
|
||||
char *buffer = (char *) calloc(buffersize, 1);
|
||||
char *buffer = (char *) malloc(buffersize);
|
||||
char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
|
||||
|
||||
// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
|
||||
@@ -231,6 +231,7 @@ void RawImageSource::CA_correct_RT(const double cared, const double cablue, cons
|
||||
#pragma omp for collapse(2) schedule(dynamic) nowait
|
||||
for (int top = -border ; top < height; top += ts - border2)
|
||||
for (int left = -border; left < width; left += ts - border2) {
|
||||
memset(buffer, 0, buffersize);
|
||||
const int vblock = ((top + border) / (ts - border2)) + 1;
|
||||
const int hblock = ((left + border) / (ts - border2)) + 1;
|
||||
const int bottom = min(top + ts, height + border);
|
||||
@@ -730,6 +731,7 @@ void RawImageSource::CA_correct_RT(const double cared, const double cablue, cons
|
||||
|
||||
for (int top = -border; top < height; top += ts - border2)
|
||||
for (int left = -border; left < width; left += ts - border2) {
|
||||
memset(buffer, 0, buffersize);
|
||||
float lblockshifts[2][2];
|
||||
const int vblock = ((top + border) / (ts - border2)) + 1;
|
||||
const int hblock = ((left + border) / (ts - border2)) + 1;
|
||||
|
||||
@@ -92,9 +92,9 @@ protected:
|
||||
private:
|
||||
unsigned int owner;
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
vfloat maxsv __attribute__ ((aligned (16)));
|
||||
vfloat sizev __attribute__ ((aligned (16)));
|
||||
vint sizeiv __attribute__ ((aligned (16)));
|
||||
vfloat maxsv ALIGNED16;
|
||||
vfloat sizev ALIGNED16;
|
||||
vint sizeiv ALIGNED16;
|
||||
#endif
|
||||
public:
|
||||
/// convenience flag! If one doesn't want to delete the buffer but wants to flag it to be recomputed...
|
||||
|
||||
@@ -418,7 +418,7 @@ SSEFUNCTION void ImProcFunctions::dirpyr_channel(float ** data_fine, float ** da
|
||||
__m128 thousandv = _mm_set1_ps( 1000.0f );
|
||||
__m128 dirwtv, valv, normv, dftemp1v, dftemp2v;
|
||||
// multiplied each value of domkerv by 1000 to avoid multiplication by 1000 inside the loop
|
||||
float domkerv[5][5][4] __attribute__ ((aligned (16))) = {{{1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}}};
|
||||
float domkerv[5][5][4] ALIGNED16 = {{{1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {2000, 2000, 2000, 2000}, {1000, 1000, 1000, 1000}}, {{1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}, {1000, 1000, 1000, 1000}}};
|
||||
#endif // __SSE2__
|
||||
|
||||
int j;
|
||||
@@ -458,7 +458,7 @@ SSEFUNCTION void ImProcFunctions::dirpyr_channel(float ** data_fine, float ** da
|
||||
|
||||
for (int jnbr = j - scalewin, indexjhlp = 0; jnbr <= j + scalewin; jnbr += scale, indexjhlp++) {
|
||||
dftemp2v = LVFU(data_fine[inbr][jnbr]);
|
||||
dirwtv = _mm_load_ps((float*)&domkerv[indexihlp][indexjhlp]) / (vabsf(dftemp1v - dftemp2v) + thousandv);
|
||||
dirwtv = LVF(domkerv[indexihlp][indexjhlp]) / (vabsf(dftemp1v - dftemp2v) + thousandv);
|
||||
valv += dirwtv * dftemp2v;
|
||||
normv += dirwtv;
|
||||
}
|
||||
|
||||
@@ -22,18 +22,18 @@ typedef __m128i vint2;
|
||||
//
|
||||
#ifdef __GNUC__
|
||||
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined( __x86_64__ ))
|
||||
#define LVF(x) _mm_load_ps(&x)
|
||||
#define LVF(x) _mm_load_ps((float*)&x)
|
||||
#define LVFU(x) _mm_loadu_ps(&x)
|
||||
#define STVF(x,y) _mm_store_ps(&x,y)
|
||||
#define STVFU(x,y) _mm_storeu_ps(&x,y)
|
||||
#else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the aligned functions to unaligned functions for WIN32 builds
|
||||
#define LVF(x) _mm_loadu_ps(&x)
|
||||
#define LVF(x) _mm_loadu_ps((float*)&x)
|
||||
#define LVFU(x) _mm_loadu_ps(&x)
|
||||
#define STVF(x,y) _mm_storeu_ps(&x,y)
|
||||
#define STVFU(x,y) _mm_storeu_ps(&x,y)
|
||||
#endif
|
||||
#else
|
||||
#define LVF(x) _mm_load_ps(&x)
|
||||
#define LVF(x) _mm_load_ps((float*)&x)
|
||||
#define LVFU(x) _mm_loadu_ps(&x)
|
||||
#define STVF(x,y) _mm_store_ps(&x,y)
|
||||
#define STVFU(x,y) _mm_storeu_ps(&x,y)
|
||||
|
||||
@@ -475,7 +475,7 @@ SSEFUNCTION void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, i
|
||||
{
|
||||
#if defined( __SSE2__ ) && defined( __x86_64__ )
|
||||
__m128 dirwtv, valv, normv, dftemp1v, dftemp2v, fgg;
|
||||
float domkerv[5][5][4] __attribute__ ((aligned (16))) = {{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}};
|
||||
float domkerv[5][5][4] ALIGNED16 = {{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}};
|
||||
|
||||
#endif // __SSE2__
|
||||
int j;
|
||||
@@ -513,7 +513,7 @@ SSEFUNCTION void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, i
|
||||
|
||||
for (int jnbr = j - scalewin, indexjhlp = 0; jnbr <= j + scalewin; jnbr += scale, indexjhlp++) {
|
||||
dftemp2v = LVFU(data_fine[inbr][jnbr]);
|
||||
dirwtv = ( _mm_load_ps((float*)&domkerv[indexihlp][indexjhlp]) * rangefn[_mm_cvttps_epi32(vabsf(dftemp2v - dftemp1v))] );
|
||||
dirwtv = ( LVF(domkerv[indexihlp][indexjhlp]) * rangefn[_mm_cvttps_epi32(vabsf(dftemp2v - dftemp1v))] );
|
||||
valv += dirwtv * dftemp2v;
|
||||
normv += dirwtv;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user