diff --git a/rtengine/LUT.h b/rtengine/LUT.h index 1c7872e8e..73df71bbb 100644 --- a/rtengine/LUT.h +++ b/rtengine/LUT.h @@ -78,10 +78,10 @@ template class LUT { private: // list of variables ordered to improve cache speed - unsigned int maxs; + unsigned int maxs; T * data; unsigned int clip, size, owner; -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) __m128 maxsv __attribute__ ((aligned (16))); __m128 sizev __attribute__ ((aligned (16))); __m128i maxsiv __attribute__ ((aligned (16))); @@ -104,7 +104,7 @@ public: owner = 1; size = s; maxs=size-2; -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = _mm_set1_ps( maxs ); maxsiv = _mm_cvttps_epi32( maxsv ); sizeiv = _mm_set1_epi32( (int)(size-1) ); @@ -125,7 +125,7 @@ public: owner = 1; size = s; maxs=size-2; -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = _mm_set1_ps( maxs ); maxsiv = _mm_cvttps_epi32( maxsv ); sizeiv = _mm_set1_epi32( (int)(size-1) ); @@ -148,7 +148,7 @@ public: owner = 1; size = s; maxs=size-2; -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) maxsv = _mm_set1_ps( size - 2); maxsiv = _mm_cvttps_epi32( maxsv ); sizeiv = _mm_set1_epi32( (int)(size-1) ); @@ -190,7 +190,7 @@ public: memcpy(this->data,rhs.data,rhs.size*sizeof(T)); this->size=rhs.size; this->maxs=this->size-2; -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) this->maxsv = _mm_set1_ps( this->size - 2); this->maxsiv = _mm_cvttps_epi32( this->maxsv ); this->sizeiv = _mm_set1_epi32( (int)(this->size-1) ); @@ -210,14 +210,14 @@ public: else return data[size - 1]; } - + } - -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) + +#if defined( __SSE2__ ) && defined( __x86_64__ ) __m128 operator[](__m128 indexv ) const { printf("don't use this operator. It's not ready for production"); return _mm_setzero_ps(); - + // convert floats to ints __m128i idxv = _mm_cvttps_epi32( indexv ); __m128 tempv, resultv, p1v, p2v; @@ -258,7 +258,7 @@ public: tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1,1,1,1)); p2v = _mm_move_ss( p2v, tempv); // now p1v is 3 2 3 1 - + // get 1st value idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv,_MM_SHUFFLE(0,0,0,0))); tempv = LVFU(data[idx]); @@ -307,7 +307,7 @@ public: // now p1v is 3 2 3 2 p1v = _mm_move_ss( p1v, tempv ); // now p1v is 3 2 3 1 - + // get 1st value idx = _mm_cvtsi128_si32 (idxv); tempv = _mm_load_ss(&data[idx]); @@ -342,8 +342,8 @@ public: T p2 = data[idx + 1]-p1; return (p1 + p2*diff); } - - + + #ifndef NDEBUG // Debug facility ; dump the content of the LUT in a file. No control of the filename is done void dump(Glib::ustring fname) { diff --git a/rtengine/shmap.cc b/rtengine/shmap.cc index 9681439ff..1faec01f2 100644 --- a/rtengine/shmap.cc +++ b/rtengine/shmap.cc @@ -7,7 +7,7 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * RawTherapee is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -71,18 +71,18 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int else { //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //experimental dirpyr shmap - + float thresh = 100*radius;//1000; LUTf rangefn(0x10000); float ** dirpyrlo[2]; int intfactor = 1024;//16384; - + //set up range functions for (int i=0; i<0x10000; i++) { //rangefn[i] = (int)(((thresh)/((double)(i) + (thresh)))*intfactor); rangefn[i] = static_cast(xexpf(-(min(10.0f,(static_cast(i)*i) / (thresh*thresh))))*intfactor); - //if (rangefn[i]<0 || rangefn[i]>intfactor) + //if (rangefn[i]<0 || rangefn[i]>intfactor) //printf("i=%d rangefn=%d arg=%f \n",i,rangefn[i], float(i*i) / (thresh*thresh)); } @@ -104,11 +104,11 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int } dirpyr_shmap(dirpyrlo[1-indx], map, W, H, rangefn, level, scale ); - + freeArray(dirpyrlo[0], H); freeArray(dirpyrlo[1], H); - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% /* @@ -122,8 +122,8 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int map[i][j] = (buffer[i-1][j-1]+buffer[i-1][j]+buffer[i-1][j+1]+buffer[i][j-1]+buffer[i][j]+buffer[i][j+1]+buffer[i+1][j-1]+buffer[i+1][j]+buffer[i+1][j+1])/9; else map[i][j] = buffer[i][j]; -*/ - +*/ + } // update average, minimum, maximum @@ -178,21 +178,21 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in #endif { //scale is spacing of directional averaging weights - + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // calculate weights, compute directionally weighted average - + int scalewin, halfwin; if(level < 2) { halfwin = 1; scalewin = halfwin*scale; - + #ifdef _OPENMP #pragma omp parallel #endif { -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) __m128 dirwtv, valv, normv; #endif // __SSE2__ int j; @@ -205,7 +205,7 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in { float val=0; float norm=0; - + for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) { for (int jnbr=j%scale; jnbr<=j+scalewin; jnbr+=scale) { dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] ); @@ -215,12 +215,12 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in } data_coarse[i][j] = val/norm; // low pass filter } -#if defined( __SSE2__ ) && ((defined( WIN32 ) && defined( __x86_64__ )) || !defined( WIN32 )) +#if defined( __SSE2__ ) && defined( __x86_64__ ) for(; j < (width-scalewin)-3; j+=4) { valv= _mm_setzero_ps(); normv= _mm_setzero_ps(); - + for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) { for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) { dirwtv = ( rangefn[_mm_cvttps_epi32(vabsf(LVFU(data_fine[inbr][jnbr])-LVFU(data_fine[i][j])))] ); @@ -234,7 +234,7 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in { float val=0; float norm=0; - + for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) { for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) { dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] ); @@ -250,7 +250,7 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in { float val=0; float norm=0; - + for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) { for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) { dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] ); @@ -265,7 +265,7 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in { float val=0; float norm=0; - + for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) { for (int jnbr=j-scalewin; jnbr