Increased precision for Shadows/Highlights (and a small speedup and reduced memory consumption for method 'Sharp mask'), Issue 2523

2014-10-09 01:21:52 +02:00
parent 96d80d1102
commit ef66cb2e7f
4 changed files with 120 additions and 101 deletions
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -204,6 +204,8 @@ void Crop::update (int todo) {
        double radius = sqrt (double(SKIPS(parent->fw,skip)*SKIPS(parent->fw,skip)+SKIPS(parent->fh,skip)*SKIPS(parent->fh,skip))) / 2.0;
        double shradius = params.sh.radius;
        if (!params.sh.hq) shradius *= radius / 1800.0;
+        if(!cshmap)
+			cshmap = new SHMap (cropw, croph, true);
        cshmap->update (baseCrop, shradius, parent->ipf.lumimul, params.sh.hq, skip);
        if(parent->shmap->min_f < 65535.f) // don't call forceStat with wrong values
            cshmap->forceStat (parent->shmap->max_f, parent->shmap->min_f, parent->shmap->avg);
@@ -525,12 +527,14 @@ if (settings->verbose) printf ("setcropsizes before lock\n");

        if (cbuffer  ) delete [] cbuffer;
        if (cbuf_real) delete [] cbuf_real;
-        if (cshmap   ) delete    cshmap;
+        if (cshmap   ) { delete    cshmap; cshmap = NULL;}
        cbuffer = new float*[croph];
        cbuf_real= new float[(croph+2)*cropw];
        for (int i=0; i<croph; i++)
            cbuffer[i] = cbuf_real+cropw*i+cropw;
-        cshmap = new SHMap (cropw, croph, true);
+		if(params.sh.enabled) {
+			cshmap = new SHMap (cropw, croph, true);
+		}

        EditBuffer::resize(cropw, croph);

--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -310,6 +310,8 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall) {
        double radius = sqrt (double(pW*pW+pH*pH)) / 2.0;
        double shradius = params.sh.radius;
        if (!params.sh.hq) shradius *= radius / 1800.0;
+        if(!shmap)
+			shmap = new SHMap (pW, pH, true);
        shmap->update (oprevi, shradius, ipf.lumimul, params.sh.hq, scale);
    }
    readyphase++;
@@ -688,7 +690,8 @@ void ImProcCoordinator::freeAll () {
            delete previmg;

        delete workimg;
-        delete shmap;
+        if(shmap)
+			delete shmap; shmap = NULL;

    }
    allocated = false;
@@ -733,7 +736,9 @@ if (settings->verbose) printf ("setscale before lock\n");
        //ncie is only used in ImProcCoordinator::updatePreviewImage, it will be allocated on first use and deleted if not used anymore
        previmg = new Image8 (pW, pH);
        workimg = new Image8 (pW, pH);
-        shmap = new SHMap (pW, pH, true);
+   		if(params.sh.enabled) {
+			shmap = new SHMap (pW, pH, true);
+   		}

        allocated = true;
    }
--- a/rtengine/shmap.cc
+++ b/rtengine/shmap.cc
@@ -21,11 +21,8 @@
 #include "rtengine.h"
 #include "rt_math.h"
 #include "rawimagesource.h"
-#include "sleef.c"
 #undef THREAD_PRIORITY_NORMAL
-#ifdef __SSE2__
-#include "sleefsseavx.c"
-#endif // __SSE2__
+#include "opthelper.h"

 namespace rtengine {

@@ -36,6 +33,7 @@ SHMap::SHMap (int w, int h, bool multiThread) : W(w), H(h), multiThread(multiThr
    map = new float*[H];
    for (int i=0; i<H; i++)
        map[i] = new float[W];
+
 }

 SHMap::~SHMap () {
@@ -45,18 +43,24 @@ SHMap::~SHMap () {
    delete [] map;
 }

-void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip) {
+void SHMap::fillLuminance( Imagefloat * img, float **luminance, double lumi[3] ) // fill with luminance
+{

-    // fill with luminance
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
    for (int i=0; i<H; i++)
        for (int j=0; j<W; j++) {
-            map[i][j] = lumi[0]*std::max(img->r(i,j),0.f) + lumi[1]*std::max(img->g(i,j),0.f) + lumi[2]*std::max(img->b(i,j),0.f);
+            luminance[i][j] = lumi[0]*std::max(img->r(i,j),0.f) + lumi[1]*std::max(img->g(i,j),0.f) + lumi[2]*std::max(img->b(i,j),0.f);
 		}
+	
+}
+
+void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip) {

    if (!hq) {
+		fillLuminance( img, map, lumi);
+
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
@@ -72,43 +76,64 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 		//experimental dirpyr shmap

-		float thresh = 100*radius;//1000;
-		LUTf rangefn(0x10000);
-		float ** dirpyrlo[2];
+		float thresh = (100.f*radius);//1000;

-		int intfactor = 1024;//16384;
-
-		//set up range functions
-		for (int i=0; i<0x10000; i++) {
-			//rangefn[i] = (int)(((thresh)/((double)(i) + (thresh)))*intfactor);
-			rangefn[i] = static_cast<int>(xexpf(-(min(10.0f,(static_cast<float>(i)*i) / (thresh*thresh))))*intfactor);
-			//if (rangefn[i]<0 || rangefn[i]>intfactor)
-				//printf("i=%d rangefn=%d arg=%f \n",i,rangefn[i], float(i*i) / (thresh*thresh));
+		// Set up the range function.
+		// First calculate the size of the lookup table: there is a value k such that rangefn[i] would be exp(-10) for all i>=k.
+		// We use this fact, together with the automatic clipping of the LUT, to reduce both the size of the LUT and the number of calculations needed to fill it.
+		// In the past this LUT had only integer precision, with rangefn[i] = 0 for all i>=k.
+		// We set the last element to a small epsilon (1e-15) instead of zero to avoid divisions by zero.
+		const int lutSize = thresh * sqrtf(10.f) + 1;
+		thresh *= thresh;
+		LUTf rangefn(lutSize);
+		for (int i=0; i<lutSize-1; i++) {
+			rangefn[i] = xexpf(-min(10.f,(static_cast<float>(i)*i) / thresh ));//*intfactor;
+		}
+		rangefn[lutSize-1] = 1e-15f;
+		
+		// We need one temporary buffer
+		float ** buffer = allocArray<float> (W, H);
+		
+		// the final result has to be in map
+		// for an even number of levels that means: map => buffer, buffer => map
+		// for an odd number of levels that means: buffer => map, map => buffer, buffer => map
+		// so let's calculate the number of levels first
+		// There are at least two levels
+		int numLevels=2;
+		int scale=2;
+		while (skip*scale<16) {
+			scale *= 2;
+			numLevels++;
 		}

-		dirpyrlo[0] = allocArray<float> (W, H);
-		dirpyrlo[1] = allocArray<float> (W, H);
+		float ** dirpyrlo[2];
+		if(numLevels&1) { // odd number of levels, start with buffer
+			dirpyrlo[0] = buffer;
+			dirpyrlo[1] = map;
+		} else { // even number of levels, start with map
+			dirpyrlo[0] = map;
+			dirpyrlo[1] = buffer;
+		}

-		int scale=1;
+		fillLuminance( img, dirpyrlo[0], lumi);
+
+		scale = 1;
 		int level=0;
 		int indx=0;
-		dirpyr_shmap(map, dirpyrlo[indx], W, H, rangefn, 0, scale );
+		dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1-indx], W, H, rangefn, level, scale );
 		scale *= 2;
-		level += 1;
+		level ++;
 		indx = 1-indx;
 		while (skip*scale<16) {
-			dirpyr_shmap(dirpyrlo[1-indx], dirpyrlo[indx], W, H, rangefn, level, scale );
+			dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1-indx], W, H, rangefn, level, scale );
 			scale *= 2;
-			level += 1;
+			level ++;
 			indx = 1-indx;
 		}

-		dirpyr_shmap(dirpyrlo[1-indx], map, W, H, rangefn, level, scale );
-
-
-		freeArray<float>(dirpyrlo[0], H);
-		freeArray<float>(dirpyrlo[1], H);
+		dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1-indx], W, H, rangefn, level, scale );

+		freeArray<float>(buffer, H);

 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 /*
@@ -126,8 +151,7 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int

    }
    // update average, minimum, maximum
-
-    float _avg = 0.0f;
+    double _avg = 0.0f;	// use double precision to gain precision especially at systems with few cores and big pictures (error for 36 MPixel on single core was about 8% with float)
    min_f = 65535;
    max_f = 0;
 #ifdef _OPENMP
@@ -138,7 +162,7 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int
    float _max_f = 0.0f;
    float _val;
 #ifdef _OPENMP
-#pragma omp for reduction(+:_avg) nowait
+#pragma omp for reduction(+:_avg) schedule(dynamic,16) nowait
 #endif
    for (int i=0; i<H; i++)
        for (int j=0; j<W; j++) {
@@ -171,11 +195,7 @@ void SHMap::forceStat (float max_, float min_, float avg_) {
    avg = avg_;
 }

-#if defined( __SSE__ ) && defined( WIN32 )
-__attribute__((force_align_arg_pointer)) void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, int height, LUTf & rangefn, int level, int scale)
-#else
-void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, int height, LUTf & rangefn, int level, int scale)
-#endif
+SSEFUNCTION void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, int height, LUTf & rangefn, int level, int scale)
 {
 	//scale is spacing of directional averaging weights

@@ -193,7 +213,7 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in
 #endif
 {
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-	__m128 dirwtv, valv, normv;
+	__m128 dirwtv, valv, normv, dftemp1v, dftemp2v;
 #endif // __SSE2__
 	int j;
 #ifdef _OPENMP
@@ -201,11 +221,9 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in
 #endif
 	for(int i = 0; i < height; i++) {
 		float dirwt;
-		for(j = 0; j < scalewin; j++)
-		{
-			float val=0;
-			float norm=0;
-
+		for(j = 0; j < scalewin; j++) {
+			float val=0.f;
+			float norm=0.f;
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j%scale; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -216,26 +234,25 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in
 			data_coarse[i][j] = val/norm; // low pass filter
 		}
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-		for(; j < (width-scalewin)-3; j+=4)
-		{
+		int inbrMin = max(i-scalewin,i%scale);
+		for(; j < (width-scalewin)-3; j+=4) {
 			valv= _mm_setzero_ps();
 			normv= _mm_setzero_ps();
-
-			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
+			dftemp1v = LVFU(data_fine[i][j]);
+			for(int inbr=inbrMin; inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
-					dirwtv = ( rangefn[_mm_cvttps_epi32(vabsf(LVFU(data_fine[inbr][jnbr])-LVFU(data_fine[i][j])))] );
-					valv += dirwtv*LVFU(data_fine[inbr][jnbr]);
+					dftemp2v = LVFU(data_fine[inbr][jnbr]);
+					dirwtv = ( rangefn[_mm_cvttps_epi32(vabsf(dftemp2v-dftemp1v))] );
+					valv += dirwtv*dftemp2v;
 					normv += dirwtv;
 				}
 			}
 			_mm_storeu_ps( &data_coarse[i][j], valv/normv);
 		}
-		for(; j < width-scalewin; j++)
-		{
-			float val=0;
-			float norm=0;
-
-			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
+		for(; j < width-scalewin; j++) {
+			float val=0.f;
+			float norm=0.f;
+			for(int inbr=inbrMin; inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
 					val += dirwt*data_fine[inbr][jnbr];
@@ -246,11 +263,9 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in
 		}

 #else
-		for(; j < width-scalewin; j++)
-		{
-			float val=0;
-			float norm=0;
-
+		for(; j < width-scalewin; j++) {
+			float val=0.f;
+			float norm=0.f;
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -261,11 +276,9 @@ void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, in
 			data_coarse[i][j] = val/norm; // low pass filter
 		}
 #endif
-		for(; j < width; j++)
-		{
-			float val=0;
-			float norm=0;
-
+		for(; j < width; j++) {
+			float val=0.f;
+			float norm=0.f;
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<width; jnbr+=scale) {
 					dirwt = ( rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -289,21 +302,19 @@ else {
 #endif
 {
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-	__m128 dirwtv, valv, normv;
+	__m128 dirwtv, valv, normv, dftemp1v, dftemp2v;
 	float domkerv[5][5][4] __attribute__ ((aligned (16))) = {{{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}},{{1,1,1,1},{2,2,2,2},{2,2,2,2},{2,2,2,2},{1,1,1,1}},{{1,1,1,1},{2,2,2,2},{2,2,2,2},{2,2,2,2},{1,1,1,1}},{{1,1,1,1},{2,2,2,2},{2,2,2,2},{2,2,2,2},{1,1,1,1}},{{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}}};

 #endif // __SSE2__
 	int j;
 #ifdef _OPENMP
-#pragma omp for
+#pragma omp for schedule(dynamic,16)
 #endif
 	for(int i = 0; i < height; i++) {
 		float dirwt;
-		for(j = 0; j < scalewin; j++)
-		{
-			float val=0;
-			float norm=0;
-
+		for(j = 0; j < scalewin; j++) {
+			float val=0.f;
+			float norm=0.f;
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j%scale; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( domker[(inbr-i)/scale+halfwin][(jnbr-j)/scale+halfwin] * rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -314,25 +325,24 @@ else {
 			data_coarse[i][j] = val/norm; // low pass filter
 		}
 #if defined( __SSE2__ ) && defined( __x86_64__ )
-		for(; j < width-scalewin-3; j+=4)
-		{
+		for(; j < width-scalewin-3; j+=4) {
 			valv = _mm_setzero_ps();
 			normv = _mm_setzero_ps();
-
-			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
-				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
-					dirwtv = ( _mm_load_ps((float*)&domkerv[(inbr-i)/scale+halfwin][(jnbr-j)/scale+halfwin]) * rangefn[_mm_cvttps_epi32(vabsf(LVFU(data_fine[inbr][jnbr])-LVFU(data_fine[i][j])))] );
-					valv += dirwtv*LVFU(data_fine[inbr][jnbr]);
+			dftemp1v = LVFU(data_fine[i][j]);
+			for(int inbr=max(i-scalewin,i%scale); inbr<=MIN(i+scalewin, height-1); inbr+=scale) {
+				int indexihlp = (inbr-i)/scale+halfwin;
+				for (int jnbr=j-scalewin,indexjhlp = 0; jnbr<=j+scalewin; jnbr+=scale,indexjhlp++) {
+					dftemp2v = LVFU(data_fine[inbr][jnbr]);
+					dirwtv = ( _mm_load_ps((float*)&domkerv[indexihlp][indexjhlp]) * rangefn[_mm_cvttps_epi32(vabsf(dftemp2v-dftemp1v))] );
+					valv += dirwtv*dftemp2v;
 					normv += dirwtv;
 				}
 			}
 			_mm_storeu_ps( &data_coarse[i][j], valv/normv);
 		}
-		for(; j < width-scalewin; j++)
-		{
+		for(; j < width-scalewin; j++) {
 			float val=0;
 			float norm=0;
-
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( domker[(inbr-i)/scale+halfwin][(jnbr-j)/scale+halfwin] * rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -344,11 +354,9 @@ else {
 		}

 #else
-		for(; j < width-scalewin; j++)
-		{
+		for(; j < width-scalewin; j++) {
 			float val=0;
 			float norm=0;
-
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<=j+scalewin; jnbr+=scale) {
 					dirwt = ( domker[(inbr-i)/scale+halfwin][(jnbr-j)/scale+halfwin] * rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
@@ -359,11 +367,9 @@ else {
 			data_coarse[i][j] = val/norm; // low pass filter
 		}
 #endif
-		for(; j < width; j++)
-		{
+		for(; j < width; j++) {
 			float val=0;
 			float norm=0;
-
 			for(int inbr=max(i-scalewin,i%scale); inbr<=min(i+scalewin, height-1); inbr+=scale) {
 				for (int jnbr=j-scalewin; jnbr<width; jnbr+=scale) {
 					dirwt = ( domker[(inbr-i)/scale+halfwin][(jnbr-j)/scale+halfwin] * rangefn[abs(data_fine[inbr][jnbr]-data_fine[i][j])] );
--- a/rtengine/shmap.h
+++ b/rtengine/shmap.h
@@ -27,18 +27,22 @@ namespace rtengine {
 class SHMap {

    public: 
-        int W, H;
        float** map;
        float   max_f, min_f, avg;
-        bool multiThread;
        
-     SHMap (int w, int h, bool multiThread);
-    ~SHMap ();
+		 SHMap (int w, int h, bool multiThread);
+		~SHMap ();

-    void update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip);
-    void forceStat (float max_, float min_, float avg_);
-	void dirpyr_shmap       (float ** data_fine, float ** data_coarse, 
-							 int width, int height, LUTf & rangefn, int level, int scale);
+		void update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip);
+		void forceStat (float max_, float min_, float avg_);
+
+	private:
+		int W, H;
+		bool multiThread;
+
+		void fillLuminance( Imagefloat * img, float **luminance, double lumi[3] );
+		void dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, int height, LUTf & rangefn, int level, int scale);
+		
 };
 }
 #endif