Fixed multiprocessor sharpening bug

see issue 1454
2012-07-01 22:20:09 +02:00
parent d47bab4380
commit 33f565a1ba
9 changed files with 132 additions and 68 deletions
--- a/rtengine/PF_correct_RT.cc
+++ b/rtengine/PF_correct_RT.cc
@@ -56,16 +56,15 @@ void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radiu
 #pragma omp parallel
 #endif
 	{
-		AlignedBuffer<double>* buffer = new AlignedBuffer<double> (max(src->W,src->H));
-		gaussHorizontal<float> (src->a, tmp1->a, buffer, src->W, src->H, radius, multiThread);
-		gaussHorizontal<float> (src->b, tmp1->b, buffer, src->W, src->H, radius, multiThread);
-		gaussVertical<float>   (tmp1->a, tmp1->a, buffer, src->W, src->H, radius, multiThread);
-		gaussVertical<float>   (tmp1->b, tmp1->b, buffer, src->W, src->H, radius, multiThread);
+		AlignedBufferMP<double> buffer(max(src->W,src->H));

-		gaussHorizontal<float> (src->L, tmp1->L, buffer, src->W, src->H, radius, multiThread);
-		gaussVertical<float>   (tmp1->L, tmp1->L, buffer, src->W, src->H, radius, multiThread);
+		gaussHorizontal<float> (src->a, tmp1->a, buffer, src->W, src->H, radius);
+		gaussHorizontal<float> (src->b, tmp1->b, buffer, src->W, src->H, radius);
+		gaussVertical<float>   (tmp1->a, tmp1->a, buffer, src->W, src->H, radius);
+		gaussVertical<float>   (tmp1->b, tmp1->b, buffer, src->W, src->H, radius);

-		delete buffer;
+		gaussHorizontal<float> (src->L, tmp1->L, buffer, src->W, src->H, radius);
+		gaussVertical<float>   (tmp1->L, tmp1->L, buffer, src->W, src->H, radius);
 	}
 	
 //#ifdef _OPENMP
--- a/rtengine/alignedbuffer.h
+++ b/rtengine/alignedbuffer.h
@@ -1,7 +1,7 @@
 /*
 *  This file is part of RawTherapee.
 *
- *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+*  Copyright (c) 2004-2012 Gabor Horvath <hgabor@rawtherapee.com>, Oliver Duis <oduis@oliverduis.de>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
@@ -19,7 +19,10 @@
 #ifndef _ALIGNEDBUFFER_
 #define _ALIGNEDBUFFER_
 #include <stdint.h>
+#include <vector>
+#include <glibmm.h>

+// Buffer whose data pointer is aligned to a multiple of "align" bytes (default 16); meant to be faster to access
 template <class T> class AlignedBuffer {

    private:
@@ -27,10 +30,12 @@ template <class T> class AlignedBuffer {
      
    public:
      T* data ;
+    bool inUse;

        AlignedBuffer (size_t size, size_t align=16) {
            real = new T[size+2*align];
            data = (T*)((uintptr_t)real + (align-((uintptr_t)real)%align));
+        inUse=true;
        }

        ~AlignedBuffer () {
@@ -38,4 +43,44 @@ template <class T> class AlignedBuffer {
        }
 };

+// Multi processor version, use with OpenMP
+template <class T> class AlignedBufferMP {
+private:
+    Glib::Mutex mtx;                        // guards buffers and every inUse flag
+    std::vector<AlignedBuffer<T>*> buffers; // all buffers created so far; owned by this pool
+    size_t size;                            // element count requested for each pooled buffer
+
+public:
+    // Creates an empty pool; buffers of sizeP elements are allocated lazily by acquire()
+    AlignedBufferMP(size_t sizeP) : size(sizeP) {}
+
+    // Frees every pooled buffer; callers must no longer hold any of them
+    ~AlignedBufferMP() {
+        for (size_t i=0;i<buffers.size();i++) delete buffers[i];
+    }
+
+    // Returns a buffer marked as in use; hand it back with release() when done
+    AlignedBuffer<T>* acquire() {
+        Glib::Mutex::Lock lock(mtx);
+
+        // Find available buffer (the index must start at 0; an uninitialized index is undefined behaviour)
+        for (size_t i=0;i<buffers.size();i++) {
+            if (!buffers[i]->inUse) {
+                buffers[i]->inUse=true;
+                return buffers[i];
+            }
+        }
+
+        // Add new buffer if nothing is free (the AlignedBuffer constructor already sets inUse=true)
+        AlignedBuffer<T>* buffer=new AlignedBuffer<T>(size);
+        buffers.push_back(buffer);
+        return buffer;
+    }
+
+    // Marks a buffer obtained from acquire() as free so that it can be reused
+    void release(AlignedBuffer<T>* buffer) {
+        Glib::Mutex::Lock lock(mtx);
+        buffer->inUse=false;
+    }
+};
 #endif
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@@ -26,45 +26,55 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include <windows.h>
+#include <stdio.h>

 // classical filtering if the support window is small:

-template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {
-
+template<class T> void gaussHorizontal3 (T** src, T** dst, AlignedBufferMP<double> &buffer, int W, int H, const float c0, const float c1) {

 #ifdef _OPENMP
 #pragma omp for
 #endif
    for (int i=0; i<H; i++) {
-    	T* temp = buffer;
+    	AlignedBuffer<double>* pBuf = buffer.acquire();
+        T* temp=(T*)pBuf->data;
+
        for (int j=1; j<W-1; j++)
            temp[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
        dst[i][0] = src[i][0];
        memcpy (dst[i]+1, temp+1, (W-2)*sizeof(T));
+
+        buffer.release(pBuf);
+
        dst[i][W-1] = src[i][W-1];
    }
 }

-template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {
+template<class T> void gaussVertical3 (T** src, T** dst, AlignedBufferMP<double> &buffer, int W, int H, const float c0, const float c1) {
    
-	//#pragma omp parallel for if (multiThread)
 #ifdef _OPENMP
 #pragma omp for
 #endif
    for (int i=0; i<W; i++) {
-    	T* temp = buffer;
+        AlignedBuffer<double>* pBuf = buffer.acquire();
+    	T* temp = (T*)pBuf->data;
+
        for (int j = 1; j<H-1; j++) 
        	temp[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
        dst[0][i] = src[0][i];
 	    for (int j=1; j<H-1; j++)
            dst[j][i] = temp[j];
+
+        buffer.release(pBuf);
+
        dst[H-1][i] = src[H-1][i];
    }
 }

 // fast gaussian approximation if the support window is large

-template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {
+template<class T> void gaussHorizontal (T** src, T** dst, AlignedBufferMP<double> &buffer, int W, int H, double sigma) {

    if (sigma<0.25) {
        // dont perform filtering
@@ -81,7 +91,7 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
+        gaussHorizontal3<T> (src, dst, buffer, W, H, c0, c1);
        return;
    }

@@ -113,10 +123,12 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
    for (int i=0; i<3; i++)
        for (int j=0; j<3; j++)
            M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3);
- //   if (multiThread)
+
 	#pragma omp for
    for (int i=0; i<H; i++) {
-        double* temp2 = buffer->data;
+        AlignedBuffer<double>* pBuf = buffer.acquire();
+        double* temp2 = pBuf->data;
+
        temp2[0] = B * src[i][0] + b1*src[i][0] + b2*src[i][0] + b3*src[i][0];
        temp2[1] = B * src[i][1] + b1*temp2[0]  + b2*src[i][0] + b3*src[i][0];
        temp2[2] = B * src[i][2] + b1*temp2[1]  + b2*temp2[0]  + b3*src[i][0];
@@ -136,10 +148,13 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
            temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3];
        for (int j=0; j<W; j++)
            dst[i][j] = (T)temp2[j];
-    }
+
+        buffer.release(pBuf);
 }

-template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {
+}
+
+template<class T> void gaussVertical (T** src, T** dst, AlignedBufferMP<double> &buffer, int W, int H, double sigma) {

    if (sigma<0.25) {
        // dont perform filtering
@@ -156,7 +171,7 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussVertical3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
+        gaussVertical3<T> (src, dst, buffer, W, H, c0, c1);
        return;
    }

@@ -192,7 +207,8 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
 #pragma omp for
 #endif
    for (int i=0; i<W; i++) {
-        double* temp2 = buffer->data;
+        AlignedBuffer<double>* pBuf = buffer.acquire();
+        double* temp2 = pBuf->data;
    	temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i];
        temp2[1] = B * src[1][i] + b1*temp2[0]  + b2*src[0][i] + b3*src[0][i];
        temp2[2] = B * src[2][i] + b1*temp2[1]  + b2*temp2[0]  + b3*src[0][i];
@@ -213,15 +229,9 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        
        for (int j=0; j<H; j++)
            dst[j][i] = (T)temp2[j];
+
+        buffer.release(pBuf);
    }
 }   

-/*
-void gaussHorizontal_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
-void gaussVertical_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
-void gaussHorizontal_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
-void gaussVertical_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
-void gaussHorizontal_float (float** src, float** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
-void gaussVertical_float (float** src, float** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
-*/
 #endif
--- a/rtengine/imagefloat.cc
+++ b/rtengine/imagefloat.cc
@@ -23,6 +23,7 @@
 #include "rtengine.h"
 #include "mytime.h"
 #include "iccstore.h"
+#include "alignedbuffer.h"

 using namespace rtengine;

@@ -271,22 +272,28 @@ void Imagefloat::calcCroppedHistogram(const ProcParams &params, float scale, LUT

 // Parallized transformation; create transform with cmsFLAGS_NOCACHE!
 void Imagefloat::ExecCMSTransform(cmsHTRANSFORM hTransform) {
+     
+    AlignedBufferMP<float> bufMP(width*3);
+
    // LittleCMS cannot parallize planar setups
    // so build temporary buffers to allow multi processor execution
        #pragma omp parallel for
    for (int y=0; y<height; y++) {
-        float buffer[width*3];
-        float *p=buffer, *pR=r[y], *pG=g[y], *pB=b[y];
+        AlignedBuffer<float>* pBuf=bufMP.acquire();
+
+        float *p=pBuf->data, *pR=r[y], *pG=g[y], *pB=b[y];

        for (int x=0; x<width; x++) {
            *(p++) = *(pR++); *(p++) = *(pG++); *(p++) = *(pB++);
    }

-        cmsDoTransform (hTransform, buffer, buffer, width);
+        cmsDoTransform (hTransform, pBuf->data, pBuf->data, width);

-        p=buffer; pR=r[y]; pG=g[y]; pB=b[y];
+        p=pBuf->data; pR=r[y]; pG=g[y]; pB=b[y];
        for (int x=0; x<width; x++) {
            *(pR++) = *(p++); *(pG++) = *(p++); *(pB++) = *(p++);
 }
+
+        bufMP.release(pBuf);
    }
 }
--- a/rtengine/impulse_denoise.h
+++ b/rtengine/impulse_denoise.h
@@ -62,14 +62,15 @@ void ImProcFunctions::impulse_nr (LabImage* lab, double thresh) {
 	int i1, j1;	
 	
 	//rangeblur<unsigned short, unsigned int> (lab->L, lpf, impish /*used as buffer here*/, width, height, thresh, false);
+    #ifdef _OPENMP
+    #pragma omp parallel
+    #endif
+    {
+		AlignedBufferMP<double> buffer(max(width,height));
 	
-	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (max(width,height));
-
-	gaussHorizontal<float> (lab->L, lpf, buffer, width, height, max(2.0,thresh-1.0), false /*multiThread*/);
-	gaussVertical<float>   (lpf, lpf, buffer, width, height, max(2.0,thresh-1.0), false);
-
-	delete buffer;
-
+	    gaussHorizontal<float> (lab->L, lpf, buffer, width, height, max(2.0,thresh-1.0));
+	    gaussVertical<float>   (lpf, lpf, buffer, width, height, max(2.0,thresh-1.0));
+    }
 	
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@@ -26,6 +26,7 @@
 #include "../rtgui/options.h"
 #include "settings.h"
 #include "curves.h"
+#include "alignedbuffer.h"


 #ifdef _OPENMP
@@ -49,12 +50,14 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image) {
 	//gamutmap(lab);

 	if (monitorTransform) {
+        AlignedBufferMP<unsigned short> bufferMP(3*lab->W);
        
        // cmsDoTransform is relatively expensive
        #pragma omp parallel for
 		for (int i=0; i<lab->H; i++) {
            // pre-conversion to integer, since the output is 8 bit anyway, but LCMS is MUCH faster not converting from float
-            unsigned short buffer[3*lab->W];
+            AlignedBuffer<unsigned short>* pBuf=bufferMP.acquire();
+            unsigned short * buffer=pBuf->data;

            const int ix = i * 3 * lab->W;
            int iy = 0;
@@ -81,6 +84,8 @@ void ImProcFunctions::lab2monitorRgb (LabImage* lab, Image8* image) {
 			}

            cmsDoTransform (monitorTransform, buffer, image->data + ix, lab->W);
+
+            bufferMP.release(pBuf);
 		}
        
 	} else {
--- a/rtengine/ipsharpen.cc
+++ b/rtengine/ipsharpen.cc
@@ -80,15 +80,15 @@ void ImProcFunctions::deconvsharpening (LabImage* lab, float** b2) {
 #pragma omp parallel
 #endif
 	{
+        AlignedBufferMP<double> buffer(max(W,H));

-	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (max(W,H));
 	float damping = params->sharpening.deconvdamping / 5.0;
 	bool needdamp = params->sharpening.deconvdamping > 0;
 	for (int k=0; k<params->sharpening.deconviter; k++) {

 		// apply blur function (gaussian blur)
-		gaussHorizontal<float> (tmpI, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
-		gaussVertical<float>   (tmp, tmp,  buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+            gaussHorizontal<float> (tmpI, tmp, buffer, W, H, params->sharpening.deconvradius / scale);
+            gaussVertical<float>   (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale);

 		if (!needdamp) {
 #ifdef _OPENMP
@@ -102,8 +102,8 @@ void ImProcFunctions::deconvsharpening (LabImage* lab, float** b2) {
 		else
 			dcdamping (tmp, lab->L, damping, W, H);

-		gaussHorizontal<float> (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
-		gaussVertical<float>   (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+            gaussHorizontal<float> (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale);
+            gaussVertical<float>   (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale);

 #ifdef _OPENMP
 #pragma omp for
@@ -112,7 +112,6 @@ void ImProcFunctions::deconvsharpening (LabImage* lab, float** b2) {
 			for (int j=0; j<W; j++)
 				tmpI[i][j] = tmpI[i][j] * tmp[i][j];
 	} // end for
-	delete buffer;

 	float p2 = params->sharpening.deconvamount / 100.0;
 	float p1 = 1.0 - p2;
@@ -155,18 +154,17 @@ void ImProcFunctions::sharpening (LabImage* lab, float** b2) {
 	{


-	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (max(W,H));
+    AlignedBufferMP<double> buffer(max(W,H));
 	if (params->sharpening.edgesonly==false) {

-		gaussHorizontal<float> (lab->L, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
-		gaussVertical<float>   (b2,     b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+		gaussHorizontal<float> (lab->L, b2, buffer, W, H, params->sharpening.radius / scale);
+		gaussVertical<float>   (b2,     b2, buffer, W, H, params->sharpening.radius / scale);
 	}
 	else {
 		bilateral<float, float> (lab->L, (float**)b3, b2, W, H, params->sharpening.edges_radius / scale, params->sharpening.edges_tolerance, multiThread);
-		gaussHorizontal<float> (b3, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
-		gaussVertical<float>   (b2, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+		gaussHorizontal<float> (b3, b2, buffer, W, H, params->sharpening.radius / scale);
+		gaussVertical<float>   (b2, b2, buffer, W, H, params->sharpening.radius / scale);
 	}
-	delete buffer;

 	float** base = lab->L;
 	if (params->sharpening.edgesonly)
--- a/rtengine/shmap.cc
+++ b/rtengine/shmap.cc
@@ -44,23 +44,22 @@ SHMap::~SHMap () {
 }

 void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip) {
-
-    // fill with luminance
-    #pragma omp parallel for
-    for (int i=0; i<H; i++)
-        for (int j=0; j<W; j++) {
-            map[i][j] = lumi[0]*std::max(img->r[i][j],0.f) + lumi[1]*std::max(img->g[i][j],0.f) + lumi[2]*std::max(img->b[i][j],0.f);
-		}
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
    {
-    if (!hq) {
-    	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (max(W,H));
-    	gaussHorizontal<float> (map, map, buffer, W, H, radius, multiThread);
-		gaussVertical<float>   (map, map, buffer, W, H, radius, multiThread);
+    // fill with luminance
+    #pragma omp for
+    for (int i=0; i<H; i++)
+        for (int j=0; j<W; j++) {
+            map[i][j] = lumi[0]*std::max(img->r[i][j],0.f) + lumi[1]*std::max(img->g[i][j],0.f) + lumi[2]*std::max(img->b[i][j],0.f);
+		}

-        delete buffer;
+    if (!hq) {
+        AlignedBufferMP<double>* pBuffer = new AlignedBufferMP<double> (max(W,H));
+    	gaussHorizontal<float> (map, map, *pBuffer, W, H, radius);
+		gaussVertical<float>   (map, map, *pBuffer, W, H, radius);
+        delete pBuffer;
    }
    else {
 /*		
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -54,7 +54,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    }
    procparams::ProcParams& params = job->pparams;

-    // aquire image from imagesource
+    // acquire image from imagesource
    ImageSource* imgsrc = ii->getImageSource ();

    int tr = TR_NONE;