ImProcFunctions cleanup and transition to OpenMP -- phase 1

2010-05-21 16:14:18 +02:00 · 2010-05-21 16:14:18 +02:00 · cd91e7890e
commit cd91e7890e
parent c427279ce3
19 changed files with 1944 additions and 2856 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -116,6 +116,11 @@ if (WITH_RAWZOR)
    endif (WIN32)
 endif (WITH_RAWZOR)
 find_package(OpenMP)
 if (OPENMP_FOUND)
 	set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS})
 endif (OPENMP_FOUND)
 add_subdirectory (rtexif)
 add_subdirectory (rtengine)
 add_subdirectory (rtgui)
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@ -6,11 +6,12 @@ link_directories (${CMAKE_CURRENT_SOURCE_DIR}/../rtexif ${EXTRA_LIBDIR} ${GTHREA
    ${GOBJECT_LIBRARY_DIRS} ${GLIB2_LIBRARY_DIRS} ${GLIBMM_LIBRARY_DIRS}
    ${IPTCDATA_LIBRARY_DIRS} ${LCMS_LIBRARY_DIRS})
-set (RTENGINESOURCEFILES colortemp.cc curves.cc dcraw.cc gauss.cc iccstore.cc
+set (RTENGINESOURCEFILES colortemp.cc curves.cc dcraw.cc iccstore.cc
    image8.cc image16.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc
    loadinitial.cc procparams.cc rawimagesource.cc shmap.cc simpleprocess.cc refreshmap.cc
    stdimagesource.cc myfile.cc iccjpeg.c hlmultipliers.cc improccoordinator.cc
-    processingjob.cc rtthumbnail.cc utils.cc bilateral2.cc)
+    processingjob.cc rtthumbnail.cc utils.cc labimage.cc
    iplab2rgb.cc ipsharpen.cc iptransform.cc ipresize.cc)
 add_library (rtengine ${RTENGINESOURCEFILES})
 #It may be nice to store library version too
--- a/rtengine/bilateral2.h
+++ b/rtengine/bilateral2.h
@ -26,6 +26,9 @@
 #include <mytime.h>
 #include <gauss.h>
 #include <glibmm.h>
 #include <omp.h>
 // This seems ugly, but way faster than any other solutions I tried
 #define ELEM(a,b) (src[i - a][j - b] * ec[src[i - a][j - b]-src[i][j]+0x10000])
 #define SULY(a,b) (ec[src[i - a][j - b]-src[i][j]+0x10000])
@ -34,22 +37,22 @@
                        int* ec = new int [0x20000]; \
                        for (int i=0; i<0x20000; i++) \
                            ec[i] = (int)(exp(-(double)(i-0x10000)*(double)(i-0x10000) / (2.0*sens*sens))*scale); \
-                        int start = row_from; \
+                        int rstart = b; \
-                        if (start<(b)) start = (b); \
+                        int rend = H-b; \
-                        int end = row_to; \
+                        int cstart = b; \
-                        if (end>H-(b)) end = H-(b); \
+                        int cend = W-b;
                        for (int i=start; i<end; i++) { \
                            for (int j=(b); j<W-(b); j++) {
 #define BL_END(b)       buffer[i][j] = v; }} delete [] ec; \
-                        for (int i=row_from; i<row_to; i++)  \
+                        for (int i=0; i<H; i++)  \
                            for (int j=0; j<W; j++)  \
-                                if (i<start || j<(b) || i>=end || j>=W-(b)) \
+                                if (i<rstart || j<cstart || i>=rend || j>=cend) \
                                    dst[i][j] = src[i][j]; \
                                else \
                                    dst[i][j] = buffer[i][j];
-#define BL_OPER3(a11,a12,a21,a22) A v = a11*ELEM(-1,-1) + a12*ELEM(-1,0) + a11*ELEM(-1,1) + \
+#define BL_OPER3(a11,a12,a21,a22) for (int i=rstart; i<rend; i++) { \
 									for (int j=cstart; j<cend; j++) { \
 								   A v = a11*ELEM(-1,-1) + a12*ELEM(-1,0) + a11*ELEM(-1,1) + \
                                        a21*ELEM(0,-1) + a22*ELEM(0,0) + a21*ELEM(0,1) + \
                                        a11*ELEM(1,-1) + a12*ELEM(1,0) + a11*ELEM(1,1); \
                                   v /= a11*SULY(-1,-1) + a12*SULY(-1,0) + a11*SULY(-1,1) + \
@ -57,7 +60,9 @@
                                        a11*SULY(1,-1) + a12*SULY(1,0) + a11*SULY(1,1); 
-#define BL_OPER5(a11,a12,a13,a21,a22,a23,a31,a32,a33) A v = a11*ELEM(-2,-2) + a12*ELEM(-2,-1) + a13*ELEM(-2,0) + a12*ELEM(-2,1) + a11*ELEM(-2,2) + \
+#define BL_OPER5(a11,a12,a13,a21,a22,a23,a31,a32,a33) for (int i=rstart; i<rend; i++) { \
 														for (int j=cstart; j<cend; j++) { \
 													   A v = a11*ELEM(-2,-2) + a12*ELEM(-2,-1) + a13*ELEM(-2,0) + a12*ELEM(-2,1) + a11*ELEM(-2,2) + \
                                                            a21*ELEM(-1,-2) + a22*ELEM(-1,-1) + a23*ELEM(-1,0) + a22*ELEM(-1,1) + a21*ELEM(-1,2) + \
                                                            a31*ELEM(0,-2) + a32*ELEM(0,-1) + a33*ELEM(0,0) + a32*ELEM(0,1) + a31*ELEM(0,2) + \
                                                            a21*ELEM(1,-2) + a22*ELEM(1,-1) + a23*ELEM(1,0) + a22*ELEM(1,1) + a21*ELEM(1,2) + \
@ -69,7 +74,9 @@
                                                            a11*SULY(2,-2) + a12*SULY(2,-1) + a13*SULY(2,0) + a12*SULY(2,1) + a11*SULY(2,2);
 #define BL_OPER7(a11,a12,a13,a14,a21,a22,a23,a24,a31,a32,a33,a34,a41,a42,a43,a44) \
-                                                      A v = a11*ELEM(-3,-3) + a12*ELEM(-3,-2) + a13*ELEM(-3,-1) + a14*ELEM(-3,0) + a13*ELEM(-3,1) + a12*ELEM(-3,2) + a11*ELEM(-3,3) + \
+                                                      for (int i=rstart; i<rend; i++) { \
                                                    	  for (int j=cstart; j<cend; j++) { \
 													   A v = a11*ELEM(-3,-3) + a12*ELEM(-3,-2) + a13*ELEM(-3,-1) + a14*ELEM(-3,0) + a13*ELEM(-3,1) + a12*ELEM(-3,2) + a11*ELEM(-3,3) + \
                                                            a21*ELEM(-2,-3) + a22*ELEM(-2,-2) + a23*ELEM(-2,-1) + a24*ELEM(-2,0) + a23*ELEM(-2,1) + a22*ELEM(-2,2) + a21*ELEM(-2,3) + \
                                                            a31*ELEM(-1,-3) + a32*ELEM(-1,-2) + a33*ELEM(-1,-1) + a34*ELEM(-1,0) + a33*ELEM(-1,1) + a32*ELEM(-1,2) + a31*ELEM(-1,3) + \
                                                            a41*ELEM(0,-3) + a42*ELEM(0,-2) + a43*ELEM(0,-1) + a44*ELEM(0,0) + a43*ELEM(0,1) + a42*ELEM(0,2) + a41*ELEM(0,3) + \
@ -86,6 +93,8 @@
 #define BL_OPER9(a11,a12,a13,a14,a15,a21,a22,a23,a24,a25,a31,a32,a33,a34,a35,a41,a42,a43,a44,a45,a51,a52,a53,a54,a55) \
                                                      for (int i=rstart; i<rend; i++) { \
                                                    	  for (int j=cstart; j<cend; j++) { \
                                                      A v = a11*ELEM(-4,-4) + a12*ELEM(-4,-3) + a13*ELEM(-4,-2) + a14*ELEM(-4,-1) + a15*ELEM(-4,0) + a14*ELEM(-4,1) + a13*ELEM(-4,2) + a12*ELEM(-4,3) + a11*ELEM(-4,4) + \
                                                            a21*ELEM(-3,-4) + a22*ELEM(-3,-3) + a23*ELEM(-3,-2) + a24*ELEM(-3,-1) + a25*ELEM(-3,0) + a24*ELEM(-3,1) + a23*ELEM(-3,2) + a22*ELEM(-3,3) + a21*ELEM(-3,4) + \
                                                            a31*ELEM(-2,-4) + a32*ELEM(-2,-3) + a33*ELEM(-2,-2) + a34*ELEM(-2,-1) + a35*ELEM(-2,0) + a34*ELEM(-2,1) + a33*ELEM(-2,2) + a32*ELEM(-2,3) + a31*ELEM(-2,4) + \
@ -106,6 +115,8 @@
                                                            a11*SULY(4,-4) + a12*SULY(4,-3) + a13*SULY(4,-2) + a14*SULY(4,-1) + a15*SULY(4,0) + a14*SULY(4,1) + a13*SULY(4,2) + a12*SULY(4,3) + a11*SULY(4,4); 
 #define BL_OPER11(a11,a12,a13,a14,a15,a16,a21,a22,a23,a24,a25,a26,a31,a32,a33,a34,a35,a36,a41,a42,a43,a44,a45,a46,a51,a52,a53,a54,a55,a56,a61,a62,a63,a64,a65,a66) \
                                                      for (int i=rstart; i<rend; i++) { \
 														for (int j=cstart; j<cend; j++) { \
                                                      A v = a11*ELEM(-5,-5) + a12*ELEM(-5,-4) + a13*ELEM(-5,-3) + a14*ELEM(-5,-2) + a15*ELEM(-5,-1) + a16*ELEM(-5,0) + a15*ELEM(-5,1) + a14*ELEM(-5,2) + a13*ELEM(-5,3) + a12*ELEM(-5,4) + a11*ELEM(-5,5) + \
                                                            a21*ELEM(-4,-5) + a22*ELEM(-4,-4) + a23*ELEM(-4,-3) + a24*ELEM(-4,-2) + a25*ELEM(-4,-1) + a26*ELEM(-4,0) + a25*ELEM(-4,1) + a24*ELEM(-4,2) + a23*ELEM(-4,3) + a22*ELEM(-4,4) + a21*ELEM(-4,5) + \
                                                            a31*ELEM(-3,-5) + a32*ELEM(-3,-4) + a33*ELEM(-3,-3) + a34*ELEM(-3,-2) + a35*ELEM(-3,-1) + a36*ELEM(-3,0) + a35*ELEM(-3,1) + a34*ELEM(-3,2) + a33*ELEM(-3,3) + a32*ELEM(-3,4) + a31*ELEM(-3,5) + \
@ -131,452 +142,256 @@
 // sigma = 0.5
-template<class T, class A> void bilateral05 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral05 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(318,1)
 	#pragma omp parallel for if (multiThread)
    BL_OPER3(1,7,7,55)
    BL_END(1)
 }
 // sigma = 0.6
-template<class T, class A> void bilateral06 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral06 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(768,1)
 	#pragma omp parallel for if (multiThread)
    BL_OPER3(1,4,4,16)
    BL_END(1)
 }
 // sigma = 0.7
-template<class T, class A> void bilateral07 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral07 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(366,2)
 	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,0,1,0,8,21,1,21,59)
    BL_END(2)
 }
 // sigma = 0.8
-template<class T, class A> void bilateral08 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral08 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(753,2)
 	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,0,1,0,5,10,1,10,23)
    BL_END(2)
 }
 // sigma = 0.9
-template<class T, class A> void bilateral09 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral09 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(595,2)
 	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,1,2,1,6,12,2,12,22)
    BL_END(2)
 }
 // sigma = 1.0
-template<class T, class A> void bilateral10 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral10 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(910,2)
 	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,1,2,1,4,7,2,7,12)
    BL_END(2)
 }
 // sigma = 1.1
-template<class T, class A> void bilateral11 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral11 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(209,3)
 	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,2,5,8,1,5,18,27,1,8,27,41)
    BL_END(3)
 }
 // sigma = 1.2
-template<class T, class A> void bilateral12 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral12 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(322,3)
 	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,1,4,6,1,4,11,16,1,6,16,23)
    BL_END(3)
 }
 // sigma = 1.3
-template<class T, class A> void bilateral13 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral13 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(336,3)
 	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,2,4,6,1,4,11,14,1,6,14,19)
    BL_END(3)
 }
 // sigma = 1.4
-template<class T, class A> void bilateral14 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral14 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(195,3)
 	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,1,2,3,1,4,8,10,2,8,17,21,3,10,21,28)
    BL_END(3)   
 }
 // sigma = 1.5
-template<class T, class A> void bilateral15 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral15 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(132,4)
 	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,0,1,1,0,1,2,4,5,0,2,6,12,14,1,4,12,22,28,1,5,14,28,35)
    BL_END(4)   
 }
 // sigma = 1.6
-template<class T, class A> void bilateral16 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral16 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(180,4)
 	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,0,1,1,0,1,2,3,4,0,2,5,9,10,1,3,9,15,19,1,4,10,19,23)
    BL_END(4)   
 }
 // sigma = 1.7
-template<class T, class A> void bilateral17 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral17 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(195,4)
 	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,1,1,0,1,2,3,4,1,2,5,8,9,1,3,8,13,16,1,4,9,16,19)
    BL_END(4)   
 }
 // sigma = 1.8
-template<class T, class A> void bilateral18 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral18 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(151,4)
 	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,2,2,0,1,3,5,5,1,3,6,10,12,2,5,10,16,19,2,5,12,19,22)
    BL_END(4)   
 }
 // sigma = 1.9
-template<class T, class A> void bilateral19 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral19 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(151,4)
 	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,2,2,0,1,3,4,5,1,3,5,8,9,2,4,8,12,14,2,5,9,14,16)
    BL_END(4)   
 }
 // sigma = 2
-template<class T, class A> void bilateral20 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral20 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(116,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,1,0,0,1,2,3,3,0,1,2,4,7,7,1,2,4,8,12,14,1,3,7,12,18,20,1,3,7,14,20,23)
    BL_END(5)   
 }
 // sigma = 2.1
-template<class T, class A> void bilateral21 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral21 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(127,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,1,0,0,1,2,3,3,0,1,2,4,6,7,1,2,4,8,11,12,1,3,6,11,15,17,1,3,7,12,17,19)
    BL_END(5)
 }
 // sigma = 2.2
-template<class T, class A> void bilateral22 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral22 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(109,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,2,0,1,2,3,3,4,1,2,3,5,7,8,1,3,5,9,12,13,1,3,7,12,16,18,2,4,8,13,18,20)
    BL_END(5)
 }
 // sigma = 2.3
-template<class T, class A> void bilateral23 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral23 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(132,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,3,5,6,7,1,2,5,7,10,11,1,3,6,10,13,14,1,3,7,11,14,16)
    BL_END(5)
 }
 // sigma = 2.4
-template<class T, class A> void bilateral24 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral24 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    BL_BEGIN(156,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,3,4,5,6,1,2,4,6,8,9,1,3,5,8,10,11,1,3,6,9,11,12)
    BL_END(5)
 }
 // sigma = 2.5
-template<class T, class A> void bilateral25 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral25 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
-    BL_BEGIN(173,5)
+	BL_BEGIN(173,5)
 	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,2,4,5,5,1,2,4,5,7,7,1,3,5,7,9,9,1,3,5,7,9,10)
    BL_END(5)
 }
 class Dim {
    public:
        int W, H, row_from, row_to;
        Dim (int w, int h, int rf, int rt) : W(w), H(h), row_from(rf), row_to(rt) {}
 };
 // main bilateral filter
-template<class T, class A> void bilateral (T** src, T** dst, T** buffer, Dim dim, double sigma, double sens) {
+template<class T, class A> void bilateral (T** src, T** dst, T** buffer, int W, int H, double sigma, double sens, bool multiThread) {
    int W = dim.W;
    int H = dim.H;
    int row_from = dim.row_from;
    int row_to = dim.row_to;
    if (sigma<0.45) 
-        for (int i=row_from; i<row_to; i++) {
+		#pragma omp parallel for if (multiThread)
    	for (int i=0; i<H; i++) {
            memcpy (buffer[i], src[i], W*sizeof(T));
            memcpy (dst[i], buffer[i], W*sizeof(T));
        }
    else if (sigma<0.55)
-        bilateral05<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral05<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.65)
-        bilateral06<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral06<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.75)
-        bilateral07<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral07<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.85)
-        bilateral08<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral08<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.95)
-        bilateral09<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral09<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.05)
-        bilateral10<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral10<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.15)
-        bilateral11<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral11<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.25)
-        bilateral12<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral12<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.35)
-        bilateral13<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral13<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.45)
-        bilateral14<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral14<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.55)
-        bilateral15<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral15<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.65)
-        bilateral16<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral16<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.75)
-        bilateral17<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral17<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.85)
-        bilateral18<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral18<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.95)
-        bilateral19<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral19<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.05)
-        bilateral20<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral20<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.15)
-        bilateral21<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral21<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.25)
-        bilateral22<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral22<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.35)
-        bilateral23<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral23<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.45)
-        bilateral24<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral24<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else 
-        bilateral25<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral25<T,A> (src, dst, buffer, W, H, sens, multiThread);
 }
-void bilateral_unsigned (unsigned short** src, unsigned short** dst, unsigned short** buffer, Dim dim, double sigma, double sens);
+// START OF EXPERIMENTAL CODE: O(1) bilateral box filter
-void bilateral_signed (short** src, short** dst, short** buffer, Dim dim, double sigma, double sens);
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 /*
 template<class T> void bilateral (T** src, int** dst, int W, int H, int sigmar, double sigmas) {
  time_t t1 = clock ();
  int r = 2.0*sigmas + 0.5;
  double scaleg = 1024.0;
  double MAXINT = 65536.0*65536.0;
  double sgmax = 275000/(MAXINT/2.0/sigmar/sigmar+(r+1)*(r+1)/sigmas/sigmas);
  printf ("sgmax = %lf\n", sgmax);
  if (scaleg>sgmax)
    scaleg = sgmax;
  // kernel vector for the spatial gaussian filter * 1024
  int* kernel = new int[1+2*r];
  for (int i=0; i<2*r+1; i++) {
    kernel[i] = scaleg / (2.0*sigmas*sigmas) * (i-r)*(i-r) + 0.25;
    printf ("kerneli = %d\n", kernel[i]);
  }
  // exponential lookup table
  int scale = (2.0*sigmar*sigmar) / scaleg;
  int scalem = 65535/((1+2*r)*(1+2*r));
  int ec[256000];
  for (int i=0; i<256000; i++)
    ec[i] = exp (-i/scaleg) * scalem;
  for (int i=r; i<H-r; i++) {
    for (int j=r; j<W-r; j++) {
      int sum = 0.0;
      int val = 0.0;
      int c = src[i][j];
      for (int x=-r; x<=r; x++)
        for (int y=-r; y<=r; y++) {
          unsigned int d = src[i-x][j-y];
          unsigned int mul = ec[(c-d)*(c-d)/scale + kernel[x+r] + kernel[y+r]];
 //          if (mul<275000) {
            val += d*mul;
            sum += mul;
 //          }
 //          else {
 //            printf ("out!!!\n");
 //          }
        }
      dst[i][j] = val / sum;
    }
  }
  delete [] kernel;
  time_t t2 = clock ();
  printf ("bilateral: %d\n", t2-t1);
 }
 */
 #define MAXVAL  0xffff
 #define CLIP(a) ((a)>0?((a)<MAXVAL?(a):MAXVAL):0)
 /*
 template<class T> void bilateral (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int H, double sigmar, double sigmas) {
  time_t t1 = clock ();
    float alpha = 0.5 / sigmar / sigmar;
    // buffer for the final image
    float** buff_final = new float*[H];
    for (int i=0; i<H; i++) {
        buff_final[i] = new float[W];
        memset (buff_final[i], 0, W*sizeof(float));
    }
    // buffer for the normalization 
    float** buff_norm = new float*[H];
    for (int i=0; i<H; i++) {
        buff_norm[i] = new float[W];
        for (int j=0; j<W; j++) 
            buff_norm[i][j] = 1.0;
    }
    // temporary buffer
    float** buff_temp = new float*[H];
    for (int i=0; i<H; i++) 
        buff_temp[i] = new float[W];
    // second order gaussian filter approximation
    // first filtered image: temp <-- y_1
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = src[i][j];
 printf ("1: %g\n", buff_temp[100][100]);
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
 printf ("2: %g\n", buff_temp[100][100]);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] += 2.0*alpha*src[i][j]*buff_temp[i][j]; 
            buff_final[i][j] += buff_temp[i][j];
        }
 printf ("3: %g\n", buff_norm[100][100]);
 printf ("4: %g\n", buff_final[100][100]);
    // second filtered image: temp <-- y_2
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] += alpha*(2.0*alpha*src[i][j]*src[i][j]-1.0)*buff_temp[i][j]; 
            buff_final[i][j] += 2.0*alpha*src[i][j]*buff_temp[i][j];
        }
 printf ("5: %g\n", buff_norm[100][100]);
 printf ("6: %g\n", buff_final[100][100]);
    // third filtered image: temp <-- y_3
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] -= 2.0*alpha*alpha*src[i][j]*(1.0-2.0/3.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
            buff_final[i][j] += alpha*(2.0*alpha*src[i][j]*src[i][j]-1.0)*buff_temp[i][j];
        }
    // fourth filtered image: temp <-- y_4
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] += alpha*alpha*(0.5-2.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
            buff_final[i][j] -= 2.0*alpha*alpha*src[i][j]*(1.0-2.0/3.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
        }
    // fifth filtered image: temp <-- y_5
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] += alpha*alpha*alpha*src[i][j]*buff_temp[i][j];
            buff_final[i][j] += alpha*alpha*(0.5-2.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
        }
    // sixth filtered image: temp <-- y_6
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_norm[i][j] -= alpha*alpha*alpha/6.0*buff_temp[i][j]; 
            buff_final[i][j] += alpha*alpha*alpha*src[i][j]*buff_temp[i][j];
        }
    // seventh filtered image: temp <-- y_7, and finalizing
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++)
            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) {
            buff_final[i][j] = (buff_final[i][j] - alpha*alpha*alpha/6.0*buff_temp[i][j]) / buff_norm[i][j];
            dst[i][j] = (T)CLIP(buff_final[i][j]);
        }
    // cleanup
    for (int i=0; i<H; i++) {
        delete [] buff_temp[i];
        delete [] buff_norm[i];
        delete [] buff_final[i];
    }
     delete [] buff_temp;   
     delete [] buff_norm;   
     delete [] buff_final;   
  time_t t2 = clock ();
  printf ("bilateral: %d\n", t2-t1);
 }
 */
 #define BINBIT 12
 #define TRANSBIT 4
 template<class T> void bilateral (T** src, T** dst, int W, int H, int sigmar, double sigmas, int row_from, int row_to) {
    // range weights
@ -654,95 +469,4 @@ template<class T> void bilateral (T** src, T** dst, int W, int H, int sigmar, do
        delete [] buff_final[i];
 }
 struct bilateralparams {
    int row_from, row_to;
 };
 void bilateral_box_unsigned (unsigned short** src, unsigned short** dst, int W, int H, int sigmar, double sigmas, bilateralparams row);
 /*
 #define BINBIT 12
 #define TRANSBIT 4
 #define ADDHIST(a,b)  { for (int k=0; k<(1<<BINBIT); k++) (a)[k] = (a)[k]+(b)[k]; }
 #define SUBHIST(a,b)  { for (int k=0; k<(1<<BINBIT); k++) (a)[k] = (a)[k]-(b)[k]; }
 template<class T> void bilateral (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int H, int sigmar, double sigmas) {
    time_t t1 = clock (); 
    unsigned short** rowhist = new unsigned short* [H];
    for (int i=0; i<H; i++) {
        rowhist[i] = new unsigned short [1<<BINBIT];
        memset (rowhist[i], 0, (1<<BINBIT)*sizeof(unsigned short));
    }
    unsigned short* hist = new unsigned short[1<<BINBIT];
    int r = sigmas;
    // buffer for the final image
    float** buff_final = new float*[H];
    for (int i=0; i<H; i++) {
        buff_final[i] = new float[W];
        memset (buff_final[i], 0, W*sizeof(float));
    }
    // fill the row histograms initially
    for (int i=0; i<H; i++) 
        for (int j=0; j<2*r+1; j++)
            rowhist[i][src[i][j]>>TRANSBIT]++;
    // let the game begin...
    for (int j=r+1; j<W-r-1; j++) {
        // update row histograms
        for (int i=0; i<H; i++) {
            rowhist[i][src[i][j-r-1]>>TRANSBIT]--;
            rowhist[i][src[i][j+r]>>TRANSBIT]++;
        }
        // sum up upper histograms
        memset (hist, 0, (1<<BINBIT)*sizeof(unsigned short));
        for (int i=0; i<2*r+1; i++)
            ADDHIST(hist, rowhist[i]);
        for (int i=r+1; i<H-r-1; i++) {
            SUBHIST (hist, rowhist[i-r-1]);
            ADDHIST (hist, rowhist[i+r]);
            // calculate pixel value
            float weight = 0.0;
            for (int k=0; k<=(sigmar>>TRANSBIT); k++) {               
                float w = 1.0 - (double)k/(sigmar>>TRANSBIT);
                int v = src[i][j]>>TRANSBIT;
                if (v-k >= 0) {
                    weight += hist [v-k] * w;
                    buff_final[i][j] += hist [v-k] * w * (src[i][j]-(k<<TRANSBIT));
                }
                if (v+k < (1<<BINBIT)) {
                    weight += hist [v+k] * w;
                    buff_final[i][j] += hist [v+k] * w * (src[i][j]+(k<<TRANSBIT));
                }
            }
            buff_final[i][j] /= weight;
        }
    }
    for (int i=0; i<H; i++) 
        for (int j=0; j<W; j++) 
            if (i<r+1 || i>H-r-1 || j<r+1 || j>W-r-1)
                dst[i][j] = src[i][j];
            else
                dst[i][j] = (T)CLIP(buff_final[i][j]);
    delete [] hist;
    for (int i=0; i<H; i++) {
        delete [] buff_final[i];
        delete [] rowhist[i];
    }
    delete [] buff_final;
    delete [] rowhist;
    time_t t2 = clock ();
    printf ("bilateral: %d\n", t2-t1);
 }
 */
 #endif
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@ -74,6 +74,8 @@ void Crop::update (int todo, bool internal) {
    MyTime t1,t2,t3,t4,t5,t6,t7,t8,t9;
    parent->ipf.setScale (skip);
    t1.set ();
    // give possibility to the listener to modify crop window (as the full image dimensions are already known at this point)
    int wx, wy, ww, wh, ws;
@ -130,7 +132,7 @@ void Crop::update (int todo, bool internal) {
            }
            if (!resizeCrop)
                resizeCrop = new Image16 (rcw, rch);
-            parent->ipf.resize (origCrop, resizeCrop, params.resize);
+            parent->ipf.resize (origCrop, resizeCrop);
            baseCrop = resizeCrop;
        }
        parent->minit.unlock ();
@ -173,7 +175,7 @@ void Crop::update (int todo, bool internal) {
    if (settings->verbose) printf ("C-BLURMAP: %d\n", t4.etime(t3));
    if (todo & M_RGBCURVE) {
-        parent->ipf.rgbProc (baseCrop, laboCrop, &params, parent->tonecurve, cshmap);
+        parent->ipf.rgbProc (baseCrop, laboCrop, parent->tonecurve, cshmap);
    }
    t5.set ();
@ -182,8 +184,8 @@ void Crop::update (int todo, bool internal) {
    if (todo & M_LUMINANCE) {
        parent->ipf.luminanceCurve (laboCrop, labnCrop, parent->lumacurve, 0, croph);
        if (skip==1) {
-            parent->ipf.lumadenoise (labnCrop, &params, 1, cbuffer);
+            parent->ipf.lumadenoise (labnCrop, cbuffer);
-            parent->ipf.sharpening (labnCrop, &params, 1, (unsigned short**)cbuffer);
+            parent->ipf.sharpening (labnCrop, (unsigned short**)cbuffer);
        }
    }
@ -191,9 +193,9 @@ void Crop::update (int todo, bool internal) {
    if (settings->verbose) printf ("C-LUMINANCE: %d\n", t6.etime(t5));
    if (todo & M_COLOR) {
-        parent->ipf.colorCurve (laboCrop, labnCrop, &params);
+        parent->ipf.colorCurve (laboCrop, labnCrop);
        if (skip==1)
-            parent->ipf.colordenoise (labnCrop, &params, 1, cbuffer);
+            parent->ipf.colordenoise (labnCrop, cbuffer);
    }
    t7.set ();
@ -290,7 +292,7 @@ if (settings->verbose) printf ("setcropsizes before lock\n");
    // determine which part of the source image is required to compute the crop rectangle
    int orx, ory, orw, orh;
    ProcParams& params = parent->params;
-    parent->ipf.transCoord (&params, parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
+    ImProcFunctions::transCoord (&params, parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
@ -324,7 +326,7 @@ if (settings->verbose) printf ("setcropsizes before lock\n");
        laboCrop = new LabImage (cropw, croph);    
        labnCrop = new LabImage (cropw, croph);    
        cropImg = new Image8 (cropw, croph);
-        cshmap = new SHMap (cropw, croph);
+        cshmap = new SHMap (cropw, croph, true);
        cbuffer = new int*[croph];
        for (int i=0; i<croph; i++)
--- a/rtengine/gauss.cc
+++ b/rtengine/gauss.cc
@ -1,49 +0,0 @@
 /*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 * 
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <gauss.h>
 void gaussHorizontal_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
    gaussHorizontal<unsigned short> (src, dst, buffer, W, row_from, row_to, sigma);
 }
 void gaussVertical_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
    gaussVertical<unsigned short> (src, dst, buffer, H, col_from, col_to, sigma);
 }
 void gaussHorizontal_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
    gaussHorizontal<short> (src, dst, buffer, W, row_from, row_to, sigma);
 }
 void gaussVertical_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
    gaussVertical<short> (src, dst, buffer, H, col_from, col_to, sigma);
 }
 void gaussHorizontal_float (float** src, float** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
    gaussHorizontal<float> (src, dst, buffer, W, row_from, row_to, sigma);
 }
 void gaussVertical_float (float** src, float** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
    gaussVertical<float> (src, dst, buffer, H, col_from, col_to, sigma);
 }
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@ -23,453 +23,45 @@
 #include <string.h> 
 #include <math.h>
 #include <alignedbuffer.h>
 #include <omp.h>
-#define NOSSE 1
+// classical filtering if the support window is small:
-#ifndef NOSSE
+template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {
 #include <xmmintrin.h>
 template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const float c0, const float c1) {
    typedef float pfloat[4];
    pfloat* temp = (pfloat*)buffer + 1;
    pfloat* tmp = (pfloat*)buffer;
-    __m128 xmm1; xmm1 = _mm_load1_ps (&c0);
+	#pragma omp parallel for if (multiThread)
-    __m128 xmm2; xmm2 = _mm_load1_ps (&c1);
+    for (int i=0; i<H; i++) {
-    __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6;
+    	T* temp = buffer + omp_get_thread_num() * W;
    __m128 xmm0;
    int i;
    for (i = row_from; i<row_to-3; i+=4) {
        T* row1 = src[i];
        T* row2 = src[i+1];
        T* row3 = src[i+2];
        T* row4 = src[i+3];
        (*tmp)[0] = (float)row1[0];
        (*tmp)[1] = (float)row2[0];
        (*tmp)[2] = (float)row3[0];
        (*tmp)[3] = (float)row4[0];
        xmm3 = _mm_load_ps ((float*)tmp);   // previous element
        (*tmp)[0] = (float)row1[1];
        (*tmp)[1] = (float)row2[1];
        (*tmp)[2] = (float)row3[1];
        (*tmp)[3] = (float)row4[1];
        xmm4 = _mm_load_ps ((float*)tmp);   // current element
        for (int j=1; j<W-1; j++) {
            (*tmp)[0] = (float)row1[j+1];
            (*tmp)[1] = (float)row2[j+1];
            (*tmp)[2] = (float)row3[j+1];
            (*tmp)[3] = (float)row4[j+1];
            xmm5 = _mm_load_ps ((float*)tmp);   // next element
            // compute blured pixel
            xmm0 = _mm_mul_ps (xmm3, xmm2);
            xmm6 = _mm_mul_ps (xmm5, xmm2);
            xmm0 = _mm_add_ps (xmm6, xmm0);
            xmm6 = _mm_mul_ps (xmm4, xmm1);
            xmm0 = _mm_add_ps (xmm6, xmm0);
            // store blured pixel
            _mm_store_ps ((float*)&temp[j], xmm0);
            // update prev and current
            xmm3 = xmm4;
            xmm4 = xmm5;
        }
        for (int j=1; j<W-1; j++) {
          dst[i][j]   = (T)temp[j][0];
          dst[i+1][j] = (T)temp[j][1];
          dst[i+2][j] = (T)temp[j][2];
          dst[i+3][j] = (T)temp[j][3];
        }                    
        dst[i][0] = src[i][0];
        dst[i+1][0] = src[i+1][0];
        dst[i+2][0] = src[i+2][0];
        dst[i+3][0] = src[i+3][0];
        dst[i][W-1] = src[i][W-1];
        dst[i+1][W-1] = src[i+1][W-1];
        dst[i+2][W-1] = src[i+2][W-1];
        dst[i+3][W-1] = src[i+3][W-1];
    }
    for (; i<row_to; i++) {
        for (int j=1; j<W-1; j++)
-            buffer[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
+            temp[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
        dst[i][0] = src[i][0];
-        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
+        memcpy (dst[i]+1, temp+1, (W-2)*sizeof(T));
        dst[i][W-1] = src[i][W-1];
    }
 }
-/*template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const int c0, const int c1) {
+template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {
-    time_t t1 = clock ();
+	#pragma omp parallel for if (multiThread)
-
+    for (int i=0; i<W; i++) {
-    const int csum = c0 + 2 * c1;    
+    	T* temp = buffer + omp_get_thread_num() * H;
    for (int i = row_from; i<row_to; i++) {
        for (int j=1; j<W-1; j++)
            buffer[j] = (c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]) / csum;
        dst[i][0] = src[i][0];
        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
        dst[i][W-1] = src[i][W-1];
    }
    printf ("horizontal time = %d\n", clock()-t1);
 }
 */
 template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {
    typedef float pfloat[4];
    pfloat* temp = (pfloat*)buffer + 1;
    pfloat* tmp = (pfloat*)buffer;
    __m128 xmm1; xmm1 = _mm_load1_ps (&c0);
    __m128 xmm2; xmm2 = _mm_load1_ps (&c1);
    __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6;
    __m128 xmm0;
    int i;
    for (i = col_from; i<col_to-3; i+=4) {
        (*tmp)[0] = (float)src[0][i];
        (*tmp)[1] = (float)src[0][i+1];
        (*tmp)[2] = (float)src[0][i+2];
        (*tmp)[3] = (float)src[0][i+3];
        xmm3 = _mm_load_ps ((float*)tmp);   // previous element
        (*tmp)[0] = (float)src[1][i];
        (*tmp)[1] = (float)src[1][i+1];
        (*tmp)[2] = (float)src[1][i+2];
        (*tmp)[3] = (float)src[1][i+3];
        xmm4 = _mm_load_ps ((float*)tmp);   // current element
        for (int j=1; j<H-1; j++) {
            (*tmp)[0] = (float)src[j+1][i];
            (*tmp)[1] = (float)src[j+1][i+1];
            (*tmp)[2] = (float)src[j+1][i+2];
            (*tmp)[3] = (float)src[j+1][i+3];
            xmm5 = _mm_load_ps ((float*)tmp);   // next element
            // compute blured pixel
            xmm0 = _mm_mul_ps (xmm3, xmm2);
            xmm6 = _mm_mul_ps (xmm5, xmm2);
            xmm0 = _mm_add_ps (xmm6, xmm0);
            xmm6 = _mm_mul_ps (xmm4, xmm1);
            xmm0 = _mm_add_ps (xmm6, xmm0);
            // store blured pixel
            _mm_store_ps ((float*)&temp[j], xmm0);
            // update prev and current
            xmm3 = xmm4;
            xmm4 = xmm5;
        }
        for (int j=1; j<H-1; j++) {
            dst[j][i]   = (T)temp[j][0];
            dst[j][i+1] = (T)temp[j][1];
            dst[j][i+2] = (T)temp[j][2];
            dst[j][i+3] = (T)temp[j][3];
        }        
        dst[0][i] = src[0][i];
        dst[0][i+1] = src[0][i+1];
        dst[0][i+2] = src[0][i+2];
        dst[0][i+3] = src[0][i+3];
        dst[H-1][i] = src[H-1][i];
        dst[H-1][i+1] = src[H-1][i+1];
        dst[H-1][i+2] = src[H-1][i+2];
        dst[H-1][i+3] = src[H-1][i+3];
    }
 //    int stride = dst[1] - dst[0];
    for (; i<col_to; i++) {
        for (int j = 1; j<H-1; j++) 
-            buffer[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
+        	temp[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
        dst[0][i] = src[0][i];
 	for (int j=1; j<H-1; j++)
          dst[j][i] = buffer[j];
 //        T* crow = &dst[1][i];
 //        T* cbuff = &buffer[1];
 //        for (int j = 1; j<H-1; j++, crow += stride, cbuff++)
 //            (*crow) = (*cbuff);
        dst[H-1][i] = src[H-1][i];
    }
 }
 /*
 template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {
    time_t t1 = clock ();
    int stride = dst[1] - dst[0];
    for (int j=col_from; j<col_to; j++) {
        for (int i = 1; i<H-1; i++) 
            buffer[i] = (c1 * (src[i-1][j] + src[i+1][j]) + c0 * src[i][j]);
        dst[0][j] = src[0][j];
 	for (int i=1; i<H-1; i++)
          dst[i][j] = buffer[i];
 //        T* crow = &dst[1][j];
 //        T* cbuff = &buffer[1];
 //        for (int i = 1; i<H-1; i++, crow += stride, cbuff++)
 //            (*crow) = (*cbuff);
        dst[H-1][j] = src[H-1][j];
    }
    printf ("vertical time = %d\n", clock()-t1);
 }
 */
 template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int row_from, int row_to, double sigma) {
    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
            for (int i = row_from; i<row_to; i++) 
                memcpy (dst[i], src[i], W*sizeof(T));
        return;
    }
    if (sigma<0.6) {
        // compute 3x3 kernel
 //        double c0 = 2.0 / exp (-1.0 / (2.0 * sigma * sigma));
 //        printf ("c0=%g\n", c0);
 //        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, (int) round(c0), 2);
        double c1 = exp (-1.0 / (2.0 * sigma * sigma));
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, c0, c1);
        return;
    }
    // horizontal
    float q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
    float b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q;
    float b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q;
    float b2 = -1.4281*q*q - 1.26661*q*q*q;
    float b3 = 0.422205*q*q*q;
    float B = 1.0 - (b1+b2+b3) / b0;
    b1 /= b0;
    b2 /= b0;
    b3 /= b0;
    // SSE optimized version:
    __m128 xmm1; xmm1 = _mm_load1_ps (&B);
    __m128 xmm2; xmm2 = _mm_load1_ps (&b1);
    __m128 xmm3; xmm3 = _mm_load1_ps (&b2);
    __m128 xmm4; xmm4 = _mm_load1_ps (&b3);
    __m128 xmm5;
    __m128 xmm6;
    typedef float pfloat[4];
    pfloat* temp = (pfloat*)buffer->data + 1;
    pfloat* tmp = (pfloat*)buffer->data;
    memset (temp, 0, W*sizeof(pfloat));
    int i;
    for (i=row_from; i<row_to-3; i+=4) {
        T* row1 = src[i];
        T* row2 = src[i+1];
        T* row3 = src[i+2];
        T* row4 = src[i+3];
        for (int j=0; j<3; j++) {
            (*tmp)[0] = (float)row1[3];
            (*tmp)[1] = (float)row2[3];
            (*tmp)[2] = (float)row3[3];
            (*tmp)[3] = (float)row4[3];
            xmm5 = _mm_load_ps ((float*)tmp);
            _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=3; j<W; j++) {
            (*tmp)[0] = (float)row1[j];
            (*tmp)[1] = (float)row2[j];
            (*tmp)[2] = (float)row3[j];
            (*tmp)[3] = (float)row4[j];
            xmm5 = _mm_load_ps ((float*)tmp);
            xmm5 =_mm_mul_ps (xmm1, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j-1]);
            xmm6 = _mm_mul_ps (xmm2, xmm6);
            xmm5 = _mm_add_ps (xmm6, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j-2]);
            xmm6 = _mm_mul_ps (xmm6, xmm3);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            xmm6 = _mm_load_ps ((float*)&temp[j-3]);
            xmm6 = _mm_mul_ps (xmm6, xmm4);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=W-4; j>=0; j--) {
          xmm5 = _mm_load_ps ((float*)&temp[j]);
          xmm5 =_mm_mul_ps (xmm1, xmm5);
          xmm6 = _mm_load_ps ((float*)&temp[j+1]);
          xmm6 = _mm_mul_ps (xmm2, xmm6);
          xmm5 = _mm_add_ps (xmm6, xmm5);
          xmm6 = _mm_load_ps ((float*)&temp[j+2]);
          xmm6 = _mm_mul_ps (xmm6, xmm3);
          xmm5 = _mm_add_ps (xmm5, xmm6);
          xmm6 = _mm_load_ps ((float*)&temp[j+3]);
          xmm6 = _mm_mul_ps (xmm6, xmm4);
          xmm5 = _mm_add_ps (xmm5, xmm6);
          _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=0; j<W; j++) {
          dst[i][j]   = (T)temp[j][0];
          dst[i+1][j] = (T)temp[j][1];
          dst[i+2][j] = (T)temp[j][2];
          dst[i+3][j] = (T)temp[j][3];
        }
    }
    // blur remaining rows
    float* temp2 = (float*)buffer->data;
    for (; i<row_to; i++) {
        for (int j=0; j<3; j++)
            temp2[j] = src[i][3];
        for (int j=3; j<W; j++)
            temp2[j] = B * src[i][j] + b1*temp2[j-1] + b2*temp2[j-2] + b3*temp2[j-3];
        for (int j=W-4; j>=0; j--)
            temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3];
        for (int j=0; j<W; j++)
            dst[i][j] = (T)temp2[j];
    }
 }
 template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<float>* buffer, int H, int col_from, int col_to, double sigma) {
    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
            for (int i = 0; i<H; i++) 
                for (int j=col_from; j<col_to; j++)
                    dst[i][j] = src[i][j];
        return;
    }
    if (sigma<0.6) {
        // compute 3x3 kernel
        double c1 = exp (-1.0 / (2.0 * sigma * sigma));
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, c0, c1);
 //        double c0 = 2.0 / exp (-1.0 / (2.0 * sigma * sigma));
 //        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, (int) round(c0), 2);
        return;
    }
    // vertical
    float q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
    float b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q;
    float b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q;
    float b2 = -1.4281*q*q - 1.26661*q*q*q;
    float b3 = 0.422205*q*q*q;
    float B = 1.0 - (b1+b2+b3) / b0;
    b1 /= b0;
    b2 /= b0;
    b3 /= b0;
    // SSE optimized version:
    __m128 xmm1; xmm1 = _mm_load1_ps (&B);
    __m128 xmm2; xmm2 = _mm_load1_ps (&b1);
    __m128 xmm3; xmm3 = _mm_load1_ps (&b2);
    __m128 xmm4; xmm4 = _mm_load1_ps (&b3);
    __m128 xmm5;
    __m128 xmm6;
    typedef float pfloat[4];
    pfloat* temp = (pfloat*)buffer->data + 1;
    pfloat* tmp = (pfloat*)buffer->data;
    memset (temp, 0, H*sizeof(pfloat));
    int i;
    for (i=col_from; i<col_to-3; i+=4) {
        for (int j=0; j<3; j++) {
            (*tmp)[0] = (float)src[3][i];
            (*tmp)[1] = (float)src[3][i+1];
            (*tmp)[2] = (float)src[3][i+2];
            (*tmp)[3] = (float)src[3][i+3];
            xmm5 = _mm_load_ps ((float*)tmp);
            _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=3; j<H; j++) {
            (*tmp)[0] = (float)src[j][i];
            (*tmp)[1] = (float)src[j][i+1];
            (*tmp)[2] = (float)src[j][i+2];
            (*tmp)[3] = (float)src[j][i+3];
            xmm5 = _mm_load_ps ((float*)tmp);
            xmm5 =_mm_mul_ps (xmm1, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j-1]);
            xmm6 = _mm_mul_ps (xmm2, xmm6);
            xmm5 = _mm_add_ps (xmm6, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j-2]);
            xmm6 = _mm_mul_ps (xmm6, xmm3);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            xmm6 = _mm_load_ps ((float*)&temp[j-3]);
            xmm6 = _mm_mul_ps (xmm6, xmm4);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=H-4; j>=0; j--) {
            xmm5 = _mm_load_ps ((float*)&temp[j]);
            xmm5 =_mm_mul_ps (xmm1, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j+1]);
            xmm6 = _mm_mul_ps (xmm2, xmm6);
            xmm5 = _mm_add_ps (xmm6, xmm5);
            xmm6 = _mm_load_ps ((float*)&temp[j+2]);
            xmm6 = _mm_mul_ps (xmm6, xmm3);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            xmm6 = _mm_load_ps ((float*)&temp[j+3]);
            xmm6 = _mm_mul_ps (xmm6, xmm4);
            xmm5 = _mm_add_ps (xmm5, xmm6);
            _mm_store_ps ((float*)&temp[j], xmm5);
        }
        for (int j=0; j<H; j++) {
            dst[j][i]   = (T)temp[j][0];
            dst[j][i+1] = (T)temp[j][1];
            dst[j][i+2] = (T)temp[j][2];
            dst[j][i+3] = (T)temp[j][3];
        }
    }
    // blur remaining columns
    float* temp2 = (float*)buffer->data;
    for (; i<col_to; i++) {
        for (int j=0; j<3; j++)
            temp2[j] = src[3][i];
        for (int j=3; j<H; j++)
            temp2[j] = B * src[j][i] + b1*temp2[j-1] + b2*temp2[j-2] + b3*temp2[j-3];
        for (int j=H-4; j>=0; j--)
            temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3];
        for (int j=0; j<H; j++)
            dst[j][i] = (T)temp2[j];
    }
 }
 #else
 template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const float c0, const float c1) {
    for (int i=row_from; i<row_to; i++) {
        for (int j=1; j<W-1; j++)
            buffer[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
        dst[i][0] = src[i][0];
        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
        dst[i][W-1] = src[i][W-1];
    }
 }
 template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {
    for (int i=col_from; i<col_to; i++) {
        for (int j = 1; j<H-1; j++) 
            buffer[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
        dst[0][i] = src[0][i];
 	    for (int j=1; j<H-1; j++)
-            dst[j][i] = buffer[j];
+            dst[j][i] = temp[j];
        dst[H-1][i] = src[H-1][i];
    }
 }
-template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
+// fast gaussian approximation if the support window is large
 template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {
    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
-            for (int i = row_from; i<row_to; i++) 
+            for (int i = 0; i<H; i++)
                memcpy (dst[i], src[i], W*sizeof(T));
        return;
    }
@ -480,11 +72,11 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, c0, c1);
+        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
        return;
    }
-    // horizontal
+    // coefficient calculation
    double q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
@ -513,8 +105,11 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
        for (int j=0; j<3; j++)
            M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3);
-    double* temp2 = (double*)buffer->data;
+	#pragma omp parallel for if (multiThread)
-    for (int i=row_from; i<row_to; i++) {
+    for (int i=0; i<H; i++) {
        double* temp2 = buffer->data + omp_get_thread_num() * W;
        temp2[0] = B * src[i][0] + b1*src[i][0] + b2*src[i][0] + b3*src[i][0];
        temp2[1] = B * src[i][1] + b1*temp2[0]  + b2*src[i][0] + b3*src[i][0];
        temp2[2] = B * src[i][2] + b1*temp2[1]  + b2*temp2[0]  + b3*src[i][0];
@ -537,14 +132,13 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
    }
 }
-template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
+template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {
    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
-            for (int i = 0; i<H; i++) 
+            for (int i = 0; i<H; i++)
-                for (int j=col_from; j<col_to; j++)
+                memcpy (dst[i], src[i], W*sizeof(T));
                    dst[i][j] = src[i][j];
        return;
    }
@ -554,11 +148,11 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, c0, c1);
+        gaussVertical3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
        return;
    }
-    // vertical
+    // coefficient calculation
    double q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
@ -587,9 +181,12 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        for (int j=0; j<3; j++)
            M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3);
-    double* temp2 = (double*)buffer->data;
+	#pragma omp parallel for if (multiThread)
-    for (int i=col_from; i<col_to; i++) {
+    for (int i=0; i<W; i++) {
-        temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i];
+
        double* temp2 = buffer->data + omp_get_thread_num() * H;
    	temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i];
        temp2[1] = B * src[1][i] + b1*temp2[0]  + b2*src[0][i] + b3*src[0][i];
        temp2[2] = B * src[2][i] + b1*temp2[1]  + b2*temp2[0]  + b3*src[0][i];
@ -611,14 +208,13 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
            dst[j][i] = (T)temp2[j];
    }
 }   
 #endif
 /*
 void gaussHorizontal_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
 void gaussHorizontal_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
 void gaussHorizontal_float (float** src, float** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_float (float** src, float** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
-
+*/
 #endif
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@ -28,7 +28,7 @@ namespace rtengine {
 extern Settings* settings;
 ImProcCoordinator::ImProcCoordinator ()
-    : allocated(false), scale(-1), pW(-1), pH(-1),
+    : ipf(&params, true), allocated(false), scale(-1), pW(-1), pH(-1),
    plistener(NULL), imageListener(NULL), aeListener(NULL), hListener(NULL),
    resultValid(false), awbComputed(false),
    changeSinceLast(0), updaterRunning(false), 
@ -54,7 +54,6 @@ ImProcCoordinator::~ImProcCoordinator () {
        delete toDel[i];
    imgsrc->decreaseRef ();
 	ipf.release ();
    updaterThreadStart.unlock ();
 }
@ -79,6 +78,8 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
    else if (params.resize.dataspec==2)
        params.resize.scale = (double)params.resize.height / (params.coarse.rotate==90 || params.coarse.rotate==270 ? fw : fh);
    ipf.setScale (scale);
    progress ("Applying white balance, color correction & sRBG conversion...",100*readyphase/numofphases);
    if (todo & M_INIT) {
        minit.lock ();
@ -164,7 +165,7 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
    progress ("Exposure curve & CIELAB conversion...",100*readyphase/numofphases);
    if (todo & M_RGBCURVE) {
        CurveFactory::complexCurve (params.toneCurve.expcomp, params.toneCurve.black/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, imgsrc->getDefGain(), imgsrc->getGamma(), true, params.toneCurve.curve, vhist16, tonecurve, bcrgbhist, scale==1 ? 1 : 1);
-        ipf.rgbProc (oprevi, oprevl, &params, tonecurve, shmap);
+        ipf.rgbProc (oprevi, oprevl, tonecurve, shmap);
        // recompute luminance histogram
        memset (lhist16, 0, 65536*sizeof(int));
@ -186,12 +187,12 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
        readyphase++;
        if (scale==1) {
            progress ("Denoising luminance...",100*readyphase/numofphases);
-            ipf.lumadenoise (nprevl, &params, scale*params.resize.scale, buffer);
+            ipf.lumadenoise (nprevl, buffer);
        }
        readyphase++;
        if (scale==1) {
            progress ("Sharpening...",100*readyphase/numofphases);
-            ipf.sharpening (nprevl, &params, scale*params.resize.scale, (unsigned short**)buffer);
+            ipf.sharpening (nprevl, (unsigned short**)buffer);
        }
        readyphase++;
    }
@ -201,11 +202,11 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
    if (todo & M_COLOR) {
        progress ("Applying Color Boost...",100*readyphase/numofphases);
-        ipf.colorCurve (oprevl, nprevl, &params);
+        ipf.colorCurve (oprevl, nprevl);
        readyphase++;
        if (scale==1) {
            progress ("Denoising color...",100*readyphase/numofphases);
-            ipf.colordenoise (nprevl, &params, scale*params.resize.scale, buffer);
+            ipf.colordenoise (nprevl, buffer);
        }
        readyphase++;
    }
@ -314,7 +315,7 @@ if (settings->verbose) printf ("setscale before lock\n");
        oprevl = new LabImage (pW, pH);    
        nprevl = new LabImage (pW, pH);    
        previmg = new Image8 (pW, pH);
-        shmap = new SHMap (pW, pH);
+        shmap = new SHMap (pW, pH, true);
        buffer = new int*[pH];
        for (int i=0; i<pH; i++)
@ -410,7 +411,7 @@ void ImProcCoordinator::getSpotWB (int x, int y, int rect, double& temp, double&
        for (int j=x-rect; j<=x+rect; j++) 
            points.push_back (Coord2D (j, i));
-    ipf.transCoord (&params, fw, fh, points, red, green, blue);
+    ImProcFunctions::transCoord (&params, fw, fh, points, red, green, blue);
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
    if (params.coarse.rotate==180) tr |= TR_R180;
@ -437,7 +438,7 @@ void ImProcCoordinator::getAutoCrop (double ratio, int &x, int &y, int &w, int &
        x = (fullw - w) / 2;
        y = (fullh - h) / 2;       
        int orx, ory, orw, orh;
-        clipped = ipf.transCoord (&params, fw, fh, x, y, w, h, orx, ory, orw, orh);
+        clipped = ImProcFunctions::transCoord (&params, fw, fh, x, y, w, h, orx, ory, orw, orh);
        w -= 4;
    }
    if (ratio>0)
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@ -24,89 +24,75 @@
 #include <procparams.h>
 #include <shmap.h>
 #include <coord2d.h>
 #include <labimage.h>
 namespace rtengine {
 using namespace procparams;
 class LabImage {
    private:
        bool fromImage;
    public:
        int W, H;
        unsigned short** L;
        short** a;
        short** b;
     LabImage (int w, int h);
     LabImage (Image16* im);
    ~LabImage ();
 };
 class ImProcFunctions {
    protected:
        struct STemp {
            int cx, cy, sx, sy, oW, oH;
        };
 		cmsHTRANSFORM monitorTransform;
        void transform_         (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void simpltransform_    (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void vignetting_        (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void transform_sep_     (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void rgbProc_           (Image16* working, LabImage* lab, const ProcParams* params, int* tonecurve, SHMap* shmap, int row_from, int row_to);
        void lab2rgb_           (LabImage* lab, Image8* image, int row_from, int row_to);
        void colorCurve_        (LabImage* lold, LabImage* lnew, const ProcParams* params, int row_from, int row_to, double* cmultiplier);
        void sharpenHaloCtrl    (LabImage* lab, const ProcParams* params, unsigned short** blurmap, unsigned short** base, int W, int row_from, int row_to);
        void firstAnalysis_     (Image16* original, Glib::ustring wprofile, unsigned int* histogram, int* chroma_radius, int row_from, int row_to);
        void resize_            (Image16* src, Image16* dst, ResizeParams params, int row_from, int row_to);
        void damping_           (float** aI, unsigned short** aO, float damping, int W, int rowfrom, int rowto);
    public:
        static int* cacheL;
        static int* cachea;
        static int* cacheb;
        static int* xcache;
        static int* ycache;
        static int* zcache;
-    
+        static unsigned short gamma2curve[65536];
        struct STemp {
            int cx, cy, sx, sy, oW, oH;
        };
 		cmsHTRANSFORM monitorTransform;
        int chroma_scale;
        int chroma_radius;
 		const ProcParams* params;
 		double scale;
 		bool multiThread;
        void transform_         (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void simpltransform_    (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void vignetting_        (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void transform_sep_     (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
        void sharpenHaloCtrl    (LabImage* lab, unsigned short** blurmap, unsigned short** base, int W, int H);
        void firstAnalysis_     (Image16* original, Glib::ustring wprofile, unsigned int* histogram, int* chroma_radius, int row_from, int row_to);
        void dcdamping          (float** aI, unsigned short** aO, float damping, int W, int H);
    public:
        double lumimul[3];
        static unsigned short gamma2curve[65536];
        static void initCache ();
-        ImProcFunctions () : monitorTransform(NULL) {}
+        ImProcFunctions (const ProcParams* iparams, bool imultiThread=true)
-		void release    ();
+			: monitorTransform(NULL), params(iparams), scale(1), multiThread(imultiThread) {}
-		
+        ~ImProcFunctions ();
        void setScale 		(double iscale);
        void firstAnalysis  (Image16* working, const ProcParams* params, unsigned int* vhist16, double gamma);
-        void rgbProc        (Image16* working, LabImage* lab, const ProcParams* params, int* tonecurve, SHMap* shmap);
+        void rgbProc        (Image16* working, LabImage* lab, int* tonecurve, SHMap* shmap);
        void luminanceCurve (LabImage* lold, LabImage* lnew, int* curve, int row_from, int row_to);
-        void colorCurve     (LabImage* lold, LabImage* lnew, const ProcParams* params);
+        void colorCurve     (LabImage* lold, LabImage* lnew);
-        void sharpening     (LabImage* lab, const ProcParams* params, double scale, unsigned short** buffer);
+        void sharpening     (LabImage* lab, unsigned short** buffer);
-        void lumadenoise    (LabImage* lab, const ProcParams* params, double scale, int** buffer);
+        void lumadenoise    (LabImage* lab, int** buffer);
-        void colordenoise   (LabImage* lab, const ProcParams* params, double scale, int** buffer);
+        void colordenoise   (LabImage* lab, int** buffer);
        void transform      (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH);
        void simpltransform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH);
        void vignetting     (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int oW, int oH);
        void lab2rgb        (LabImage* lab, Image8* image);
-        void resize         (Image16* src, Image16* dst, ResizeParams params);
+        void resize         (Image16* src, Image16* dst);
-        bool transCoord     (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv);
+        void deconvsharpening(LabImage* lab, unsigned short** buffer);
        bool transCoord     (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue);
        void deconvsharpening(LabImage* lab, const ProcParams* params, double scale, unsigned short** buffer);
        Image8*     lab2rgb     (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile);
        Image16*    lab2rgb16   (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile);
        static bool transCoord     (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv);
        static bool transCoord     (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue);
        static void getAutoExp  (unsigned int* histogram, int histcompr, double expcomp, double clip, double& br, int& bl);
 };
 };
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@ -0,0 +1,259 @@
 /*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <rtengine.h>
 #include <improcfun.h>
 #include <glibmm.h>
 #include <iccstore.h>
 #include <omp.h>
 namespace rtengine {
 #undef CLIP
 #undef CLIPTO
 #undef CMAXVAL
 #define CMAXVAL 0xffff
 #define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
 #define CLIPTO(a,b,c) ((a)>(b)?((a)<(c)?(a):(c)):(b))
 extern const Settings* settings;
 void ImProcFunctions::lab2rgb (LabImage* lab, Image8* image) {
 	if (monitorTransform) {
 	    int ix = 0;
        short* buffer = new short [3*lab->W];
 		for (int i=0; i<lab->H; i++) {
 			unsigned short* rL = lab->L[i];
 			short* ra = lab->a[i];
 			short* rb = lab->b[i];
 			int iy = 0;
 			for (int j=0; j<lab->W; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
                buffer[iy++] = CLIP(x_);
                buffer[iy++] = CLIP(y_);
                buffer[iy++] = CLIP(z_);
 			}
            cmsDoTransform (monitorTransform, buffer, image->data + ix, lab->W);
            ix += 3*lab->W;
 		}
        delete [] buffer;
 	}
 	else {
 		#pragma omp parallel for if (multiThread)
 		for (int i=0; i<lab->H; i++) {
 			unsigned short* rL = lab->L[i];
 			short* ra = lab->a[i];
 			short* rb = lab->b[i];
 			int ix = 3*i*lab->W;
 			for (int j=0; j<lab->W; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
 				/* XYZ-D50 to RGB */
 				int R = (25689*x_-13261*y_-4022*z_) >> 13;
 				int G = (-8017*x_+15697*y_+274*z_) >> 13;
 				int B = (590*x_-1877*y_+11517*z_) >> 13;
 				/* copy RGB */
 				image->data[ix++] = gamma2curve[CLIP(R)] >> 8;
 				image->data[ix++] = gamma2curve[CLIP(G)] >> 8;
 				image->data[ix++] = gamma2curve[CLIP(B)] >> 8;
 			}
 		}
 	}
 }
 Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile) {
    if (cx<0) cx = 0;
    if (cy<0) cy = 0;
    if (cx+cw>lab->W) cw = lab->W-cx;
    if (cy+ch>lab->H) ch = lab->H-cy;
    Image8* image = new Image8 (cw, ch);
    cmsHPROFILE oprof = iccStore.getProfile (profile);
    if (oprof) {
        cmsHPROFILE iprof = iccStore.getXYZProfile ();
        lcmsMutex->lock ();
        cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16, oprof, TYPE_RGB_8, settings->colorimetricIntent, 0);
        lcmsMutex->unlock ();
        int ix = 0;
        short* buffer = new short [3*cw];
        for (int i=cy; i<cy+ch; i++) {
            unsigned short* rL = lab->L[i];
            short* ra = lab->a[i];
            short* rb = lab->b[i];
            int iy = 0;
            for (int j=cx; j<cx+cw; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
                buffer[iy++] = CLIP(x_);
                buffer[iy++] = CLIP(y_);
                buffer[iy++] = CLIP(z_);
            }
            cmsDoTransform (hTransform, buffer, image->data + ix, cw);
            ix += 3*cw;
        }
        delete [] buffer;
        cmsDeleteTransform(hTransform);
    }
    else {
 		#pragma omp parallel for if (multiThread)
        for (int i=cy; i<cy+ch; i++) {
            unsigned short* rL = lab->L[i];
            short* ra = lab->a[i];
            short* rb = lab->b[i];
 			int ix = 3*i*cw;
            for (int j=cx; j<cx+cw; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
                int R = (25689*x_-13261*y_-4022*z_) >> 13;
                int G = (-8017*x_+15697*y_+274*z_) >> 13;
                int B = (590*x_-1877*y_+11517*z_) >> 13;
                image->data[ix++] = gamma2curve[CLIP(R)] >> 8;
                image->data[ix++] = gamma2curve[CLIP(G)] >> 8;
                image->data[ix++] = gamma2curve[CLIP(B)] >> 8;
            }
        }
    }
    return image;
 }
 Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile) {
    if (cx<0) cx = 0;
    if (cy<0) cy = 0;
    if (cx+cw>lab->W) cw = lab->W-cx;
    if (cy+ch>lab->H) ch = lab->H-cy;
    Image16* image = new Image16 (cw, ch);
    cmsHPROFILE oprof = iccStore.getProfile (profile);
    if (oprof) {
 		#pragma omp parallel for if (multiThread)
 		for (int i=cy; i<cy+ch; i++) {
 			unsigned short* rL = lab->L[i];
 			short* ra = lab->a[i];
 			short* rb = lab->b[i];
 			short* xa = (short*)image->r[i-cy];
 			short* ya = (short*)image->g[i-cy];
 			short* za = (short*)image->b[i-cy];
 			for (int j=cx; j<cx+cw; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
 				xa[j-cx] = CLIP(x_);
 				ya[j-cx] = CLIP(y_);
 				za[j-cx] = CLIP(z_);
 			}
 		}
        cmsHPROFILE iprof = iccStore.getXYZProfile ();
        lcmsMutex->lock ();
 		cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16_PLANAR, oprof, TYPE_RGB_16_PLANAR, settings->colorimetricIntent, 0);
        lcmsMutex->unlock ();
 		cmsDoTransform (hTransform, image->data, image->data, image->planestride/2);
 		cmsDeleteTransform(hTransform);
 	}
 	else {
 		#pragma omp parallel for if (multiThread)
 		for (int i=cy; i<cy+ch; i++) {
 			unsigned short* rL = lab->L[i];
 			short* ra = lab->a[i];
 			short* rb = lab->b[i];
 			for (int j=cx; j<cx+cw; j++) {
                int y_ = rL[j];
                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
                x_ = CLIPTO(x_,0,369820);
                y_ = CLIPTO(y_,0,825745);
                y_ = ycache[y_];
 				x_ = xcache[x_];
 				z_ = zcache[z_];
 				int R = (25689*x_-13261*y_-4022*z_) >> 13;
 				int G = (-8017*x_+15697*y_+274*z_) >> 13;
 				int B = (590*x_-1877*y_+11517*z_) >> 13;
 				image->r[i-cy][j-cx] = gamma2curve[CLIP(R)];
 				image->g[i-cy][j-cx] = gamma2curve[CLIP(G)];
 				image->b[i-cy][j-cx] = gamma2curve[CLIP(B)];
 			}
 		}
 	}
    return image;
 }
 }
--- a/rtengine/ipresize.cc
+++ b/rtengine/ipresize.cc
@ -0,0 +1,293 @@
 /*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <rtengine.h>
 #include <improcfun.h>
 #include <glibmm.h>
 #include <omp.h>
 namespace rtengine {
 #undef CLIP
 #undef CLIPTO
 #undef CMAXVAL
 #define CMAXVAL 0xffff
 #define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
 #define CLIPTO(a,b,c) ((a)>(b)?((a)<(c)?(a):(c)):(b))
 void ImProcFunctions::resize (Image16* src, Image16* dst) {
 	if(params->resize.method == "Downscale (Better)") {
        // small-scale algorithm by Ilia
        // provides much better quality on small scales
        // calculates mean value over source pixels which current destination pixel covers
        // works only for scales < 1
        // for scales ~1 it is analogous to bilinear
        // possibly, for even less scale factors (< 0.2 possibly) boundary pixels are not needed, omitting them can give a speedup
        // this algorithm is much slower on small factors than others, because it uses all pixels of the SOURCE image
        // Ilia Popov ilia_popov@rambler.ru 2010
        double delta = 1.0 / params->resize.scale;
        double k = params->resize.scale * params->resize.scale;
 		#pragma omp parallel for if (multiThread)
        for(int i = 0; i < dst->height; i++) {
            // top and bottom boundary coordinates
            double y0 = i * delta;
            double y1 = (i + 1) * delta;
            int m0 = y0;
            m0 = CLIPTO(m0, 0, src->height-1);
            int m1 = y1;
            m1 = CLIPTO(m1, 0, src->height-1);
            // weights of boundary pixels
            double wy0 = 1.0 - (y0 - m0);
            double wy1 = y1 - m1;
            for(int j = 0; j < dst->width; j++) {
                // left and right boundary coordinates
                double x0 = j * delta;
                double x1 = (j + 1) * delta;
                int n0 = x0;
                n0 = CLIPTO(n0, 0, src->width-1);
                int n1 = x1;
                n1 = CLIPTO(n1, 0, src->width-1);
                double wx0 = 1.0 - (x0 - n0);
                double wx1 = x1 - n1;
                double r = 0;
                double g = 0;
                double b = 0;
                // integration
                // corners
                r += wy0 * wx0 * src->r[m0][n0] + wy0 * wx1 * src->r[m0][n1] + wy1 * wx0 * src->r[m1][n0] + wy1 * wx1 * src->r[m1][n1];
                g += wy0 * wx0 * src->g[m0][n0] + wy0 * wx1 * src->g[m0][n1] + wy1 * wx0 * src->g[m1][n0] + wy1 * wx1 * src->g[m1][n1];
                b += wy0 * wx0 * src->b[m0][n0] + wy0 * wx1 * src->b[m0][n1] + wy1 * wx0 * src->b[m1][n0] + wy1 * wx1 * src->b[m1][n1];
                // top and bottom boundaries
                for(int n = n0 + 1; n < n1; n++) {
                    r += wy0 * src->r[m0][n] + wy1 * src->r[m1][n];
                    g += wy0 * src->g[m0][n] + wy1 * src->g[m1][n];
                    b += wy0 * src->b[m0][n] + wy1 * src->b[m1][n];
                }
                // inner rows
                for(int m = m0 + 1; m < m1; m++) {
                    // left and right boundaries
                    r += wx0 * src->r[m][n0] + wx1 * src->r[m][n1];
                    g += wx0 * src->g[m][n0] + wx1 * src->g[m][n1];
                    b += wx0 * src->b[m][n0] + wx1 * src->b[m][n1];
                    // inner pixels
                    for(int n = n0 + 1; n < n1; n++) {
                        r += src->r[m][n];
                        g += src->g[m][n];
                        b += src->b[m][n];
                    }
                }
                // overall weight is equal to the DST pixel area in SRC coordinates
                r *= k;
                g *= k;
                b *= k;
                dst->r[i][j] = CLIP((int)r);
                dst->g[i][j] = CLIP((int)g);
                dst->b[i][j] = CLIP((int)b);
            }
        }
        return;
    }
    if(params->resize.method == "Downscale (Faster)")
 	{
        // faster version of algo above, does not take into account border pixels,
        // which are summed with non-unity weights in slow algo. So, no need
        // for weights at all
        // Ilia Popov ilia_popov@rambler.ru 5.04.2010
        double delta = 1.0 / params->resize.scale;
        int p = (int) delta;
        // if actually we are doing upscaling, behave like Nearest
        if(p == 0)
            p = 1;
        int q = p/2;
        // may cause problems on 32-bit systems on extremely small factors.
        // In that case change 1024 to smth less
        const int divider = 1024;
        // scaling factor after summation
        int k = divider / (p * p);
 		#pragma omp parallel for if (multiThread)
        for(int i = 0; i < dst->height; i++) {
            // y coordinate of center of destination pixel
            double y = (i + 0.5) * delta;
            int m0 = (int) (y) - q;
            m0 = CLIPTO(m0, 0, src->height-1);
            int m1 = m0 + p;
            if(m1 > src->height) {
                m1 = src->height;
                m0 = m1 - p;
            }
            m1 = CLIPTO(m1, 0, src->height);
            for(int j = 0; j < dst->width; j++) {
                // x coordinate of center of destination pixel
                double x = (j + 0.5) * delta;
                int n0 = (int) (x) - q;
                n0 = CLIPTO(n0, 0, src->width-1);
                int n1 = n0 + p;
                if(n1 > src->width) {
                    n1 = src->width;
                    n0 = n1 - p;
                }
                n1 = CLIPTO(n1, 0, src->width);
                int r = 0;
                int g = 0;
                int b = 0;
                // integration
                for(int m = m0; m < m1; m++) {
                    for(int n = n0; n < n1; n++) {
                        r += src->r[m][n];
                        g += src->g[m][n];
                        b += src->b[m][n];
                    }
                }
                dst->r[i][j] = CLIP( r * k / divider);
                dst->g[i][j] = CLIP( g * k / divider);
                dst->b[i][j] = CLIP( b * k / divider);
            }
        }
        return;
    }
    if (params->resize.method.substr(0,7)=="Bicubic") {
        double Av = -0.5;
        if (params->resize.method=="Bicubic (Sharper)")
            Av = -0.75;
        else if (params->resize.method=="Bicubic (Softer)")
            Av = -0.25;
 		#pragma omp parallel for if (multiThread)
        for (int i=0; i<dst->height; i++) {
            double wx[4], wy[4];
            double Dy = i / params->resize.scale;
            int yc  =  (int) Dy; Dy -= (double)yc;
            int ys = yc - 1; // smallest y-index used for interpolation
            // compute vertical weights
            double t1y = -Av*(Dy-1.0)*Dy;
            double t2y = (3.0-2.0*Dy)*Dy*Dy;
            wy[3] = t1y*Dy;
            wy[2] = t1y*(Dy-1.0) + t2y;
            wy[1] = -t1y*Dy + 1.0 - t2y;
            wy[0] = -t1y*(Dy-1.0);
            for (int j=0; j<dst->width; j++) {
                double Dx = j / params->resize.scale;
                int xc  =  (int) Dx; Dx -= (double)xc;
                int xs = xc - 1; // smallest x-index used for interpolation
                if (ys >= 0 && ys <src->height-3 && xs >= 0 && xs <= src->width-3) {
                    // compute horizontal weights
                    double t1 = -Av*(Dx-1.0)*Dx;
                    double t2 = (3.0-2.0*Dx)*Dx*Dx;
                    wx[3] = t1*Dx;
                    wx[2] = t1*(Dx-1.0) + t2;
                    wx[1] = -t1*Dx + 1.0 - t2;
                    wx[0] = -t1*(Dx-1.0);
                    // compute weighted sum
                    int r = 0;
                    int g = 0;
                    int b = 0;
                    for (int x=0; x<4; x++)
                        for (int y=0; y<4; y++) {
                            double w = wx[x]*wy[y];
                            r += w*src->r[ys+y][xs+x];
                            g += w*src->g[ys+y][xs+x];
                            b += w*src->b[ys+y][xs+x];
                        }
                    dst->r[i][j] = CLIP(r);
                    dst->g[i][j] = CLIP(g);
                    dst->b[i][j] = CLIP(b);
                }
                else {
                    xc = CLIPTO(xc, 0, src->width-1);
                    yc = CLIPTO(yc, 0, src->height-1);
                    int nx = xc + 1;
                    if (nx>=src->width)
                        nx = xc;
                    int ny = yc + 1;
                    if (ny>=src->height)
                        ny = yc;
                    dst->r[i][j] = (1-Dx)*(1-Dy)*src->r[yc][xc] + (1-Dx)*Dy*src->r[ny][xc] + Dx*(1-Dy)*src->r[yc][nx] + Dx*Dy*src->r[ny][nx];
                    dst->g[i][j] = (1-Dx)*(1-Dy)*src->g[yc][xc] + (1-Dx)*Dy*src->g[ny][xc] + Dx*(1-Dy)*src->g[yc][nx] + Dx*Dy*src->g[ny][nx];
                    dst->b[i][j] = (1-Dx)*(1-Dy)*src->b[yc][xc] + (1-Dx)*Dy*src->b[ny][xc] + Dx*(1-Dy)*src->b[yc][nx] + Dx*Dy*src->b[ny][nx];
                }
            }
        }
    }
    else if (params->resize.method=="Bilinear") {
 		#pragma omp parallel for if (multiThread)
        for (int i=0; i<dst->height; i++) {
            int sy = i/params->resize.scale;
            sy = CLIPTO(sy, 0, src->height-1);
            double dy = i/params->resize.scale - sy;
            int ny = sy+1;
            if (ny>=src->height)
                ny = sy;
            for (int j=0; j<dst->width; j++) {
                int sx = j/params->resize.scale;
                sx = CLIPTO(sx, 0, src->width-1);
                double dx = j/params->resize.scale - sx;
                int nx = sx+1;
                if (nx>=src->width)
                    nx = sx;
                dst->r[i][j] = (1-dx)*(1-dy)*src->r[sy][sx] + (1-dx)*dy*src->r[ny][sx] + dx*(1-dy)*src->r[sy][nx] + dx*dy*src->r[ny][nx];
                dst->g[i][j] = (1-dx)*(1-dy)*src->g[sy][sx] + (1-dx)*dy*src->g[ny][sx] + dx*(1-dy)*src->g[sy][nx] + dx*dy*src->g[ny][nx];
                dst->b[i][j] = (1-dx)*(1-dy)*src->b[sy][sx] + (1-dx)*dy*src->b[ny][sx] + dx*(1-dy)*src->b[sy][nx] + dx*dy*src->b[ny][nx];
            }
        }
    }
    else {
 		#pragma omp parallel for if (multiThread)
        for (int i=0; i<dst->height; i++) {
            int sy = i/params->resize.scale;
            sy = CLIPTO(sy, 0, src->height-1);
            for (int j=0; j<dst->width; j++) {
                int sx = j/params->resize.scale;
                sx = CLIPTO(sx, 0, src->width-1);
                dst->r[i][j] = src->r[sy][sx];
                dst->g[i][j] = src->g[sy][sx];
                dst->b[i][j] = src->b[sy][sx];
            }
        }
    }
 }
 }
--- a/rtengine/ipsharpen.cc
+++ b/rtengine/ipsharpen.cc
@ -0,0 +1,201 @@
 /*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <rtengine.h>
 #include <improcfun.h>
 #include <omp.h>
 #include <minmax.h>
 #include <gauss.h>
 #include <bilateral2.h>
 namespace rtengine {
 #undef CLIP
 #undef CMAXVAL
 #undef ABS
 #define CMAXVAL 0xffff
 #define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
 #define ABS(a) ((a)<0?-(a):(a))
 void ImProcFunctions::dcdamping (float** aI, unsigned short** aO, float damping, int W, int H) {
 	#pragma omp parallel for if (multiThread)
 	for (int i=0; i<H; i++)
        for (int j=0; j<W; j++) {
            float I = aI[i][j];
            float O = (float)aO[i][j];
            if (O==0.0 || I==0.0) {
                aI[i][j] = 0.0;
                continue;
            }
            float U = -(O * log(I/O) - I + O) * 2.0 / (damping*damping);
            U = MIN(U,1.0);
            U = U*U*U*U*(5.0-U*4.0);
            aI[i][j] = (O - I) / I * U + 1.0;
        }
 }
 void ImProcFunctions::deconvsharpening (LabImage* lab, unsigned short** b2) {
    if (params->sharpening.enabled==false || params->sharpening.deconvamount<1)
        return;
    int W = lab->W, H = lab->H;
    float** tmpI = new float*[H];
    for (int i=0; i<H; i++) {
        tmpI[i] = new float[W];
        for (int j=0; j<W; j++)
            tmpI[i][j] = (float)lab->L[i][j];
    }
    float** tmp = (float**)b2;
    AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());
    float damping = params->sharpening.deconvdamping / 5.0;
    bool needdamp = params->sharpening.deconvdamping > 0;
    for (int k=0; k<params->sharpening.deconviter; k++) {
    	// apply blur function (gaussian blur)
        gaussHorizontal<float> (tmpI, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
        gaussVertical<float>   (tmp, tmp,  buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
    	if (!needdamp) {
 			#pragma omp parallel for if (multiThread)
            for (int i=0; i<H; i++)
                for (int j=0; j<W; j++)
                    if (tmp[i][j]>0)
                        tmp[i][j] = (float)lab->L[i][j] / tmp[i][j];
        }
        else
 			dcdamping (tmp, lab->L, damping, W, H);
        gaussHorizontal<float> (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
        gaussVertical<float>   (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
 		#pragma omp parallel for if (multiThread)
        for (int i=0; i<H; i++)
            for (int j=0; j<W; j++)
                tmpI[i][j] = tmpI[i][j] * tmp[i][j];
    }
 	#pragma omp parallel for if (multiThread)
    for (int i=0; i<H; i++)
        for (int j=0; j<W; j++)
            lab->L[i][j] = lab->L[i][j]*(100-params->sharpening.deconvamount) / 100 + (int)CLIP(tmpI[i][j])*params->sharpening.deconvamount / 100;
    for (int i=0; i<H; i++)
        delete [] tmpI[i];
    delete [] tmpI;
 }
 void ImProcFunctions::sharpening (LabImage* lab, unsigned short** b2) {
    if (params->sharpening.method=="rld") {
        deconvsharpening (lab, b2);
        return;
    }
    if (params->sharpening.enabled==false || params->sharpening.amount<1 || lab->W<8 || lab->H<8)
        return;
    int W = lab->W, H = lab->H;
    unsigned short** b3;
    AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());
    if (params->sharpening.edgesonly==false) {
        gaussHorizontal<unsigned short> (lab->L, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
        gaussVertical<unsigned short>   (b2,     b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
    }
    else {
        b3 = new unsigned short*[H];
        for (int i=0; i<H; i++)
            b3[i] = new unsigned short[W];
 		bilateral<unsigned short, unsigned int> (lab->L, (unsigned short**)b3, b2, W, H, params->sharpening.edges_radius / scale, params->sharpening.edges_tolerance, multiThread);
 		gaussHorizontal<unsigned short> (b3, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
 		gaussVertical<unsigned short>   (b2, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
    }
    delete buffer;
    unsigned short** base = lab->L;
    if (params->sharpening.edgesonly)
        base = b3;
    if (params->sharpening.halocontrol==false) {
 		#pragma omp parallel for if (multiThread)
    	for (int i=0; i<H; i++)
            for (int j=0; j<W; j++) {
                int diff = base[i][j] - b2[i][j];
                if (ABS(diff)>params->sharpening.threshold) {
                    int val = lab->L[i][j] + params->sharpening.amount * diff / 100;
                    lab->L[i][j] = CLIP(val);
                }
            }
    }
    else
 		sharpenHaloCtrl (lab, b2, base, W, H);
    if (params->sharpening.edgesonly) {
        for (int i=0; i<H; i++)
            delete [] b3[i];
        delete [] b3;
    }
 }
 void ImProcFunctions::sharpenHaloCtrl (LabImage* lab, unsigned short** blurmap, unsigned short** base, int W, int H) {
    int scale = 100 * (100-params->sharpening.halocontrol_amount);
    unsigned short** nL = base;
 	#pragma omp parallel for if (multiThread)
    for (int i=2; i<H-2; i++) {
        int max1 = 0, max2 = 0, min1 = 0, min2 = 0, maxn, minn, np1, np2, np3, min, max;
        for (int j=2; j<W-2; j++) {
            int diff = base[i][j] - blurmap[i][j];
            if (ABS(diff) > params->sharpening.threshold) {
                // compute maximum/minimum in a delta environment
                np1 = 2*(nL[i-2][j] + nL[i-2][j+1] + nL[i-2][j+2] + nL[i-1][j] + nL[i-1][j+1] + nL[i-1][j+2] + nL[i][j] + nL[i][j+1] + nL[i][j+2]) / 27 + nL[i-1][j+1] / 3;
                np2 = 2*(nL[i-1][j] + nL[i-1][j+1] + nL[i-1][j+2] + nL[i][j] + nL[i][j+1] + nL[i][j+2] + nL[i+1][j] + nL[i+1][j+1] + nL[i+1][j+2]) / 27 + nL[i][j+1] / 3;
                np3 = 2*(nL[i][j] + nL[i][j+1] + nL[i][j+2] + nL[i+1][j] + nL[i+1][j+1] + nL[i+1][j+2] + nL[i+2][j] + nL[i+2][j+1] + nL[i+2][j+2]) / 27 + nL[i+1][j+1] / 3;
                MINMAX3(np1,np2,np3,maxn,minn);
                MAX3(max1,max2,maxn,max);
                MIN3(min1,min2,minn,min);
                max1 = max2; max2 = maxn;
                min1 = min2; min2 = minn;
                if (max < lab->L[i][j])
                    max = lab->L[i][j];
                if (min > lab->L[i][j])
                    min = lab->L[i][j];
                int val = lab->L[i][j] + params->sharpening.amount * diff / 100;
                int newL = CLIP(val);
                // applying halo control
                if (newL > max)
                    newL = max + (newL-max) * scale / 10000;
                else if (newL<min)
                    newL = min - (min-newL) * scale / 10000;
                lab->L[i][j] = newL;
            }
        }
    }
 }
 }
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@ -0,0 +1,821 @@
 /*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <rtengine.h>
 #include <improcfun.h>
 #include <omp.h>
 namespace rtengine {
 #undef CMAXVAL
 #undef MAX
 #undef MIN
 #undef CLIP
 #undef CLIPTOC
 #define CMAXVAL 0xffff
 #define MAX(a,b) ((a)<(b)?(b):(a))
 #define MIN(a,b) ((a)>(b)?(b):(a))
 #define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
 #define CLIPTOC(a,b,c,d) ((a)>=(b)?((a)<=(c)?(a):((c),d=true)):((b),d=true))
 extern const Settings* settings;
 void ImProcFunctions::vignetting_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
  int oW = sizes.oW;
  int oH = sizes.oH;
  int cx = sizes.cx;
  int cy = sizes.cy;
  double  w2 = (double) oW  / 2.0 - 0.5;
  double  h2 = (double) oH  / 2.0 - 0.5;
  double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2;
  double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
  double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
  double mul = (1.0-v) / tanh(b);
  int val;
  for (int y=row_from; y<row_to; y++) {
      double y_d = (double) (y + cy) - h2 ;
      for (int x=0; x<transformed->width; x++) {
          double x_d = (double) (x + cx) - w2 ;
          double r = sqrt(x_d*x_d + y_d*y_d);
          double vign = v + mul * tanh (b*(maxRadius-r) / maxRadius);
          val = original->r[y][x] / vign;
          transformed->r[y][x] = CLIP(val);
          val =  original->g[y][x] / vign;
          transformed->g[y][x] = CLIP(val);
          val = original->b[y][x] / vign;
          transformed->b[y][x] = CLIP(val);
      }
  }
 }
 void ImProcFunctions::vignetting (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int oW, int oH) {
    STemp sizes;
    sizes.cx = cx;
    sizes.cy = cy;
    sizes.oW = oW;
    sizes.oH = oH;
    if (settings->dualThreadEnabled) {
        Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::vignetting_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
        Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::vignetting_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
        thread1->join ();
        thread2->join ();
    }
    else
        vignetting_ (original, transformed, params, sizes, 0, transformed->height);
 }
 #include "cubint.cc"
 void ImProcFunctions::transform_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
  int oW = sizes.oW;
  int oH = sizes.oH;
  int cx = sizes.cx;
  int cy = sizes.cy;
  int sx = sizes.sx;
  int sy = sizes.sy;
  double  w2 = (double) oW  / 2.0 - 0.5;
  double  h2 = (double) oH  / 2.0 - 0.5;
  double cost = cos(params->rotate.degree * 3.14/180.0);
  double sint = sin(params->rotate.degree * 3.14/180.0);
  double  max_x = (double) (sx + original->width - 1);
  double  max_y = (double) (sy + original->height - 1);
  double  min_x = (double) sx;
  double  min_y = (double) sy;
  const int n2 = 2;
  const int n = 4;
  int mix  = original->width - 1; // maximum x-index src
  int miy  = original->height - 1;// maximum y-index src
  int mix2 = mix +1 - n;
  int miy2 = miy +1 - n;
  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
  double radius = sqrt( (double)( oW*oW + oH*oH ) );
  radius /= (oW<oH) ? oW : oH;
  double a = params->distortion.amount;
  double d = 1.0 - a;
    // magnify image to keep size
    double rotmagn = 1.0;
    if (params->rotate.fill) {
        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
    }
    // 1. check upper and lower border
    double d1 = rotmagn - a*h2/scale;
    double d2 = rotmagn - a*w2/scale;
    double d3 = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
    d = MIN(d,MIN(d1,MIN(d2,d3)));
    // auxilary variables for vignetting
    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
    double mul = (1.0-v) / tanh(b);
    // main cycle
    double eps = 1e-10;
    bool calc_r=( (fabs(a)>eps) || (fabs(1.0-v)>eps) );
    bool do_vign = (fabs(1.0-v)>eps);
    for (int y=row_from; y<row_to; y++) {
        double y_d = (double) (y + cy) - h2 ;
        for (int x=0; x<transformed->width; x++) {
            double x_d = (double) (x + cx) - w2 ;
            double r=0.0;
            double s = d;//10000.0;
 	    if (calc_r)
 	    {
 	            r=(sqrt(x_d*x_d + y_d*y_d)) / scale;
 	            if (r<radius)
 	            s += a * r ;
 	    }
            double Dx = s*(x_d * cost - y_d * sint) + w2;
            double Dy = s*(x_d * sint + y_d * cost) + h2;
            if (fabs(Dx)<eps) Dx = 0;
            if (fabs(Dy)<eps) Dy = 0;
            if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
            if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
            bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
            // Convert only valid pixels
            if (valid) {
                // Extract integer and fractions of source screen coordinates
                int xc  =  (int) (Dx); Dx -= (double)xc;
                int yc  =  (int) (Dy); Dy -= (double)yc;
                int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
                int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
                double vignmul = 1.0;
 		if (do_vign) vignmul /= (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
                if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)   // all interpolation pixels inside image
                    cubint (original, xs, ys, Dx, Dy, &(transformed->r[y][x]), &(transformed->g[y][x]), &(transformed->b[y][x]), vignmul);
                else { // edge pixels
                    int y1 = (yc>0) ? yc : 0;
                    if (y1>miy) y1 = miy;
                    int y2 = (yc<miy) ? yc+1 : miy;
                    if (y2<0) y2 = 0;
                    int x1 = (xc>0) ? xc : 0;
                    if (x1>mix) x1 = mix;
                    int x2 = (xc<mix) ? xc+1 : mix;
                    if (x2<0) x2 = 0;
                    int r = vignmul*(original->r[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->r[y1][x2]*Dx*(1.0-Dy) + original->r[y2][x1]*(1.0-Dx)*Dy + original->r[y2][x2]*Dx*Dy);
                    int g = vignmul*(original->g[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->g[y1][x2]*Dx*(1.0-Dy) + original->g[y2][x1]*(1.0-Dx)*Dy + original->g[y2][x2]*Dx*Dy);
                    int b = vignmul*(original->b[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->b[y1][x2]*Dx*(1.0-Dy) + original->b[y2][x1]*(1.0-Dx)*Dy + original->b[y2][x2]*Dx*Dy);
                    transformed->r[y][x] = CLIP(r);
                    transformed->g[y][x] = CLIP(g);
                    transformed->b[y][x] = CLIP(b);
                }
            }
            else {
                // not valid (source pixel x,y not inside source image, etc.)
                transformed->r[y][x] = 0;
                transformed->g[y][x] = 0;
                transformed->b[y][x] = 0;
            }
        }
    }
 }
 void ImProcFunctions::simpltransform_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
  int oW = sizes.oW;
  int oH = sizes.oH;
  int cx = sizes.cx;
  int cy = sizes.cy;
  int sx = sizes.sx;
  int sy = sizes.sy;
  double  w2 = (double) oW  / 2.0 - 0.5;
  double  h2 = (double) oH  / 2.0 - 0.5;
  double cost = cos(params->rotate.degree * 3.14/180.0);
  double sint = sin(params->rotate.degree * 3.14/180.0);
  double  max_x = (double) (sx + original->width - 1);
  double  max_y = (double) (sy + original->height - 1);
  double  min_x = (double) sx;
  double  min_y = (double) sy;
  const int n2 = 2;
  const int n = 2;
  int mix  = original->width - 1; // maximum x-index src
  int miy  = original->height - 1;// maximum y-index src
  int mix2 = mix +1 - n;
  int miy2 = miy +1 - n;
  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
  double radius = sqrt( (double)( oW*oW + oH*oH ) );
  radius /= (oW<oH) ? oW : oH;
  double a = params->distortion.amount;
  double d = 1.0 - a;
    // magnify image to keep size
    double rotmagn = 1.0;
    if (params->rotate.fill) {
        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
    }
    // 1. check upper and lower border
    double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
    double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
    double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
    double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
    double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
    double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
    double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
    double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
    double d1g = rotmagn - a*h2/scale;
    double d2g = rotmagn - a*w2/scale;
    double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
    double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
    d = MIN(dg,MIN(dr,db));
    // auxilary variables for vignetting
    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
    double mul = (1.0-v) / tanh(b);
    // main cycle
    double eps = 1e-10;
    bool calc_r=( (fabs(a)>eps) || (fabs(1.0-v)>eps) );
    bool do_vign = (fabs(1.0-v)>eps);
    for (int y=row_from; y<row_to; y++) {
        double y_d = (double) (y + cy) - h2 ;
        for (int x=0; x<transformed->width; x++) {
            double x_d = (double) (x + cx) - w2 ;
            double r=0.0;
            double s = d;//10000.0;
 	    if (calc_r)
 	    {
 	            r=(sqrt(x_d*x_d + y_d*y_d)) / scale;
 	            if (r<radius)
 	            s += a * r ;
 	    }
            double Dx = s*(x_d * cost - y_d * sint) + w2;
            double Dy = s*(x_d * sint + y_d * cost) + h2;
            if (fabs(Dx)<eps) Dx = 0;
            if (fabs(Dy)<eps) Dy = 0;
            if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
            if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
            bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
            // Convert only valid pixels
            if (valid) {
                // Extract integer and fractions of source screen coordinates
                int xc  =  (int) (Dx); Dx -= (double)xc;
                int yc  =  (int) (Dy); Dy -= (double)yc;
                int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
                int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
                double vignmul = 1.0;
 		if (do_vign) vignmul /= (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
                if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2 && yc < miy-1) {   // all interpolation pixels inside image
                    int r = vignmul*(original->r[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->r[yc][xc+1]*Dx*(1.0-Dy) + original->r[yc+1][xc]*(1.0-Dx)*Dy + original->r[yc+1][xc+1]*Dx*Dy);
                    int g = vignmul*(original->g[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->g[yc][xc+1]*Dx*(1.0-Dy) + original->g[yc+1][xc]*(1.0-Dx)*Dy + original->g[yc+1][xc+1]*Dx*Dy);
                    int b = vignmul*(original->b[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->b[yc][xc+1]*Dx*(1.0-Dy) + original->b[yc+1][xc]*(1.0-Dx)*Dy + original->b[yc+1][xc+1]*Dx*Dy);
                    transformed->r[y][x] = CLIP(r);
                    transformed->g[y][x] = CLIP(g);
                    transformed->b[y][x] = CLIP(b);
                }
                else { // edge pixels
                    int y1 = (yc>0) ? yc : 0;
                    if (y1>miy) y1 = miy;
                    int y2 = (yc<miy) ? yc+1 : miy;
                    if (y2<0) y2 = 0;
                    int x1 = (xc>0) ? xc : 0;
                    if (x1>mix) x1 = mix;
                    int x2 = (xc<mix) ? xc+1 : mix;
                    if (x2<0) x2 = 0;
                    int r = vignmul*(original->r[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->r[y1][x2]*Dx*(1.0-Dy) + original->r[y2][x1]*(1.0-Dx)*Dy + original->r[y2][x2]*Dx*Dy);
                    int g = vignmul*(original->g[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->g[y1][x2]*Dx*(1.0-Dy) + original->g[y2][x1]*(1.0-Dx)*Dy + original->g[y2][x2]*Dx*Dy);
                    int b = vignmul*(original->b[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->b[y1][x2]*Dx*(1.0-Dy) + original->b[y2][x1]*(1.0-Dx)*Dy + original->b[y2][x2]*Dx*Dy);
                    transformed->r[y][x] = CLIP(r);
                    transformed->g[y][x] = CLIP(g);
                    transformed->b[y][x] = CLIP(b);
                }
            }
            else {
                // not valid (source pixel x,y not inside source image, etc.)
                transformed->r[y][x] = 0;
                transformed->g[y][x] = 0;
                transformed->b[y][x] = 0;
            }
        }
    }
 }
 #include "cubintch.cc"
 void ImProcFunctions::transform_sep_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
  int oW = sizes.oW;
  int oH = sizes.oH;
  int cx = sizes.cx;
  int cy = sizes.cy;
  int sx = sizes.sx;
  int sy = sizes.sy;
  double  w2 = (double) oW  / 2.0 - 0.5;
  double  h2 = (double) oH  / 2.0 - 0.5;
  double cost = cos(params->rotate.degree * 3.14/180.0);
  double sint = sin(params->rotate.degree * 3.14/180.0);
  double  max_x = (double) (sx + original->width - 1);
  double  max_y = (double) (sy + original->height - 1);
  double  min_x = (double) sx;
  double  min_y = (double) sy;
  const int n2 = 2;
  const int n = 4;
  int mix  = original->width - 1; // maximum x-index src
  int miy  = original->height - 1;// maximum y-index src
  int mix2 = mix +1 - n;
  int miy2 = miy +1 - n;
  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
  double radius = sqrt( (double)( oW*oW + oH*oH ) );
  radius /= (oW<oH) ? oW : oH;
  double a = params->distortion.amount;
    double d = 1.0 - a;
    double cdist[3];
    cdist[0] = params->cacorrection.red;
    cdist[1] = 0.0;
    cdist[2] = params->cacorrection.blue;
    // magnify image to keep size
    double rotmagn = 1.0;
    if (params->rotate.fill) {
        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
    }
    // 1. check upper and lower border
    double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
    double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
    double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
    double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
    double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
    double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
    double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
    double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
    double d1g = rotmagn - a*h2/scale;
    double d2g = rotmagn - a*w2/scale;
    double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
    double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
    d = MIN(dg,MIN(dr,db));
    unsigned short** chorig[3];
    chorig[0] = original->r;
    chorig[1] = original->g;
    chorig[2] = original->b;
    unsigned short** chtrans[3];
    chtrans[0] = transformed->r;
    chtrans[1] = transformed->g;
    chtrans[2] = transformed->b;
    // auxilary variables for vignetting
    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
    double mul = (1.0-v) / tanh(b);
    // main cycle
    double eps = 1e-10;
    for (int y=row_from; y<row_to; y++) {
        double y_d = (double) (y + cy) - h2 ;
        for (int x=0; x<transformed->width; x++) {
            double x_d = (double) (x + cx) - w2 ;
            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
            double s = 10000.0;
            if (r<radius)
 	            s = a * r + d;
            double vignmul = 1.0 / (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
            for (int c=0; c<3; c++) {
                double Dx = (s + cdist[c]) * (x_d * cost - y_d * sint) + w2;
                double Dy = (s + cdist[c]) * (x_d * sint + y_d * cost) + h2;
                if (fabs(Dx)<eps) Dx = 0;
                if (fabs(Dy)<eps) Dy = 0;
                if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
                if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
                bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
                // Convert only valid pixels
                if (valid) {
                    // Extract integer and fractions of source screen coordinates
                    int xc  =  (int) (Dx); Dx -= (double)xc;
                    int yc  =  (int) (Dy); Dy -= (double)yc;
                    int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
                    int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
                    if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)  // all interpolation pixels inside image
                        cubintch (chorig[c], xs, ys, Dx, Dy, &(chtrans[c][y][x]), vignmul);
                    else {// edge pixels, linear interpolation
                        int y1 = (yc>0) ? yc : 0;
                        if (y1>miy) y1 = miy;
                        int y2 = (yc<miy) ? yc+1 : miy;
                        if (y2<0) y2 = 0;
                        int x1 = (xc>0) ? xc : 0;
                        if (x1>mix) x1 = mix;
                        int x2 = (xc<mix) ? xc+1 : mix;
                        if (x2<0) x2 = 0;
                        int val = vignmul*(chorig[c][y1][x1]*(1.0-Dx)*(1.0-Dy) + chorig[c][y1][x2]*Dx*(1.0-Dy) + chorig[c][y2][x1]*(1.0-Dx)*Dy + chorig[c][y2][x2]*Dx*Dy);
                        chtrans[c][y][x] = CLIP(val);
                    }
                }
                else // not valid (source pixel x,y not inside source image, etc.)
                    chtrans[c][y][x] = 0;
            }
        }
    }
 }
 bool ImProcFunctions::transCoord (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue) {
    bool clipresize = true;
    bool clipped = false;
    red.clear ();
    green.clear ();
    blue.clear ();
    bool needstransform  = 0;// fabs(params->rotate.degree)>1e-15 || fabs(params->distortion.amount)>1e-15 || fabs(params->cacorrection.red)>1e-15 || fabs(params->cacorrection.blue)>1e-15;
    if (!needstransform) {
        if (clipresize) {
            // Apply resizing
            if (fabs(params->resize.scale-1.0)>=1e-7) {
                for (int i=0; i<src.size(); i++) {
                    red.push_back   (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
                    green.push_back (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
                    blue.push_back  (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
                }
                for (int i=0; i<src.size(); i++) {
                    red[i].x = CLIPTOC(red[i].x,0,W-1,clipped);
                    red[i].y = CLIPTOC(red[i].y,0,H-1,clipped);
                    green[i].x = CLIPTOC(green[i].x,0,W-1,clipped);
                    green[i].y = CLIPTOC(green[i].y,0,H-1,clipped);
                    blue[i].x = CLIPTOC(blue[i].x,0,W-1,clipped);
                    blue[i].y = CLIPTOC(blue[i].y,0,H-1,clipped);
                }
            }
            else
                for (int i=0; i<src.size(); i++) {
                    red.push_back   (Coord2D (src[i].x, src[i].y));
                    green.push_back (Coord2D (src[i].x, src[i].y));
                    blue.push_back  (Coord2D (src[i].x, src[i].y));
                }
        }
        return clipped;
    }
    double rW = W*params->resize.scale;
    double rH = H*params->resize.scale;
    double  w2 = (double) rW  / 2.0 - 0.5;
    double  h2 = (double) rH  / 2.0 - 0.5;
    double cost = cos(params->rotate.degree * 3.14/180.0);
    double sint = sin(params->rotate.degree * 3.14/180.0);
    double scale = (rW>rH) ? rW / 2.0 : rH / 2.0 ;
    double radius = sqrt ((double)(rW*rW + rH*rH ));
    radius /= (rW<rH) ? rW : rH;
    double a = params->distortion.amount;
    double d = 1.0 - a;
    // magnify image to keep size
    double rotmagn = 1.0;
    if (params->rotate.fill) {
        double beta = atan(MIN(rH,rW)/MAX(rW,rH));
        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
    }
    if (params->cacorrection.red==0 && params->cacorrection.blue==0) {
        // 1. check upper and lower border
        double d1 = rotmagn - a*h2/scale;
        double d2 = rotmagn - a*w2/scale;
        double d3 = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
        d = MIN(d,MIN(d1,MIN(d2,d3)));
        for (int i=0; i<src.size(); i++) {
            double y_d = src[i].y - h2 ;
            double x_d = src[i].x - w2 ;
            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
            double s = 10000.0;
            if (r<radius)
                s = a * r + d;
            red.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
            green.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
            blue.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
        }
    }
    else {
        double cdist[3];
        cdist[0] = params->cacorrection.red;
        cdist[1] = 0.0;
        cdist[2] = params->cacorrection.blue;
        // 1. check upper and lower border
        double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
        double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
        double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
        double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
        double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
        double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
        double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
        double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
        double d1g = rotmagn - a*h2/scale;
        double d2g = rotmagn - a*w2/scale;
        double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
        double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
        d = MIN(dg,MIN(dr,db));
        for (int i=0; i<src.size(); i++) {
            double y_d = src[i].y - h2 ;
            double x_d = src[i].x - w2 ;
            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
            double s = 10000.0;
            if (r<radius)
                s = a * r + d;
            src[i].x = s*(x_d * cost - y_d * sint) + w2;
            src[i].y  = s*(x_d * sint + y_d * cost) + h2;
            red.push_back (Coord2D((s+cdist[0])*(x_d * cost - y_d * sint) + w2, (s+cdist[0])*(x_d * sint + y_d * cost) + h2));
            green.push_back (Coord2D((s+cdist[1])*(x_d * cost - y_d * sint) + w2, (s+cdist[1])*(x_d * sint + y_d * cost) + h2));
            blue.push_back (Coord2D((s+cdist[2])*(x_d * cost - y_d * sint) + w2, (s+cdist[2])*(x_d * sint + y_d * cost) + h2));
        }
    }
    if (clipresize) {
        if (fabs(params->resize.scale-1.0)>=1e-7) {
            for (int i=0; i<src.size(); i++) {
                red[i].x /= params->resize.scale;
                red[i].y /= params->resize.scale;
                green[i].x /= params->resize.scale;
                green[i].y /= params->resize.scale;
                blue[i].x /= params->resize.scale;
                blue[i].y /= params->resize.scale;
            }
        }
        for (int i=0; i<src.size(); i++) {
            red[i].x = CLIPTOC(red[i].x,0,W-1,clipped);
            red[i].y = CLIPTOC(red[i].y,0,H-1,clipped);
            green[i].x = CLIPTOC(green[i].x,0,W-1,clipped);
            green[i].y = CLIPTOC(green[i].y,0,H-1,clipped);
            blue[i].x = CLIPTOC(blue[i].x,0,W-1,clipped);
            blue[i].y = CLIPTOC(blue[i].y,0,H-1,clipped);
        }
    }
    return clipped;
 }
 bool ImProcFunctions::transCoord (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv) {
    int x1 = x, y1 = y;
    int x2 = x1 + w - 1;
    int y2 = y1 + h - 1;
    std::vector<Coord2D> corners (8);
    corners[0].set (x1, y1);
    corners[1].set (x1, y2);
    corners[2].set (x2, y2);
    corners[3].set (x2, y1);
    corners[4].set ((x1+x2)/2, y1);
    corners[5].set ((x1+x2)/2, y2);
    corners[6].set (x1, (y1+y2)/2);
    corners[7].set (x2, (y1+y2)/2);
    std::vector<Coord2D> r, g, b;
    bool result = transCoord (params, W, H, corners, r, g, b);
    std::vector<Coord2D> transCorners;
    transCorners.insert (transCorners.end(), r.begin(), r.end());
    transCorners.insert (transCorners.end(), g.begin(), g.end());
    transCorners.insert (transCorners.end(), b.begin(), b.end());
    double x1d = transCorners[0].x;
    for (int i=1; i<transCorners.size(); i++)
        if (transCorners[i].x<x1d)
            x1d = transCorners[i].x;
   int x1v = (int)(x1d);
    double y1d = transCorners[0].y;
    for (int i=1; i<transCorners.size(); i++)
        if (transCorners[i].y<y1d)
            y1d = transCorners[i].y;
    int y1v = (int)(y1d);
    double x2d = transCorners[0].x;
    for (int i=1; i<transCorners.size(); i++)
        if (transCorners[i].x>x2d)
            x2d = transCorners[i].x;
    int x2v = (int)ceil(x2d);
    double y2d = transCorners[0].y;
    for (int i=1; i<transCorners.size(); i++)
        if (transCorners[i].y>y2d)
            y2d = transCorners[i].y;
    int y2v = (int)ceil(y2d);
    xv = x1v;
    yv = y1v;
    wv = x2v - x1v + 1;
    hv = y2v - y1v + 1;
    return result;
 }
 void ImProcFunctions::transform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH) {
    STemp sizes;
    sizes.cx = 0;//cx;
    sizes.cy = 0;//cy;
    sizes.oW = oW;
    sizes.oH = oH;
    sizes.sx = 0;//sx;
    sizes.sy = 0;//sy;
    if (params->cacorrection.red==0 && params->cacorrection.blue==0) {
        if (settings->dualThreadEnabled) {
            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            thread1->join ();
            thread2->join ();
        }
        else
            transform_ (original, transformed, params, sizes, 0, transformed->height);
    }
    else {
        if (settings->dualThreadEnabled) {
            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_sep_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_sep_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            thread1->join ();
            thread2->join ();
        }
        else
            transform_sep_ (original, transformed, params, sizes, 0, transformed->height);
    }
 }
 void ImProcFunctions::simpltransform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH) {
    STemp sizes;
    sizes.cx = 0;//cx;
    sizes.cy = 0;//cy;
    sizes.oW = oW;
    sizes.oH = oH;
    sizes.sx = 0;//sx;
    sizes.sy = 0;//sy;
    if (settings->dualThreadEnabled) {
        Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::simpltransform_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
        Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::simpltransform_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
        thread1->join ();
        thread2->join ();
    }
    else
        simpltransform_ (original, transformed, params, sizes, 0, transformed->height);
 }
 /*void ImProcFunctions::transform (Image16* original, Image16* transformed, const ProcParams* params, int ox, int oy) {
  if (!transformed)
    return;
  int oW = W, oH = H, tW = W, tH = H;
  double  w2 = (double) tW / 2.0 - 0.5;
  double  h2 = (double) tH / 2.0 - 0.5;
  double  sw2 = (double) oW  / 2.0 - 0.5;
  double  sh2 = (double) oH  / 2.0 - 0.5;
  double cost = cos(params->rotate_fine * 3.14/180.0);
  double sint = sin(params->rotate_fine * 3.14/180.0);
  double  max_x = (double) oW;
  double  max_y = (double) oH;
  double  min_x =  0.0;
  double  min_y =  0.0;
  const int n2 = 2;
  const int n = 4;
  int mix  = oW - 1; // maximum x-index src
  int miy  = oH - 1;// maximum y-index src
  int mix2 = mix +1 - n;
  int miy2 = miy +1 - n;
  double scale = (tW>tH) ? (double)tW / 2.0 : (double)tH / 2.0 ;
  double radius = sqrt( (double)( tW*tW + tH*tH ) );
  radius /= (tW<tH) ? tW : tH;
  double a = params->lens_distortion;
  for (int y=0; y<transformed->height; y++) {
    double y_d = (double) y + oy - h2 ;
    for (int x=0; x<transformed->width; x++) {
      double x_d = (double) x + ox - w2 ;
      double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
      double s = 10000.0;
      if (r<radius)
 	        s = a * r + 1.0 - a;
      double Dx = s*(x_d * cost - y_d * sint) + sw2;
      double Dy = s*(x_d * sint + y_d * cost) + sh2;
      bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
      // Convert only valid pixels
      if (valid) {
        // Extract integer and fractions of source screen coordinates
        int xc  =  (int) floor (Dx) ; Dx -= (double)xc;
        int yc  =  (int) floor (Dy) ; Dy -= (double)yc;
        int ys = yc +1 - n2 ; // smallest y-index used for interpolation
        int xs = xc +1 - n2 ; // smallest x-index used for interpolation
        unsigned short sr[2][2], sg[2][2], sb[2][2];
        if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)   // all interpolation pixels inside image
          cubint (original, xs, ys, Dx, Dy, &(transformed->r[y][x]), &(transformed->g[y][x]), &(transformed->b[y][x]));
        else { // edge pixels
          transformed->r[y][x] = 0;
          transformed->g[y][x] = 0;
          transformed->b[y][x] = 0;
        }
      }
      else {
        // not valid (source pixel x,y not inside source image, etc.)
        transformed->r[y][x] = 0;
        transformed->g[y][x] = 0;
        transformed->b[y][x] = 0;
      }
    }
  }
 }*/
 }
--- a/rtengine/labimage.cc
+++ b/rtengine/labimage.cc
@ -0,0 +1,42 @@
 #include <labimage.h>
 namespace rtengine {
 LabImage::LabImage (int w, int h) : W(w), H(h), fromImage(false) {
    L = new unsigned short*[H];
    for (int i=0; i<H; i++)
        L[i] = new unsigned short[W];
    a = new short*[H];
    for (int i=0; i<H; i++)
        a[i] = new short[W];
    b = new short*[H];
    for (int i=0; i<H; i++)
        b[i] = new short[W];
 }
 LabImage::LabImage (Image16* im) {
    W = im->width;
    H = im->height;
    L = im->r;
    a = (short**) im->g;
    b = (short**) im->b;
    fromImage = true;
 }
 LabImage::~LabImage () {
    if (!fromImage) {
        for (int i=0; i<H; i++) {
            delete [] L[i];
            delete [] a[i];
            delete [] b[i];
        }
        delete [] L;
        delete [] a;
        delete [] b;
    }
 }
 }
--- a/rtengine/bilateral2.cc
+++ b/rtengine/bilateral2.cc
@ -7,7 +7,7 @@
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
- * 
+ *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
@ -16,20 +16,26 @@
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <bilateral2.h>
+#ifndef _LABIMAGE_H_
 #define _LABIMAGE_H_
-void bilateral_unsigned (unsigned short** src, unsigned short** dst, unsigned short** buffer, Dim dim, double sigma, double sens) {
+#include <image16.h>
-    bilateral<unsigned short, unsigned int> (src, dst, buffer, dim, sigma, sens);
+namespace rtengine {
 class LabImage {
    private:
        bool fromImage;
    public:
        int W, H;
        unsigned short** L;
        short** a;
        short** b;
     LabImage (int w, int h);
     LabImage (Image16* im);
    ~LabImage ();
 };
 }
-
+#endif
 void bilateral_signed (short** src, short** dst, short** buffer, Dim dim, double sigma, double sens) {
    bilateral<short, int> (src, dst, buffer, dim, sigma, sens);
 }
 void bilateral_box_unsigned (unsigned short** src, unsigned short** dst, int W, int H, int sigmar, double sigmas, bilateralparams row) {
    bilateral<unsigned short> (src, dst, W, H, sigmar, sigmas, row.row_from, row.row_to);
 }
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@ -206,7 +206,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
        gmi = 1024.0 * gm / mul_lum;
        bmi = 1024.0 * bm / mul_lum;  
    }
-    // resize to requested with and perform coarse transformation
+    // resize to requested width and perform coarse transformation
    int rwidth;
    if (params.coarse.rotate==90 || params.coarse.rotate==270) {
        rwidth = rheight;
@ -269,7 +269,9 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    int fw = baseImg->width;
    int fh = baseImg->height;
-    ImProcFunctions ipf;
+    ImProcFunctions ipf (&params, false);
    ipf.setScale (sqrt(double(fw*fw+fh*fh))/sqrt(double(thumbImg->width*thumbImg->width+thumbImg->height*thumbImg->height))*scale);
    unsigned int* hist16 = new unsigned int [65536];
    ipf.firstAnalysis (baseImg, &params, hist16, isRaw ? 2.2 : 0.0);
@ -299,7 +301,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
            for (int i=0; i<fh; i++)
                buffer[i] = new unsigned short[fw];
        }
-        shmap = new SHMap (fw, fh);
+        shmap = new SHMap (fw, fh, false);
        double radius = sqrt (double(fw*fw+fh*fh)) / 2.0;
        double shradius = radius / 1800.0 * params.sh.radius;
        shmap->update (baseImg, buffer, shradius, ipf.lumimul, params.sh.hq);
@ -321,7 +323,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    CurveFactory::complexCurve (br, bl/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, logDefGain, isRaw ? 2.2 : 0, true, params.toneCurve.curve, hist16, curve, NULL, 16);
    LabImage* labView = new LabImage (baseImg);
-    ipf.rgbProc (baseImg, labView, &params, curve, shmap);
+    ipf.rgbProc (baseImg, labView, curve, shmap);
    if (shmap)
        delete shmap;
@ -341,12 +343,11 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    delete [] hist16;
    // color processing
-    ipf.colorCurve (labView, labView, &params);
+    ipf.colorCurve (labView, labView);
    // obtain final image
    Image8* readyImg = new Image8 (fw, fh);
    ipf.lab2rgb (labView, readyImg);
    ipf.release ();
    delete baseImg;
    // calculate scale
@ -453,8 +454,7 @@ void Thumbnail::getSpotWB (const procparams::ProcParams& params, int xp, int yp,
        fw = thumbImg->height;
        fh = thumbImg->width;
    }
-    ImProcFunctions ipf;  
+    ImProcFunctions::transCoord (&params, fw, fh, points, red, green, blue);
    ipf.transCoord (&params, fw, fh, points, red, green, blue);
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
    if (params.coarse.rotate==180) tr |= TR_R180;
--- a/rtengine/shmap.cc
+++ b/rtengine/shmap.cc
@ -29,7 +29,7 @@ namespace rtengine {
 extern const Settings* settings;
-SHMap::SHMap (int w, int h) : W(w), H(h) {
+SHMap::SHMap (int w, int h, bool multiThread) : W(w), H(h), multiThread(multiThread) {
    map = new unsigned short*[H];
    for (int i=0; i<H; i++)
@ -52,51 +52,27 @@ void SHMap::update (Image16* img, unsigned short** buffer, double radius, double
 			map[i][j] = CLIP(val);
 		}
 //MyTime t1,t2;
 //t1.set ();
    if (!hq) {
    	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());
-        AlignedBuffer<double>* buffer1 = new AlignedBuffer<double> (MAX(W,H)*5);
+    	gaussHorizontal<unsigned short> (map, map, buffer, W, H, radius, multiThread);
-        AlignedBuffer<double>* buffer2 = new AlignedBuffer<double> (MAX(W,H)*5);
+		gaussVertical<unsigned short>   (map, map, buffer, W, H, radius, multiThread);
-        // blur
+        delete buffer;
        if (settings->dualThreadEnabled) {
            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussHorizontal_unsigned), map, map, buffer1, W, 0, H/2, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussHorizontal_unsigned), map, map, buffer2, W, H/2, H, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            thread1->join ();
            thread2->join ();
            thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussVertical_unsigned), map, map, buffer1, H, 0, W/2, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussVertical_unsigned), map, map, buffer2, H, W/2, W, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
            thread1->join ();
            thread2->join ();
        }
        else {
            gaussHorizontal_unsigned (map, map, buffer1, W, 0, H, radius);
            gaussVertical_unsigned (map, map, buffer1, H, 0, W, radius);
        }    
        delete buffer1;
        delete buffer2;
    }
    else {
-        if (settings->dualThreadEnabled) {
+		#pragma omp parallel if (multiThread)
-            bilateralparams r1, r2;
+    	{
-            r1.row_from = 0;
+    		int tid = omp_get_thread_num();
-            r1.row_to = H/2;
+    		int nthreads = omp_get_num_threads();
-            r2.row_from = H/2;
+    		int blk = H/nthreads;
-            r2.row_to = H;
+
-            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(bilateral_box_unsigned), map, buffer, W, H, 8000, radius, r1), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+    		if (tid<nthreads-1)
-            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(bilateral_box_unsigned), map, buffer, W, H, 8000, radius, r2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+    			bilateral<unsigned short> (map, buffer, W, H, 8000, radius, tid*blk, (tid+1)*blk);
-            thread1->join ();
+    		else
-            thread2->join ();
+    			bilateral<unsigned short> (map, buffer, W, H, 8000, radius, tid*blk, H);
-        }
+		}
-        else {
+        // anti-alias filtering the result
            bilateralparams r1;
            r1.row_from = 0;
            r1.row_to = H;
            bilateral_box_unsigned (map, buffer, W, H, 8000, radius, r1);
        }
        for (int i=0; i<H; i++)
            for (int j=0; j<W; j++)
                if (i>0 && j>0 && i<H-1 && j<W-1)
@ -105,9 +81,6 @@ void SHMap::update (Image16* img, unsigned short** buffer, double radius, double
                    map[i][j] = buffer[i][j];
    }
 //    t2.set ();
 //    printf ("shmap: %d\n", t2.etime (t1));
    // update average, minimum, maximum
    double _avg = 0;
    int n = 1;
--- a/rtengine/shmap.h
+++ b/rtengine/shmap.h
@ -29,8 +29,9 @@ class SHMap {
        int W, H;
        unsigned short** map;
        unsigned short   max, min, avg;
        bool multiThread;
-     SHMap (int w, int h);
+     SHMap (int w, int h, bool multiThread);
    ~SHMap ();
    void update (Image16* img, unsigned short** buffer, double radius, double lumi[3], bool hq);
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@ -69,7 +69,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    int fw, fh;
    imgsrc->getFullSize (fw, fh, tr);
-    ImProcFunctions ipf;
+    ImProcFunctions ipf (&params, true);
    Image16* baseImg;
    PreviewProps pp (0, 0, fw, fh, 1);
@ -83,7 +83,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
        fw *= params.resize.scale;
        fh *= params.resize.scale;
        baseImg = new Image16 (fw, fh);
-        ipf.resize (oorig, baseImg, params.resize);
+        ipf.resize (oorig, baseImg);
        delete oorig;
    }
    if (pl) 
@ -117,7 +117,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    SHMap* shmap = NULL;
    if (params.sh.enabled) {
-        shmap = new SHMap (fw, fh);
+        shmap = new SHMap (fw, fh, true);
        double radius = sqrt (double(fw*fw+fh*fh)) / 2.0;
        double shradius = radius / 1800.0 * params.sh.radius;
        shmap->update (baseImg, (unsigned short**)buffer, shradius, ipf.lumimul, params.sh.hq);
@ -138,7 +138,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    CurveFactory::complexCurve (br, bl/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, imgsrc->getDefGain(), imgsrc->getGamma(), true, params.toneCurve.curve, hist16, curve, NULL);
    LabImage* labView = new LabImage (baseImg);
-    ipf.rgbProc (baseImg, labView, &params, curve, shmap);
+    ipf.rgbProc (baseImg, labView, curve, shmap);
    if (shmap)
        delete shmap;
@ -155,15 +155,15 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    // luminance processing
    CurveFactory::complexCurve (0.0, 0.0, 0.0, 0.0, params.lumaCurve.brightness, params.lumaCurve.contrast, 0.0, 0.0, false, params.lumaCurve.curve, hist16, curve, NULL);
    ipf.luminanceCurve (labView, labView, curve, 0, fh);
-    ipf.lumadenoise (labView, &params, 1, buffer);
+    ipf.lumadenoise (labView, buffer);
-    ipf.sharpening (labView, &params, 1, (unsigned short**)buffer);
+    ipf.sharpening (labView, (unsigned short**)buffer);
    delete [] curve;
    delete [] hist16;
    // color processing
-    ipf.colorCurve (labView, labView, &params);
+    ipf.colorCurve (labView, labView);
-    ipf.colordenoise (labView, &params, 1, buffer);
+    ipf.colordenoise (labView, buffer);
    for (int i=0; i<fh; i++)
        delete [] buffer[i];
@ -182,7 +182,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
        ch = params.crop.h;
    }
    readyImg = ipf.lab2rgb16 (labView, cx, cy, cw, ch, params.icm.output);
    ipf.release ();
    if (pl) 
        pl->setProgress (1.0);