ImProcFunctions cleanup and transition to OpenMP -- phase 1

2010-05-21 16:14:18 +02:00 · 2010-05-21 16:14:18 +02:00 · cd91e7890e
commit cd91e7890e
parent c427279ce3
19 changed files with 1944 additions and 2856 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -116,6 +116,11 @@ if (WITH_RAWZOR)
    endif (WIN32)
 endif (WITH_RAWZOR)

+find_package(OpenMP)
+if (OPENMP_FOUND)
+	set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS})
+endif (OPENMP_FOUND)
+
 add_subdirectory (rtexif)
 add_subdirectory (rtengine)
 add_subdirectory (rtgui)
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@ -6,11 +6,12 @@ link_directories (${CMAKE_CURRENT_SOURCE_DIR}/../rtexif ${EXTRA_LIBDIR} ${GTHREA
    ${GOBJECT_LIBRARY_DIRS} ${GLIB2_LIBRARY_DIRS} ${GLIBMM_LIBRARY_DIRS}
    ${IPTCDATA_LIBRARY_DIRS} ${LCMS_LIBRARY_DIRS})

-set (RTENGINESOURCEFILES colortemp.cc curves.cc dcraw.cc gauss.cc iccstore.cc
+set (RTENGINESOURCEFILES colortemp.cc curves.cc dcraw.cc iccstore.cc
    image8.cc image16.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc
    loadinitial.cc procparams.cc rawimagesource.cc shmap.cc simpleprocess.cc refreshmap.cc
    stdimagesource.cc myfile.cc iccjpeg.c hlmultipliers.cc improccoordinator.cc
-    processingjob.cc rtthumbnail.cc utils.cc bilateral2.cc)
+    processingjob.cc rtthumbnail.cc utils.cc labimage.cc
+    iplab2rgb.cc ipsharpen.cc iptransform.cc ipresize.cc)

 add_library (rtengine ${RTENGINESOURCEFILES})
 #It may be nice to store library version too
--- a/rtengine/bilateral2.h
+++ b/rtengine/bilateral2.h
@ -26,6 +26,9 @@
 #include <mytime.h>
 #include <gauss.h>
 #include <glibmm.h>
+#include <omp.h>
+
+// This seems ugly, but way faster than any other solutions I tried

 #define ELEM(a,b) (src[i - a][j - b] * ec[src[i - a][j - b]-src[i][j]+0x10000])
 #define SULY(a,b) (ec[src[i - a][j - b]-src[i][j]+0x10000])
@ -34,22 +37,22 @@
                        int* ec = new int [0x20000]; \
                        for (int i=0; i<0x20000; i++) \
                            ec[i] = (int)(exp(-(double)(i-0x10000)*(double)(i-0x10000) / (2.0*sens*sens))*scale); \
-                        int start = row_from; \
-                        if (start<(b)) start = (b); \
-                        int end = row_to; \
-                        if (end>H-(b)) end = H-(b); \
-                        for (int i=start; i<end; i++) { \
-                            for (int j=(b); j<W-(b); j++) {
+                        int rstart = b; \
+                        int rend = H-b; \
+                        int cstart = b; \
+                        int cend = W-b;
                            
 #define BL_END(b)       buffer[i][j] = v; }} delete [] ec; \
-                        for (int i=row_from; i<row_to; i++)  \
+                        for (int i=0; i<H; i++)  \
                            for (int j=0; j<W; j++)  \
-                                if (i<start || j<(b) || i>=end || j>=W-(b)) \
+                                if (i<rstart || j<cstart || i>=rend || j>=cend) \
                                    dst[i][j] = src[i][j]; \
                                else \
                                    dst[i][j] = buffer[i][j];

-#define BL_OPER3(a11,a12,a21,a22) A v = a11*ELEM(-1,-1) + a12*ELEM(-1,0) + a11*ELEM(-1,1) + \
+#define BL_OPER3(a11,a12,a21,a22) for (int i=rstart; i<rend; i++) { \
+									for (int j=cstart; j<cend; j++) { \
+								   A v = a11*ELEM(-1,-1) + a12*ELEM(-1,0) + a11*ELEM(-1,1) + \
                                        a21*ELEM(0,-1) + a22*ELEM(0,0) + a21*ELEM(0,1) + \
                                        a11*ELEM(1,-1) + a12*ELEM(1,0) + a11*ELEM(1,1); \
                                   v /= a11*SULY(-1,-1) + a12*SULY(-1,0) + a11*SULY(-1,1) + \
@ -57,7 +60,9 @@
                                        a11*SULY(1,-1) + a12*SULY(1,0) + a11*SULY(1,1); 


-#define BL_OPER5(a11,a12,a13,a21,a22,a23,a31,a32,a33) A v = a11*ELEM(-2,-2) + a12*ELEM(-2,-1) + a13*ELEM(-2,0) + a12*ELEM(-2,1) + a11*ELEM(-2,2) + \
+#define BL_OPER5(a11,a12,a13,a21,a22,a23,a31,a32,a33) for (int i=rstart; i<rend; i++) { \
+														for (int j=cstart; j<cend; j++) { \
+													   A v = a11*ELEM(-2,-2) + a12*ELEM(-2,-1) + a13*ELEM(-2,0) + a12*ELEM(-2,1) + a11*ELEM(-2,2) + \
                                                            a21*ELEM(-1,-2) + a22*ELEM(-1,-1) + a23*ELEM(-1,0) + a22*ELEM(-1,1) + a21*ELEM(-1,2) + \
                                                            a31*ELEM(0,-2) + a32*ELEM(0,-1) + a33*ELEM(0,0) + a32*ELEM(0,1) + a31*ELEM(0,2) + \
                                                            a21*ELEM(1,-2) + a22*ELEM(1,-1) + a23*ELEM(1,0) + a22*ELEM(1,1) + a21*ELEM(1,2) + \
@ -69,7 +74,9 @@
                                                            a11*SULY(2,-2) + a12*SULY(2,-1) + a13*SULY(2,0) + a12*SULY(2,1) + a11*SULY(2,2);

 #define BL_OPER7(a11,a12,a13,a14,a21,a22,a23,a24,a31,a32,a33,a34,a41,a42,a43,a44) \
-                                                      A v = a11*ELEM(-3,-3) + a12*ELEM(-3,-2) + a13*ELEM(-3,-1) + a14*ELEM(-3,0) + a13*ELEM(-3,1) + a12*ELEM(-3,2) + a11*ELEM(-3,3) + \
+                                                      for (int i=rstart; i<rend; i++) { \
+                                                    	  for (int j=cstart; j<cend; j++) { \
+													   A v = a11*ELEM(-3,-3) + a12*ELEM(-3,-2) + a13*ELEM(-3,-1) + a14*ELEM(-3,0) + a13*ELEM(-3,1) + a12*ELEM(-3,2) + a11*ELEM(-3,3) + \
                                                            a21*ELEM(-2,-3) + a22*ELEM(-2,-2) + a23*ELEM(-2,-1) + a24*ELEM(-2,0) + a23*ELEM(-2,1) + a22*ELEM(-2,2) + a21*ELEM(-2,3) + \
                                                            a31*ELEM(-1,-3) + a32*ELEM(-1,-2) + a33*ELEM(-1,-1) + a34*ELEM(-1,0) + a33*ELEM(-1,1) + a32*ELEM(-1,2) + a31*ELEM(-1,3) + \
                                                            a41*ELEM(0,-3) + a42*ELEM(0,-2) + a43*ELEM(0,-1) + a44*ELEM(0,0) + a43*ELEM(0,1) + a42*ELEM(0,2) + a41*ELEM(0,3) + \
@ -86,6 +93,8 @@
                                                       

 #define BL_OPER9(a11,a12,a13,a14,a15,a21,a22,a23,a24,a25,a31,a32,a33,a34,a35,a41,a42,a43,a44,a45,a51,a52,a53,a54,a55) \
+                                                      for (int i=rstart; i<rend; i++) { \
+                                                    	  for (int j=cstart; j<cend; j++) { \
                                                      A v = a11*ELEM(-4,-4) + a12*ELEM(-4,-3) + a13*ELEM(-4,-2) + a14*ELEM(-4,-1) + a15*ELEM(-4,0) + a14*ELEM(-4,1) + a13*ELEM(-4,2) + a12*ELEM(-4,3) + a11*ELEM(-4,4) + \
                                                            a21*ELEM(-3,-4) + a22*ELEM(-3,-3) + a23*ELEM(-3,-2) + a24*ELEM(-3,-1) + a25*ELEM(-3,0) + a24*ELEM(-3,1) + a23*ELEM(-3,2) + a22*ELEM(-3,3) + a21*ELEM(-3,4) + \
                                                            a31*ELEM(-2,-4) + a32*ELEM(-2,-3) + a33*ELEM(-2,-2) + a34*ELEM(-2,-1) + a35*ELEM(-2,0) + a34*ELEM(-2,1) + a33*ELEM(-2,2) + a32*ELEM(-2,3) + a31*ELEM(-2,4) + \
@ -106,6 +115,8 @@
                                                            a11*SULY(4,-4) + a12*SULY(4,-3) + a13*SULY(4,-2) + a14*SULY(4,-1) + a15*SULY(4,0) + a14*SULY(4,1) + a13*SULY(4,2) + a12*SULY(4,3) + a11*SULY(4,4); 

 #define BL_OPER11(a11,a12,a13,a14,a15,a16,a21,a22,a23,a24,a25,a26,a31,a32,a33,a34,a35,a36,a41,a42,a43,a44,a45,a46,a51,a52,a53,a54,a55,a56,a61,a62,a63,a64,a65,a66) \
+                                                      for (int i=rstart; i<rend; i++) { \
+														for (int j=cstart; j<cend; j++) { \
                                                      A v = a11*ELEM(-5,-5) + a12*ELEM(-5,-4) + a13*ELEM(-5,-3) + a14*ELEM(-5,-2) + a15*ELEM(-5,-1) + a16*ELEM(-5,0) + a15*ELEM(-5,1) + a14*ELEM(-5,2) + a13*ELEM(-5,3) + a12*ELEM(-5,4) + a11*ELEM(-5,5) + \
                                                            a21*ELEM(-4,-5) + a22*ELEM(-4,-4) + a23*ELEM(-4,-3) + a24*ELEM(-4,-2) + a25*ELEM(-4,-1) + a26*ELEM(-4,0) + a25*ELEM(-4,1) + a24*ELEM(-4,2) + a23*ELEM(-4,3) + a22*ELEM(-4,4) + a21*ELEM(-4,5) + \
                                                            a31*ELEM(-3,-5) + a32*ELEM(-3,-4) + a33*ELEM(-3,-3) + a34*ELEM(-3,-2) + a35*ELEM(-3,-1) + a36*ELEM(-3,0) + a35*ELEM(-3,1) + a34*ELEM(-3,2) + a33*ELEM(-3,3) + a32*ELEM(-3,4) + a31*ELEM(-3,5) + \
@ -131,452 +142,256 @@


 // sigma = 0.5
-template<class T, class A> void bilateral05 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral05 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    
    BL_BEGIN(318,1)
+	#pragma omp parallel for if (multiThread)
    BL_OPER3(1,7,7,55)
    BL_END(1)
 }

 // sigma = 0.6
-template<class T, class A> void bilateral06 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral06 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    
    BL_BEGIN(768,1)
+	#pragma omp parallel for if (multiThread)
    BL_OPER3(1,4,4,16)
    BL_END(1)
 }

 // sigma = 0.7
-template<class T, class A> void bilateral07 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral07 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    
    BL_BEGIN(366,2)
+	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,0,1,0,8,21,1,21,59)
    BL_END(2)
 }

 // sigma = 0.8
-template<class T, class A> void bilateral08 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral08 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
    
    BL_BEGIN(753,2)
+	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,0,1,0,5,10,1,10,23)
    BL_END(2)
 }

 // sigma = 0.9
-template<class T, class A> void bilateral09 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral09 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(595,2)
+	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,1,2,1,6,12,2,12,22)
    BL_END(2)
 }

 // sigma = 1.0
-template<class T, class A> void bilateral10 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral10 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(910,2)
+	#pragma omp parallel for if (multiThread)
    BL_OPER5(0,1,2,1,4,7,2,7,12)
    BL_END(2)
 }

 // sigma = 1.1
-template<class T, class A> void bilateral11 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral11 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(209,3)
+	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,2,5,8,1,5,18,27,1,8,27,41)
    BL_END(3)
 }

 // sigma = 1.2
-template<class T, class A> void bilateral12 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral12 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(322,3)
+	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,1,4,6,1,4,11,16,1,6,16,23)
    BL_END(3)
 }

 // sigma = 1.3
-template<class T, class A> void bilateral13 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral13 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(336,3)
+	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,0,1,1,0,2,4,6,1,4,11,14,1,6,14,19)
    BL_END(3)
 }

 // sigma = 1.4
-template<class T, class A> void bilateral14 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral14 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(195,3)
+	#pragma omp parallel for if (multiThread)
    BL_OPER7(0,1,2,3,1,4,8,10,2,8,17,21,3,10,21,28)
    BL_END(3)   
 }

 // sigma = 1.5
-template<class T, class A> void bilateral15 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral15 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(132,4)
+	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,0,1,1,0,1,2,4,5,0,2,6,12,14,1,4,12,22,28,1,5,14,28,35)
    BL_END(4)   
 }

 // sigma = 1.6
-template<class T, class A> void bilateral16 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral16 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(180,4)
+	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,0,1,1,0,1,2,3,4,0,2,5,9,10,1,3,9,15,19,1,4,10,19,23)
    BL_END(4)   
 }

 // sigma = 1.7
-template<class T, class A> void bilateral17 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral17 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(195,4)
+	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,1,1,0,1,2,3,4,1,2,5,8,9,1,3,8,13,16,1,4,9,16,19)
    BL_END(4)   
 }

 // sigma = 1.8
-template<class T, class A> void bilateral18 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral18 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(151,4)
+	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,2,2,0,1,3,5,5,1,3,6,10,12,2,5,10,16,19,2,5,12,19,22)
    BL_END(4)   
 }

 // sigma = 1.9
-template<class T, class A> void bilateral19 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral19 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {
   
    BL_BEGIN(151,4)
+	#pragma omp parallel for if (multiThread)
    BL_OPER9(0,0,1,2,2,0,1,3,4,5,1,3,5,8,9,2,4,8,12,14,2,5,9,14,16)
    BL_END(4)   
 }

 // sigma = 2
-template<class T, class A> void bilateral20 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral20 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(116,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,1,0,0,1,2,3,3,0,1,2,4,7,7,1,2,4,8,12,14,1,3,7,12,18,20,1,3,7,14,20,23)
    BL_END(5)   
 }

 // sigma = 2.1
-template<class T, class A> void bilateral21 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral21 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(127,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,1,0,0,1,2,3,3,0,1,2,4,6,7,1,2,4,8,11,12,1,3,6,11,15,17,1,3,7,12,17,19)
    BL_END(5)
 }

 // sigma = 2.2
-template<class T, class A> void bilateral22 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral22 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(109,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,0,1,1,2,0,1,2,3,3,4,1,2,3,5,7,8,1,3,5,9,12,13,1,3,7,12,16,18,2,4,8,13,18,20)
    BL_END(5)
 }

 // sigma = 2.3
-template<class T, class A> void bilateral23 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral23 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(132,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,3,5,6,7,1,2,5,7,10,11,1,3,6,10,13,14,1,3,7,11,14,16)
    BL_END(5)
 }

 // sigma = 2.4
-template<class T, class A> void bilateral24 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral24 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

    BL_BEGIN(156,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,3,4,5,6,1,2,4,6,8,9,1,3,5,8,10,11,1,3,6,9,11,12)
    BL_END(5)
 }

 // sigma = 2.5
-template<class T, class A> void bilateral25 (T** src, T** dst, T** buffer, int W, int H, int row_from, int row_to, double sens) {
+template<class T, class A> void bilateral25 (T** src, T** dst, T** buffer, int W, int H, double sens, bool multiThread) {

-    BL_BEGIN(173,5)
+	BL_BEGIN(173,5)
+	#pragma omp parallel for if (multiThread)
    BL_OPER11(0,0,1,1,1,1,0,1,1,2,3,3,1,1,2,4,5,5,1,2,4,5,7,7,1,3,5,7,9,9,1,3,5,7,9,10)
    BL_END(5)
 }

-class Dim {
-
-    public:
-        int W, H, row_from, row_to;
-        
-        Dim (int w, int h, int rf, int rt) : W(w), H(h), row_from(rf), row_to(rt) {}
-
-};
-
 // main bilateral filter
-template<class T, class A> void bilateral (T** src, T** dst, T** buffer, Dim dim, double sigma, double sens) {
-
-    int W = dim.W;
-    int H = dim.H;
-    int row_from = dim.row_from;
-    int row_to = dim.row_to;
+template<class T, class A> void bilateral (T** src, T** dst, T** buffer, int W, int H, double sigma, double sens, bool multiThread) {

    if (sigma<0.45) 
-        for (int i=row_from; i<row_to; i++) {
+		#pragma omp parallel for if (multiThread)
+    	for (int i=0; i<H; i++) {
            memcpy (buffer[i], src[i], W*sizeof(T));
            memcpy (dst[i], buffer[i], W*sizeof(T));
        }
    else if (sigma<0.55)
-        bilateral05<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral05<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.65)
-        bilateral06<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral06<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.75)
-        bilateral07<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral07<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.85)
-        bilateral08<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral08<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<0.95)
-        bilateral09<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral09<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.05)
-        bilateral10<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral10<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.15)
-        bilateral11<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral11<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.25)
-        bilateral12<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral12<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.35)
-        bilateral13<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral13<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.45)
-        bilateral14<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral14<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.55)
-        bilateral15<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral15<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.65)
-        bilateral16<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral16<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.75)
-        bilateral17<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral17<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.85)
-        bilateral18<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral18<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<1.95)
-        bilateral19<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral19<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.05)
-        bilateral20<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral20<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.15)
-        bilateral21<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral21<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.25)
-        bilateral22<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral22<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.35)
-        bilateral23<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral23<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else if (sigma<2.45)
-        bilateral24<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral24<T,A> (src, dst, buffer, W, H, sens, multiThread);
    else 
-        bilateral25<T,A> (src, dst, buffer, W, H, row_from, row_to, sens);
+        bilateral25<T,A> (src, dst, buffer, W, H, sens, multiThread);
 }

-void bilateral_unsigned (unsigned short** src, unsigned short** dst, unsigned short** buffer, Dim dim, double sigma, double sens);
-void bilateral_signed (short** src, short** dst, short** buffer, Dim dim, double sigma, double sens);
-
-/*
-template<class T> void bilateral (T** src, int** dst, int W, int H, int sigmar, double sigmas) {
-
-  time_t t1 = clock ();
-
-  int r = 2.0*sigmas + 0.5;
-
-  double scaleg = 1024.0;
-
-  double MAXINT = 65536.0*65536.0;
-  double sgmax = 275000/(MAXINT/2.0/sigmar/sigmar+(r+1)*(r+1)/sigmas/sigmas);
-  printf ("sgmax = %lf\n", sgmax);
-  if (scaleg>sgmax)
-    scaleg = sgmax;
-
-  // kernel vector for the spatial gaussian filter * 1024
-  int* kernel = new int[1+2*r];
-  for (int i=0; i<2*r+1; i++) {
-    kernel[i] = scaleg / (2.0*sigmas*sigmas) * (i-r)*(i-r) + 0.25;
-    printf ("kerneli = %d\n", kernel[i]);
-  }
-
-  // exponential lookup table
-  int scale = (2.0*sigmar*sigmar) / scaleg;
-  int scalem = 65535/((1+2*r)*(1+2*r));
-  int ec[256000];
-  for (int i=0; i<256000; i++)
-    ec[i] = exp (-i/scaleg) * scalem;
-
-  for (int i=r; i<H-r; i++) {
-    for (int j=r; j<W-r; j++) {
-      int sum = 0.0;
-      int val = 0.0;
-      int c = src[i][j];
-      for (int x=-r; x<=r; x++)
-        for (int y=-r; y<=r; y++) {
-          unsigned int d = src[i-x][j-y];
-          unsigned int mul = ec[(c-d)*(c-d)/scale + kernel[x+r] + kernel[y+r]];
-//          if (mul<275000) {
-            val += d*mul;
-            sum += mul;
-//          }
-//          else {
-//            printf ("out!!!\n");
-//          }
-        }
-      dst[i][j] = val / sum;
-    }
-  }
-  delete [] kernel;
-
-  time_t t2 = clock ();
-  printf ("bilateral: %d\n", t2-t1);
-}
-*/
+// START OF EXPERIMENTAL CODE: O(1) bilateral box filter
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 #define MAXVAL  0xffff
 #define CLIP(a) ((a)>0?((a)<MAXVAL?(a):MAXVAL):0)
-/*
-template<class T> void bilateral (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int H, double sigmar, double sigmas) {
-
-  time_t t1 = clock ();
-
-    float alpha = 0.5 / sigmar / sigmar;
-
-    // buffer for the final image
-    float** buff_final = new float*[H];
-    for (int i=0; i<H; i++) {
-        buff_final[i] = new float[W];
-        memset (buff_final[i], 0, W*sizeof(float));
-    }
-    // buffer for the normalization 
-    float** buff_norm = new float*[H];
-    for (int i=0; i<H; i++) {
-        buff_norm[i] = new float[W];
-        for (int j=0; j<W; j++) 
-            buff_norm[i][j] = 1.0;
-    }
-    // temporary buffer
-    float** buff_temp = new float*[H];
-    for (int i=0; i<H; i++) 
-        buff_temp[i] = new float[W];
-
-    // second order gaussian filter approximation
-    // first filtered image: temp <-- y_1
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = src[i][j];
-
-printf ("1: %g\n", buff_temp[100][100]);
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-printf ("2: %g\n", buff_temp[100][100]);
-    
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] += 2.0*alpha*src[i][j]*buff_temp[i][j]; 
-            buff_final[i][j] += buff_temp[i][j];
-        }
-
-printf ("3: %g\n", buff_norm[100][100]);
-printf ("4: %g\n", buff_final[100][100]);
-        
-    // second filtered image: temp <-- y_2
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] += alpha*(2.0*alpha*src[i][j]*src[i][j]-1.0)*buff_temp[i][j]; 
-            buff_final[i][j] += 2.0*alpha*src[i][j]*buff_temp[i][j];
-        }
-
-printf ("5: %g\n", buff_norm[100][100]);
-printf ("6: %g\n", buff_final[100][100]);
-
-    // third filtered image: temp <-- y_3
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] -= 2.0*alpha*alpha*src[i][j]*(1.0-2.0/3.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
-            buff_final[i][j] += alpha*(2.0*alpha*src[i][j]*src[i][j]-1.0)*buff_temp[i][j];
-        }
-
-    // fourth filtered image: temp <-- y_4
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] += alpha*alpha*(0.5-2.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
-            buff_final[i][j] -= 2.0*alpha*alpha*src[i][j]*(1.0-2.0/3.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
-        }
-    
-    // fifth filtered image: temp <-- y_5
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] += alpha*alpha*alpha*src[i][j]*buff_temp[i][j];
-            buff_final[i][j] += alpha*alpha*(0.5-2.0*alpha*src[i][j]*src[i][j])*buff_temp[i][j];
-        }
-
-    // sixth filtered image: temp <-- y_6
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_norm[i][j] -= alpha*alpha*alpha/6.0*buff_temp[i][j]; 
-            buff_final[i][j] += alpha*alpha*alpha*src[i][j]*buff_temp[i][j];
-        }
-
-    // seventh filtered image: temp <-- y_7, and finalizing
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++)
-            buff_temp[i][j] = (float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j]*(float)src[i][j];
-            
-    gaussHorizontal_float (buff_temp, buff_temp, buffer, W, 0, H, sigmas);
-    gaussVertical_float   (buff_temp, buff_temp, buffer, H, 0, W, sigmas);
-
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) {
-            buff_final[i][j] = (buff_final[i][j] - alpha*alpha*alpha/6.0*buff_temp[i][j]) / buff_norm[i][j];
-            dst[i][j] = (T)CLIP(buff_final[i][j]);
-        }
-
-    // cleanup
-    for (int i=0; i<H; i++) {
-        delete [] buff_temp[i];
-        delete [] buff_norm[i];
-        delete [] buff_final[i];
-    }
-     delete [] buff_temp;   
-     delete [] buff_norm;   
-     delete [] buff_final;   
-  time_t t2 = clock ();
-  printf ("bilateral: %d\n", t2-t1);
-}
-*/

 #define BINBIT 12
 #define TRANSBIT 4

-
 template<class T> void bilateral (T** src, T** dst, int W, int H, int sigmar, double sigmas, int row_from, int row_to) {

    // range weights
@ -654,95 +469,4 @@ template<class T> void bilateral (T** src, T** dst, int W, int H, int sigmar, do
        delete [] buff_final[i];
 }

-struct bilateralparams {
-
-    int row_from, row_to;
-};
-
-void bilateral_box_unsigned (unsigned short** src, unsigned short** dst, int W, int H, int sigmar, double sigmas, bilateralparams row);
-
-/*
-#define BINBIT 12
-#define TRANSBIT 4
-
-#define ADDHIST(a,b)  { for (int k=0; k<(1<<BINBIT); k++) (a)[k] = (a)[k]+(b)[k]; }
-#define SUBHIST(a,b)  { for (int k=0; k<(1<<BINBIT); k++) (a)[k] = (a)[k]-(b)[k]; }
-
-template<class T> void bilateral (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int H, int sigmar, double sigmas) {
-
-    time_t t1 = clock (); 
-
-    unsigned short** rowhist = new unsigned short* [H];
-    for (int i=0; i<H; i++) {
-        rowhist[i] = new unsigned short [1<<BINBIT];
-        memset (rowhist[i], 0, (1<<BINBIT)*sizeof(unsigned short));
-    }
-
-    unsigned short* hist = new unsigned short[1<<BINBIT];
-
-    int r = sigmas;
-
-    // buffer for the final image
-    float** buff_final = new float*[H];
-    for (int i=0; i<H; i++) {
-        buff_final[i] = new float[W];
-        memset (buff_final[i], 0, W*sizeof(float));
-    }
-
-    // fill the row histograms initially
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<2*r+1; j++)
-            rowhist[i][src[i][j]>>TRANSBIT]++;
-
-    // let the game begin...
-    for (int j=r+1; j<W-r-1; j++) {
-        // update row histograms
-        for (int i=0; i<H; i++) {
-            rowhist[i][src[i][j-r-1]>>TRANSBIT]--;
-            rowhist[i][src[i][j+r]>>TRANSBIT]++;
-        }
-        // sum up upper histograms
-        memset (hist, 0, (1<<BINBIT)*sizeof(unsigned short));
-        for (int i=0; i<2*r+1; i++)
-            ADDHIST(hist, rowhist[i]);
-        for (int i=r+1; i<H-r-1; i++) {
-            SUBHIST (hist, rowhist[i-r-1]);
-            ADDHIST (hist, rowhist[i+r]);
-            // calculate pixel value
-            float weight = 0.0;
-            for (int k=0; k<=(sigmar>>TRANSBIT); k++) {               
-                float w = 1.0 - (double)k/(sigmar>>TRANSBIT);
-                int v = src[i][j]>>TRANSBIT;
-                if (v-k >= 0) {
-                    weight += hist [v-k] * w;
-                    buff_final[i][j] += hist [v-k] * w * (src[i][j]-(k<<TRANSBIT));
-                }
-                if (v+k < (1<<BINBIT)) {
-                    weight += hist [v+k] * w;
-                    buff_final[i][j] += hist [v+k] * w * (src[i][j]+(k<<TRANSBIT));
-                }
-            }
-            buff_final[i][j] /= weight;
-        }
-    }
-    for (int i=0; i<H; i++) 
-        for (int j=0; j<W; j++) 
-            if (i<r+1 || i>H-r-1 || j<r+1 || j>W-r-1)
-                dst[i][j] = src[i][j];
-            else
-                dst[i][j] = (T)CLIP(buff_final[i][j]);
-
-    delete [] hist;
-    for (int i=0; i<H; i++) {
-        delete [] buff_final[i];
-        delete [] rowhist[i];
-    }
-    delete [] buff_final;
-    delete [] rowhist;
-
-    time_t t2 = clock ();
-    printf ("bilateral: %d\n", t2-t1);
-}
-
-*/
 #endif
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@ -74,6 +74,8 @@ void Crop::update (int todo, bool internal) {

    MyTime t1,t2,t3,t4,t5,t6,t7,t8,t9;

+    parent->ipf.setScale (skip);
+
    t1.set ();
    // give possibility to the listener to modify crop window (as the full image dimensions are already known at this point)
    int wx, wy, ww, wh, ws;
@ -130,7 +132,7 @@ void Crop::update (int todo, bool internal) {
            }
            if (!resizeCrop)
                resizeCrop = new Image16 (rcw, rch);
-            parent->ipf.resize (origCrop, resizeCrop, params.resize);
+            parent->ipf.resize (origCrop, resizeCrop);
            baseCrop = resizeCrop;
        }
        parent->minit.unlock ();
@ -173,7 +175,7 @@ void Crop::update (int todo, bool internal) {
    if (settings->verbose) printf ("C-BLURMAP: %d\n", t4.etime(t3));

    if (todo & M_RGBCURVE) {
-        parent->ipf.rgbProc (baseCrop, laboCrop, &params, parent->tonecurve, cshmap);
+        parent->ipf.rgbProc (baseCrop, laboCrop, parent->tonecurve, cshmap);
    }

    t5.set ();
@ -182,8 +184,8 @@ void Crop::update (int todo, bool internal) {
    if (todo & M_LUMINANCE) {
        parent->ipf.luminanceCurve (laboCrop, labnCrop, parent->lumacurve, 0, croph);
        if (skip==1) {
-            parent->ipf.lumadenoise (labnCrop, &params, 1, cbuffer);
-            parent->ipf.sharpening (labnCrop, &params, 1, (unsigned short**)cbuffer);
+            parent->ipf.lumadenoise (labnCrop, cbuffer);
+            parent->ipf.sharpening (labnCrop, (unsigned short**)cbuffer);
        }
    }

@ -191,9 +193,9 @@ void Crop::update (int todo, bool internal) {
    if (settings->verbose) printf ("C-LUMINANCE: %d\n", t6.etime(t5));

    if (todo & M_COLOR) {
-        parent->ipf.colorCurve (laboCrop, labnCrop, &params);
+        parent->ipf.colorCurve (laboCrop, labnCrop);
        if (skip==1)
-            parent->ipf.colordenoise (labnCrop, &params, 1, cbuffer);
+            parent->ipf.colordenoise (labnCrop, cbuffer);
    }

    t7.set ();
@ -290,7 +292,7 @@ if (settings->verbose) printf ("setcropsizes before lock\n");
    // determine which part of the source image is required to compute the crop rectangle
    int orx, ory, orw, orh;
    ProcParams& params = parent->params;
-    parent->ipf.transCoord (&params, parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
+    ImProcFunctions::transCoord (&params, parent->fw, parent->fh, bx1, by1, bw, bh, orx, ory, orw, orh);
        
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
@ -324,7 +326,7 @@ if (settings->verbose) printf ("setcropsizes before lock\n");
        laboCrop = new LabImage (cropw, croph);    
        labnCrop = new LabImage (cropw, croph);    
        cropImg = new Image8 (cropw, croph);
-        cshmap = new SHMap (cropw, croph);
+        cshmap = new SHMap (cropw, croph, true);
        
        cbuffer = new int*[croph];
        for (int i=0; i<croph; i++)
--- a/rtengine/gauss.cc
+++ b/rtengine/gauss.cc
@ -1,49 +0,0 @@
-/*
- *  This file is part of RawTherapee.
- *
- *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
- *
- *  RawTherapee is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- * 
- *  RawTherapee is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
- */
-#include <gauss.h>
-
-void gaussHorizontal_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
-
-    gaussHorizontal<unsigned short> (src, dst, buffer, W, row_from, row_to, sigma);
-}
-
-void gaussVertical_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
-
-    gaussVertical<unsigned short> (src, dst, buffer, H, col_from, col_to, sigma);
-}
-
-void gaussHorizontal_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
-
-    gaussHorizontal<short> (src, dst, buffer, W, row_from, row_to, sigma);
-}
-
-void gaussVertical_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
-
-    gaussVertical<short> (src, dst, buffer, H, col_from, col_to, sigma);
-}
-
-void gaussHorizontal_float (float** src, float** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
-
-    gaussHorizontal<float> (src, dst, buffer, W, row_from, row_to, sigma);
-}
-
-void gaussVertical_float (float** src, float** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
-
-    gaussVertical<float> (src, dst, buffer, H, col_from, col_to, sigma);
-}
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@ -23,453 +23,45 @@
 #include <string.h> 
 #include <math.h>
 #include <alignedbuffer.h>
+#include <omp.h>

-#define NOSSE 1
+// classical filtering if the support window is small:

-#ifndef NOSSE
-#include <xmmintrin.h>
-template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const float c0, const float c1) {
+template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {

-    typedef float pfloat[4];
-    pfloat* temp = (pfloat*)buffer + 1;
-    pfloat* tmp = (pfloat*)buffer;
-
-    __m128 xmm1; xmm1 = _mm_load1_ps (&c0);
-    __m128 xmm2; xmm2 = _mm_load1_ps (&c1);
-    __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6;
-    __m128 xmm0;
-
-    int i;
-    for (i = row_from; i<row_to-3; i+=4) {
-        T* row1 = src[i];
-        T* row2 = src[i+1];
-        T* row3 = src[i+2];
-        T* row4 = src[i+3];
-        (*tmp)[0] = (float)row1[0];
-        (*tmp)[1] = (float)row2[0];
-        (*tmp)[2] = (float)row3[0];
-        (*tmp)[3] = (float)row4[0];
-        xmm3 = _mm_load_ps ((float*)tmp);   // previous element
-        (*tmp)[0] = (float)row1[1];
-        (*tmp)[1] = (float)row2[1];
-        (*tmp)[2] = (float)row3[1];
-        (*tmp)[3] = (float)row4[1];
-        xmm4 = _mm_load_ps ((float*)tmp);   // current element
-
-        for (int j=1; j<W-1; j++) {
-            (*tmp)[0] = (float)row1[j+1];
-            (*tmp)[1] = (float)row2[j+1];
-            (*tmp)[2] = (float)row3[j+1];
-            (*tmp)[3] = (float)row4[j+1];
-            xmm5 = _mm_load_ps ((float*)tmp);   // next element
-            // compute blured pixel
-            xmm0 = _mm_mul_ps (xmm3, xmm2);
-            xmm6 = _mm_mul_ps (xmm5, xmm2);
-            xmm0 = _mm_add_ps (xmm6, xmm0);
-            xmm6 = _mm_mul_ps (xmm4, xmm1);
-            xmm0 = _mm_add_ps (xmm6, xmm0);
-            // store blured pixel
-            _mm_store_ps ((float*)&temp[j], xmm0);
-            // update prev and current
-            xmm3 = xmm4;
-            xmm4 = xmm5;
-        }
-        for (int j=1; j<W-1; j++) {
-          dst[i][j]   = (T)temp[j][0];
-          dst[i+1][j] = (T)temp[j][1];
-          dst[i+2][j] = (T)temp[j][2];
-          dst[i+3][j] = (T)temp[j][3];
-        }                    
-        dst[i][0] = src[i][0];
-        dst[i+1][0] = src[i+1][0];
-        dst[i+2][0] = src[i+2][0];
-        dst[i+3][0] = src[i+3][0];
-        dst[i][W-1] = src[i][W-1];
-        dst[i+1][W-1] = src[i+1][W-1];
-        dst[i+2][W-1] = src[i+2][W-1];
-        dst[i+3][W-1] = src[i+3][W-1];
-    }
-    for (; i<row_to; i++) {
+	#pragma omp parallel for if (multiThread)
+    for (int i=0; i<H; i++) {
+    	T* temp = buffer + omp_get_thread_num() * W;
        for (int j=1; j<W-1; j++)
-            buffer[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
+            temp[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
        dst[i][0] = src[i][0];
-        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
+        memcpy (dst[i]+1, temp+1, (W-2)*sizeof(T));
        dst[i][W-1] = src[i][W-1];
    }
 }

-/*template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const int c0, const int c1) {
+template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int W, int H, const float c0, const float c1, bool multiThread) {
    
-    time_t t1 = clock ();
-
-    const int csum = c0 + 2 * c1;    
-    for (int i = row_from; i<row_to; i++) {
-        for (int j=1; j<W-1; j++)
-            buffer[j] = (c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]) / csum;
-        dst[i][0] = src[i][0];
-        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
-        dst[i][W-1] = src[i][W-1];
-    }
-    printf ("horizontal time = %d\n", clock()-t1);
-}
-*/
-
-template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {
-    
-    typedef float pfloat[4];
-    pfloat* temp = (pfloat*)buffer + 1;
-    pfloat* tmp = (pfloat*)buffer;
-
-    __m128 xmm1; xmm1 = _mm_load1_ps (&c0);
-    __m128 xmm2; xmm2 = _mm_load1_ps (&c1);
-    __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6;
-    __m128 xmm0;
-
-    int i;
-    for (i = col_from; i<col_to-3; i+=4) {
-        (*tmp)[0] = (float)src[0][i];
-        (*tmp)[1] = (float)src[0][i+1];
-        (*tmp)[2] = (float)src[0][i+2];
-        (*tmp)[3] = (float)src[0][i+3];
-        xmm3 = _mm_load_ps ((float*)tmp);   // previous element
-        (*tmp)[0] = (float)src[1][i];
-        (*tmp)[1] = (float)src[1][i+1];
-        (*tmp)[2] = (float)src[1][i+2];
-        (*tmp)[3] = (float)src[1][i+3];
-        xmm4 = _mm_load_ps ((float*)tmp);   // current element
-
-        for (int j=1; j<H-1; j++) {
-            (*tmp)[0] = (float)src[j+1][i];
-            (*tmp)[1] = (float)src[j+1][i+1];
-            (*tmp)[2] = (float)src[j+1][i+2];
-            (*tmp)[3] = (float)src[j+1][i+3];
-            xmm5 = _mm_load_ps ((float*)tmp);   // next element
-            // compute blured pixel
-            xmm0 = _mm_mul_ps (xmm3, xmm2);
-            xmm6 = _mm_mul_ps (xmm5, xmm2);
-            xmm0 = _mm_add_ps (xmm6, xmm0);
-            xmm6 = _mm_mul_ps (xmm4, xmm1);
-            xmm0 = _mm_add_ps (xmm6, xmm0);
-            // store blured pixel
-            _mm_store_ps ((float*)&temp[j], xmm0);
-            // update prev and current
-            xmm3 = xmm4;
-            xmm4 = xmm5;
-        }
-        for (int j=1; j<H-1; j++) {
-            dst[j][i]   = (T)temp[j][0];
-            dst[j][i+1] = (T)temp[j][1];
-            dst[j][i+2] = (T)temp[j][2];
-            dst[j][i+3] = (T)temp[j][3];
-        }        
-        dst[0][i] = src[0][i];
-        dst[0][i+1] = src[0][i+1];
-        dst[0][i+2] = src[0][i+2];
-        dst[0][i+3] = src[0][i+3];
-        dst[H-1][i] = src[H-1][i];
-        dst[H-1][i+1] = src[H-1][i+1];
-        dst[H-1][i+2] = src[H-1][i+2];
-        dst[H-1][i+3] = src[H-1][i+3];
-    }
-//    int stride = dst[1] - dst[0];
-    for (; i<col_to; i++) {
+	#pragma omp parallel for if (multiThread)
+    for (int i=0; i<W; i++) {
+    	T* temp = buffer + omp_get_thread_num() * H;
        for (int j = 1; j<H-1; j++) 
-            buffer[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
+        	temp[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
        dst[0][i] = src[0][i];
-	for (int j=1; j<H-1; j++)
-          dst[j][i] = buffer[j];
-//        T* crow = &dst[1][i];
-//        T* cbuff = &buffer[1];
-//        for (int j = 1; j<H-1; j++, crow += stride, cbuff++)
-//            (*crow) = (*cbuff);
+	    for (int j=1; j<H-1; j++)
+            dst[j][i] = temp[j];
        dst[H-1][i] = src[H-1][i];
    }
 }
-/*
-template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {

-    time_t t1 = clock ();
+// fast gaussian approximation if the support window is large

-    int stride = dst[1] - dst[0];
-    for (int j=col_from; j<col_to; j++) {
-        for (int i = 1; i<H-1; i++) 
-            buffer[i] = (c1 * (src[i-1][j] + src[i+1][j]) + c0 * src[i][j]);
-        dst[0][j] = src[0][j];
-	for (int i=1; i<H-1; i++)
-          dst[i][j] = buffer[i];
-//        T* crow = &dst[1][j];
-//        T* cbuff = &buffer[1];
-//        for (int i = 1; i<H-1; i++, crow += stride, cbuff++)
-//            (*crow) = (*cbuff);
-        dst[H-1][j] = src[H-1][j];
-    }
-    
-    printf ("vertical time = %d\n", clock()-t1);
-}
-*/
-template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<float>* buffer, int W, int row_from, int row_to, double sigma) {
-
-    if (sigma<0.25) {
-        // dont perform filtering
-        if (src!=dst)
-            for (int i = row_from; i<row_to; i++) 
-                memcpy (dst[i], src[i], W*sizeof(T));
-        return;
-    }
-
-    if (sigma<0.6) {
-        // compute 3x3 kernel
-//        double c0 = 2.0 / exp (-1.0 / (2.0 * sigma * sigma));
-//        printf ("c0=%g\n", c0);
-//        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, (int) round(c0), 2);
-        double c1 = exp (-1.0 / (2.0 * sigma * sigma));
-        double csum = 2.0 * c1 + 1.0;
-        c1 /= csum;
-        double c0 = 1.0 / csum;
-        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, c0, c1);
-        return;
-    }
-
-    // horizontal
-    float q = 0.98711 * sigma - 0.96330;
-    if (sigma<2.5)
-        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
-    float b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q;
-    float b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q;
-    float b2 = -1.4281*q*q - 1.26661*q*q*q;
-    float b3 = 0.422205*q*q*q;
-    float B = 1.0 - (b1+b2+b3) / b0;
-
-    b1 /= b0;
-    b2 /= b0;
-    b3 /= b0;
-    
-    // SSE optimized version:
-    __m128 xmm1; xmm1 = _mm_load1_ps (&B);
-    __m128 xmm2; xmm2 = _mm_load1_ps (&b1);
-    __m128 xmm3; xmm3 = _mm_load1_ps (&b2);
-    __m128 xmm4; xmm4 = _mm_load1_ps (&b3);
-    __m128 xmm5;
-    __m128 xmm6;
-      
-    typedef float pfloat[4];
-    pfloat* temp = (pfloat*)buffer->data + 1;
-    pfloat* tmp = (pfloat*)buffer->data;
-
-    memset (temp, 0, W*sizeof(pfloat));
-
-    int i;
-    for (i=row_from; i<row_to-3; i+=4) {
-        T* row1 = src[i];
-        T* row2 = src[i+1];
-        T* row3 = src[i+2];
-        T* row4 = src[i+3];
-        for (int j=0; j<3; j++) {
-            (*tmp)[0] = (float)row1[3];
-            (*tmp)[1] = (float)row2[3];
-            (*tmp)[2] = (float)row3[3];
-            (*tmp)[3] = (float)row4[3];
-            xmm5 = _mm_load_ps ((float*)tmp);
-            _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=3; j<W; j++) {
-            (*tmp)[0] = (float)row1[j];
-            (*tmp)[1] = (float)row2[j];
-            (*tmp)[2] = (float)row3[j];
-            (*tmp)[3] = (float)row4[j];
-            xmm5 = _mm_load_ps ((float*)tmp);
-            xmm5 =_mm_mul_ps (xmm1, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j-1]);
-            xmm6 = _mm_mul_ps (xmm2, xmm6);
-            xmm5 = _mm_add_ps (xmm6, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j-2]);
-            xmm6 = _mm_mul_ps (xmm6, xmm3);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            xmm6 = _mm_load_ps ((float*)&temp[j-3]);
-            xmm6 = _mm_mul_ps (xmm6, xmm4);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=W-4; j>=0; j--) {
-          xmm5 = _mm_load_ps ((float*)&temp[j]);
-          xmm5 =_mm_mul_ps (xmm1, xmm5);
-          xmm6 = _mm_load_ps ((float*)&temp[j+1]);
-          xmm6 = _mm_mul_ps (xmm2, xmm6);
-          xmm5 = _mm_add_ps (xmm6, xmm5);
-          xmm6 = _mm_load_ps ((float*)&temp[j+2]);
-          xmm6 = _mm_mul_ps (xmm6, xmm3);
-          xmm5 = _mm_add_ps (xmm5, xmm6);
-          xmm6 = _mm_load_ps ((float*)&temp[j+3]);
-          xmm6 = _mm_mul_ps (xmm6, xmm4);
-          xmm5 = _mm_add_ps (xmm5, xmm6);
-          _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=0; j<W; j++) {
-          dst[i][j]   = (T)temp[j][0];
-          dst[i+1][j] = (T)temp[j][1];
-          dst[i+2][j] = (T)temp[j][2];
-          dst[i+3][j] = (T)temp[j][3];
-        }
-    }
-    // blur remaining rows
-    float* temp2 = (float*)buffer->data;
-    for (; i<row_to; i++) {
-        for (int j=0; j<3; j++)
-            temp2[j] = src[i][3];
-        for (int j=3; j<W; j++)
-            temp2[j] = B * src[i][j] + b1*temp2[j-1] + b2*temp2[j-2] + b3*temp2[j-3];
-        for (int j=W-4; j>=0; j--)
-            temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3];
-        for (int j=0; j<W; j++)
-            dst[i][j] = (T)temp2[j];
-    }
-}
-
-template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<float>* buffer, int H, int col_from, int col_to, double sigma) {
+template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {

    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
            for (int i = 0; i<H; i++)
-                for (int j=col_from; j<col_to; j++)
-                    dst[i][j] = src[i][j];
-        return;
-    }
-
-    if (sigma<0.6) {
-        // compute 3x3 kernel
-        double c1 = exp (-1.0 / (2.0 * sigma * sigma));
-        double csum = 2.0 * c1 + 1.0;
-        c1 /= csum;
-        double c0 = 1.0 / csum;
-        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, c0, c1);
-//        double c0 = 2.0 / exp (-1.0 / (2.0 * sigma * sigma));
-//        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, (int) round(c0), 2);
-        return;
-    }
-
-    // vertical
-    float q = 0.98711 * sigma - 0.96330;
-    if (sigma<2.5)
-        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
-    float b0 = 1.57825 + 2.44413*q + 1.4281*q*q + 0.422205*q*q*q;
-    float b1 = 2.44413*q + 2.85619*q*q + 1.26661*q*q*q;
-    float b2 = -1.4281*q*q - 1.26661*q*q*q;
-    float b3 = 0.422205*q*q*q;
-    float B = 1.0 - (b1+b2+b3) / b0;
-
-    b1 /= b0;
-    b2 /= b0;
-    b3 /= b0;
-    
-    // SSE optimized version:
-    __m128 xmm1; xmm1 = _mm_load1_ps (&B);
-    __m128 xmm2; xmm2 = _mm_load1_ps (&b1);
-    __m128 xmm3; xmm3 = _mm_load1_ps (&b2);
-    __m128 xmm4; xmm4 = _mm_load1_ps (&b3);
-    __m128 xmm5;
-    __m128 xmm6;
-      
-    typedef float pfloat[4];
-    pfloat* temp = (pfloat*)buffer->data + 1;
-    pfloat* tmp = (pfloat*)buffer->data;
-
-    memset (temp, 0, H*sizeof(pfloat));
-
-    int i;
-    for (i=col_from; i<col_to-3; i+=4) {
-        for (int j=0; j<3; j++) {
-            (*tmp)[0] = (float)src[3][i];
-            (*tmp)[1] = (float)src[3][i+1];
-            (*tmp)[2] = (float)src[3][i+2];
-            (*tmp)[3] = (float)src[3][i+3];
-            xmm5 = _mm_load_ps ((float*)tmp);
-            _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=3; j<H; j++) {
-            (*tmp)[0] = (float)src[j][i];
-            (*tmp)[1] = (float)src[j][i+1];
-            (*tmp)[2] = (float)src[j][i+2];
-            (*tmp)[3] = (float)src[j][i+3];
-            xmm5 = _mm_load_ps ((float*)tmp);
-            xmm5 =_mm_mul_ps (xmm1, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j-1]);
-            xmm6 = _mm_mul_ps (xmm2, xmm6);
-            xmm5 = _mm_add_ps (xmm6, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j-2]);
-            xmm6 = _mm_mul_ps (xmm6, xmm3);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            xmm6 = _mm_load_ps ((float*)&temp[j-3]);
-            xmm6 = _mm_mul_ps (xmm6, xmm4);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=H-4; j>=0; j--) {
-            xmm5 = _mm_load_ps ((float*)&temp[j]);
-            xmm5 =_mm_mul_ps (xmm1, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j+1]);
-            xmm6 = _mm_mul_ps (xmm2, xmm6);
-            xmm5 = _mm_add_ps (xmm6, xmm5);
-            xmm6 = _mm_load_ps ((float*)&temp[j+2]);
-            xmm6 = _mm_mul_ps (xmm6, xmm3);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            xmm6 = _mm_load_ps ((float*)&temp[j+3]);
-            xmm6 = _mm_mul_ps (xmm6, xmm4);
-            xmm5 = _mm_add_ps (xmm5, xmm6);
-            _mm_store_ps ((float*)&temp[j], xmm5);
-        }
-        for (int j=0; j<H; j++) {
-            dst[j][i]   = (T)temp[j][0];
-            dst[j][i+1] = (T)temp[j][1];
-            dst[j][i+2] = (T)temp[j][2];
-            dst[j][i+3] = (T)temp[j][3];
-        }
-    }
-    // blur remaining columns
-    float* temp2 = (float*)buffer->data;
-    for (; i<col_to; i++) {
-        for (int j=0; j<3; j++)
-            temp2[j] = src[3][i];
-        for (int j=3; j<H; j++)
-            temp2[j] = B * src[j][i] + b1*temp2[j-1] + b2*temp2[j-2] + b3*temp2[j-3];
-        for (int j=H-4; j>=0; j--)
-            temp2[j] = B * temp2[j] + b1*temp2[j+1] + b2*temp2[j+2] + b3*temp2[j+3];
-        for (int j=0; j<H; j++)
-            dst[j][i] = (T)temp2[j];
-    }
-}
-
-#else
-
-template<class T> void gaussHorizontal3 (T** src, T** dst, T* buffer, int W, int row_from, int row_to, const float c0, const float c1) {
-    
-    for (int i=row_from; i<row_to; i++) {
-        for (int j=1; j<W-1; j++)
-            buffer[j] = (T)(c1 * (src[i][j-1] + src[i][j+1]) + c0 * src[i][j]);
-        dst[i][0] = src[i][0];
-        memcpy (dst[i]+1, buffer+1, (W-2)*sizeof(T));
-        dst[i][W-1] = src[i][W-1];
-    }
-}
-
-template<class T> void gaussVertical3 (T** src, T** dst, T* buffer, int H, int col_from, int col_to, const float c0, const float c1) {
-    
-    for (int i=col_from; i<col_to; i++) {
-        for (int j = 1; j<H-1; j++) 
-            buffer[j] = (T)(c1 * (src[j-1][i] + src[j+1][i]) + c0 * src[j][i]);
-        dst[0][i] = src[0][i];
-	    for (int j=1; j<H-1; j++)
-            dst[j][i] = buffer[j];
-        dst[H-1][i] = src[H-1][i];
-    }
-}
-
-template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma) {
-
-    if (sigma<0.25) {
-        // dont perform filtering
-        if (src!=dst)
-            for (int i = row_from; i<row_to; i++) 
                memcpy (dst[i], src[i], W*sizeof(T));
        return;
    }
@ -480,11 +72,11 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, row_from, row_to, c0, c1);
+        gaussHorizontal3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
        return;
    }

-    // horizontal
+    // coefficient calculation
    double q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
@ -513,8 +105,11 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
        for (int j=0; j<3; j++)
            M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3);
    
-    double* temp2 = (double*)buffer->data;
-    for (int i=row_from; i<row_to; i++) {
+	#pragma omp parallel for if (multiThread)
+    for (int i=0; i<H; i++) {
+
+        double* temp2 = buffer->data + omp_get_thread_num() * W;
+
        temp2[0] = B * src[i][0] + b1*src[i][0] + b2*src[i][0] + b3*src[i][0];
        temp2[1] = B * src[i][1] + b1*temp2[0]  + b2*src[i][0] + b3*src[i][0];
        temp2[2] = B * src[i][2] + b1*temp2[1]  + b2*temp2[0]  + b3*src[i][0];
@ -537,14 +132,13 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBuffer<double>*
    }
 }

-template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma) {
+template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* buffer, int W, int H, double sigma, bool multiThread) {

    if (sigma<0.25) {
        // dont perform filtering
        if (src!=dst)
            for (int i = 0; i<H; i++)
-                for (int j=col_from; j<col_to; j++)
-                    dst[i][j] = src[i][j];
+                memcpy (dst[i], src[i], W*sizeof(T));
        return;
    }

@ -554,11 +148,11 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        double csum = 2.0 * c1 + 1.0;
        c1 /= csum;
        double c0 = 1.0 / csum;
-        gaussVertical3<T> (src, dst, (T*)(buffer->data), H, col_from, col_to, c0, c1);
+        gaussVertical3<T> (src, dst, (T*)(buffer->data), W, H, c0, c1, multiThread);
        return;
    }

-    // vertical
+    // coefficient calculation
    double q = 0.98711 * sigma - 0.96330;
    if (sigma<2.5)
        q = 3.97156 - 4.14554 * sqrt (1.0 - 0.26891 * sigma);
@ -587,9 +181,12 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
        for (int j=0; j<3; j++)
            M[i][j] /= (1.0+b1-b2+b3)*(1.0+b2+(b1-b3)*b3);

-    double* temp2 = (double*)buffer->data;
-    for (int i=col_from; i<col_to; i++) {
-        temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i];
+	#pragma omp parallel for if (multiThread)
+    for (int i=0; i<W; i++) {
+
+        double* temp2 = buffer->data + omp_get_thread_num() * H;
+
+    	temp2[0] = B * src[0][i] + b1*src[0][i] + b2*src[0][i] + b3*src[0][i];
        temp2[1] = B * src[1][i] + b1*temp2[0]  + b2*src[0][i] + b3*src[0][i];
        temp2[2] = B * src[2][i] + b1*temp2[1]  + b2*temp2[0]  + b3*src[0][i];

@ -611,14 +208,13 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBuffer<double>* b
            dst[j][i] = (T)temp2[j];
    }
 }   
-#endif
-

+/*
 void gaussHorizontal_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_unsigned (unsigned short** src, unsigned short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
 void gaussHorizontal_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_signed (short** src, short** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
 void gaussHorizontal_float (float** src, float** dst, AlignedBuffer<double>* buffer, int W, int row_from, int row_to, double sigma);
 void gaussVertical_float (float** src, float** dst, AlignedBuffer<double>* buffer, int H, int col_from, int col_to, double sigma);
-
+*/
 #endif
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@ -28,7 +28,7 @@ namespace rtengine {
 extern Settings* settings;

 ImProcCoordinator::ImProcCoordinator ()
-    : allocated(false), scale(-1), pW(-1), pH(-1),
+    : ipf(&params, true), allocated(false), scale(-1), pW(-1), pH(-1),
    plistener(NULL), imageListener(NULL), aeListener(NULL), hListener(NULL),
    resultValid(false), awbComputed(false),
    changeSinceLast(0), updaterRunning(false), 
@ -54,7 +54,6 @@ ImProcCoordinator::~ImProcCoordinator () {
        delete toDel[i];

    imgsrc->decreaseRef ();
-	ipf.release ();
    updaterThreadStart.unlock ();
 }

@ -79,6 +78,8 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
    else if (params.resize.dataspec==2)
        params.resize.scale = (double)params.resize.height / (params.coarse.rotate==90 || params.coarse.rotate==270 ? fw : fh);

+    ipf.setScale (scale);
+
    progress ("Applying white balance, color correction & sRBG conversion...",100*readyphase/numofphases);
    if (todo & M_INIT) {
        minit.lock ();
@ -164,7 +165,7 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
    progress ("Exposure curve & CIELAB conversion...",100*readyphase/numofphases);
    if (todo & M_RGBCURVE) {
        CurveFactory::complexCurve (params.toneCurve.expcomp, params.toneCurve.black/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, imgsrc->getDefGain(), imgsrc->getGamma(), true, params.toneCurve.curve, vhist16, tonecurve, bcrgbhist, scale==1 ? 1 : 1);
-        ipf.rgbProc (oprevi, oprevl, &params, tonecurve, shmap);
+        ipf.rgbProc (oprevi, oprevl, tonecurve, shmap);

        // recompute luminance histogram
        memset (lhist16, 0, 65536*sizeof(int));
@ -186,12 +187,12 @@ void ImProcCoordinator::updatePreviewImage (int todo) {
        readyphase++;
        if (scale==1) {
            progress ("Denoising luminance...",100*readyphase/numofphases);
-            ipf.lumadenoise (nprevl, &params, scale*params.resize.scale, buffer);
+            ipf.lumadenoise (nprevl, buffer);
        }
        readyphase++;
        if (scale==1) {
            progress ("Sharpening...",100*readyphase/numofphases);
-            ipf.sharpening (nprevl, &params, scale*params.resize.scale, (unsigned short**)buffer);
+            ipf.sharpening (nprevl, (unsigned short**)buffer);
        }
        readyphase++;
    }
@ -201,11 +202,11 @@ void ImProcCoordinator::updatePreviewImage (int todo) {

    if (todo & M_COLOR) {
        progress ("Applying Color Boost...",100*readyphase/numofphases);
-        ipf.colorCurve (oprevl, nprevl, &params);
+        ipf.colorCurve (oprevl, nprevl);
        readyphase++;
        if (scale==1) {
            progress ("Denoising color...",100*readyphase/numofphases);
-            ipf.colordenoise (nprevl, &params, scale*params.resize.scale, buffer);
+            ipf.colordenoise (nprevl, buffer);
        }
        readyphase++;
    }
@ -314,7 +315,7 @@ if (settings->verbose) printf ("setscale before lock\n");
        oprevl = new LabImage (pW, pH);    
        nprevl = new LabImage (pW, pH);    
        previmg = new Image8 (pW, pH);
-        shmap = new SHMap (pW, pH);
+        shmap = new SHMap (pW, pH, true);
        
        buffer = new int*[pH];
        for (int i=0; i<pH; i++)
@ -410,7 +411,7 @@ void ImProcCoordinator::getSpotWB (int x, int y, int rect, double& temp, double&
        for (int j=x-rect; j<=x+rect; j++) 
            points.push_back (Coord2D (j, i));

-    ipf.transCoord (&params, fw, fh, points, red, green, blue);
+    ImProcFunctions::transCoord (&params, fw, fh, points, red, green, blue);
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
    if (params.coarse.rotate==180) tr |= TR_R180;
@ -437,7 +438,7 @@ void ImProcCoordinator::getAutoCrop (double ratio, int &x, int &y, int &w, int &
        x = (fullw - w) / 2;
        y = (fullh - h) / 2;       
        int orx, ory, orw, orh;
-        clipped = ipf.transCoord (&params, fw, fh, x, y, w, h, orx, ory, orw, orh);
+        clipped = ImProcFunctions::transCoord (&params, fw, fh, x, y, w, h, orx, ory, orw, orh);
        w -= 4;
    }
    if (ratio>0)
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@ -24,89 +24,75 @@
 #include <procparams.h>
 #include <shmap.h>
 #include <coord2d.h>
+#include <labimage.h>

 namespace rtengine {

 using namespace procparams;

-class LabImage {
-    private:
-        bool fromImage;
-
-    public:
-        int W, H;
-        unsigned short** L;
-        short** a;
-        short** b;
-        
-     LabImage (int w, int h);
-     LabImage (Image16* im);
-    ~LabImage ();
-};
-
 class ImProcFunctions {

-    protected:
-        struct STemp {
-            int cx, cy, sx, sy, oW, oH;
-        };
-		cmsHTRANSFORM monitorTransform;
-
-        void transform_         (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
-        void simpltransform_    (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
-        void vignetting_        (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
-        void transform_sep_     (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
-        void rgbProc_           (Image16* working, LabImage* lab, const ProcParams* params, int* tonecurve, SHMap* shmap, int row_from, int row_to);
-        void lab2rgb_           (LabImage* lab, Image8* image, int row_from, int row_to);
-        void colorCurve_        (LabImage* lold, LabImage* lnew, const ProcParams* params, int row_from, int row_to, double* cmultiplier);
-        void sharpenHaloCtrl    (LabImage* lab, const ProcParams* params, unsigned short** blurmap, unsigned short** base, int W, int row_from, int row_to);
-        void firstAnalysis_     (Image16* original, Glib::ustring wprofile, unsigned int* histogram, int* chroma_radius, int row_from, int row_to);
-        void resize_            (Image16* src, Image16* dst, ResizeParams params, int row_from, int row_to);
-        void damping_           (float** aI, unsigned short** aO, float damping, int W, int rowfrom, int rowto);
-
-    public:
-
        static int* cacheL;
        static int* cachea;
        static int* cacheb;
        static int* xcache;
        static int* ycache;
        static int* zcache;
+        static unsigned short gamma2curve[65536];
+
+        struct STemp {
+            int cx, cy, sx, sy, oW, oH;
+        };
+		cmsHTRANSFORM monitorTransform;

        int chroma_scale;
        int chroma_radius;

-        double lumimul[3];
-        static unsigned short gamma2curve[65536];
+		const ProcParams* params;
+		double scale;
+		bool multiThread;

+        void transform_         (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
+        void simpltransform_    (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
+        void vignetting_        (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
+        void transform_sep_     (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to);
+        void sharpenHaloCtrl    (LabImage* lab, unsigned short** blurmap, unsigned short** base, int W, int H);
+        void firstAnalysis_     (Image16* original, Glib::ustring wprofile, unsigned int* histogram, int* chroma_radius, int row_from, int row_to);
+        void dcdamping          (float** aI, unsigned short** aO, float damping, int W, int H);
+
+    public:
+
+        double lumimul[3];

        static void initCache ();
        
-        ImProcFunctions () : monitorTransform(NULL) {}
-		void release    ();
+        ImProcFunctions (const ProcParams* iparams, bool imultiThread=true)
+			: monitorTransform(NULL), params(iparams), scale(1), multiThread(imultiThread) {}
+        ~ImProcFunctions ();

+        void setScale 		(double iscale);
 		
        void firstAnalysis  (Image16* working, const ProcParams* params, unsigned int* vhist16, double gamma);
    
-        void rgbProc        (Image16* working, LabImage* lab, const ProcParams* params, int* tonecurve, SHMap* shmap);
+        void rgbProc        (Image16* working, LabImage* lab, int* tonecurve, SHMap* shmap);
        void luminanceCurve (LabImage* lold, LabImage* lnew, int* curve, int row_from, int row_to);
-        void colorCurve     (LabImage* lold, LabImage* lnew, const ProcParams* params);
-        void sharpening     (LabImage* lab, const ProcParams* params, double scale, unsigned short** buffer);
-        void lumadenoise    (LabImage* lab, const ProcParams* params, double scale, int** buffer);
-        void colordenoise   (LabImage* lab, const ProcParams* params, double scale, int** buffer);
+        void colorCurve     (LabImage* lold, LabImage* lnew);
+        void sharpening     (LabImage* lab, unsigned short** buffer);
+        void lumadenoise    (LabImage* lab, int** buffer);
+        void colordenoise   (LabImage* lab, int** buffer);
        void transform      (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH);
        void simpltransform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH);
        void vignetting     (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int oW, int oH);
        void lab2rgb        (LabImage* lab, Image8* image);
-        void resize         (Image16* src, Image16* dst, ResizeParams params);
+        void resize         (Image16* src, Image16* dst);

-        bool transCoord     (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv);
-        bool transCoord     (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue);
-        void deconvsharpening(LabImage* lab, const ProcParams* params, double scale, unsigned short** buffer);
+        void deconvsharpening(LabImage* lab, unsigned short** buffer);

        Image8*     lab2rgb     (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile);
        Image16*    lab2rgb16   (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile);

+        static bool transCoord     (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv);
+        static bool transCoord     (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue);
        static void getAutoExp  (unsigned int* histogram, int histcompr, double expcomp, double clip, double& br, int& bl);
 };
 };
--- a/rtengine/iplab2rgb.cc
+++ b/rtengine/iplab2rgb.cc
@ -0,0 +1,259 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <rtengine.h>
+#include <improcfun.h>
+#include <glibmm.h>
+#include <iccstore.h>
+#include <omp.h>
+
+namespace rtengine {
+
+#undef CLIP
+#undef CLIPTO
+#undef CMAXVAL
+
+#define CMAXVAL 0xffff
+#define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
+#define CLIPTO(a,b,c) ((a)>(b)?((a)<(c)?(a):(c)):(b))
+
+extern const Settings* settings;
+
+void ImProcFunctions::lab2rgb (LabImage* lab, Image8* image) {
+
+	if (monitorTransform) {
+	    int ix = 0;
+        short* buffer = new short [3*lab->W];
+		for (int i=0; i<lab->H; i++) {
+			unsigned short* rL = lab->L[i];
+			short* ra = lab->a[i];
+			short* rb = lab->b[i];
+			int iy = 0;
+			for (int j=0; j<lab->W; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+                buffer[iy++] = CLIP(x_);
+                buffer[iy++] = CLIP(y_);
+                buffer[iy++] = CLIP(z_);
+			}
+            cmsDoTransform (monitorTransform, buffer, image->data + ix, lab->W);
+            ix += 3*lab->W;
+		}
+        delete [] buffer;
+	}
+	else {
+		#pragma omp parallel for if (multiThread)
+		for (int i=0; i<lab->H; i++) {
+			unsigned short* rL = lab->L[i];
+			short* ra = lab->a[i];
+			short* rb = lab->b[i];
+			int ix = 3*i*lab->W;
+			for (int j=0; j<lab->W; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+				/* XYZ-D50 to RGB */
+				int R = (25689*x_-13261*y_-4022*z_) >> 13;
+				int G = (-8017*x_+15697*y_+274*z_) >> 13;
+				int B = (590*x_-1877*y_+11517*z_) >> 13;
+
+				/* copy RGB */
+				image->data[ix++] = gamma2curve[CLIP(R)] >> 8;
+				image->data[ix++] = gamma2curve[CLIP(G)] >> 8;
+				image->data[ix++] = gamma2curve[CLIP(B)] >> 8;
+			}
+		}
+	}
+}
+
+Image8* ImProcFunctions::lab2rgb (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile) {
+
+    if (cx<0) cx = 0;
+    if (cy<0) cy = 0;
+    if (cx+cw>lab->W) cw = lab->W-cx;
+    if (cy+ch>lab->H) ch = lab->H-cy;
+
+    Image8* image = new Image8 (cw, ch);
+
+    cmsHPROFILE oprof = iccStore.getProfile (profile);
+
+    if (oprof) {
+        cmsHPROFILE iprof = iccStore.getXYZProfile ();
+        lcmsMutex->lock ();
+        cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16, oprof, TYPE_RGB_8, settings->colorimetricIntent, 0);
+        lcmsMutex->unlock ();
+        int ix = 0;
+        short* buffer = new short [3*cw];
+        for (int i=cy; i<cy+ch; i++) {
+            unsigned short* rL = lab->L[i];
+            short* ra = lab->a[i];
+            short* rb = lab->b[i];
+            int iy = 0;
+            for (int j=cx; j<cx+cw; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+                buffer[iy++] = CLIP(x_);
+                buffer[iy++] = CLIP(y_);
+                buffer[iy++] = CLIP(z_);
+            }
+            cmsDoTransform (hTransform, buffer, image->data + ix, cw);
+            ix += 3*cw;
+        }
+        delete [] buffer;
+        cmsDeleteTransform(hTransform);
+    }
+    else {
+		#pragma omp parallel for if (multiThread)
+        for (int i=cy; i<cy+ch; i++) {
+            unsigned short* rL = lab->L[i];
+            short* ra = lab->a[i];
+            short* rb = lab->b[i];
+			int ix = 3*i*cw;
+            for (int j=cx; j<cx+cw; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+                int R = (25689*x_-13261*y_-4022*z_) >> 13;
+                int G = (-8017*x_+15697*y_+274*z_) >> 13;
+                int B = (590*x_-1877*y_+11517*z_) >> 13;
+
+                image->data[ix++] = gamma2curve[CLIP(R)] >> 8;
+                image->data[ix++] = gamma2curve[CLIP(G)] >> 8;
+                image->data[ix++] = gamma2curve[CLIP(B)] >> 8;
+            }
+        }
+    }
+    return image;
+}
+
+Image16* ImProcFunctions::lab2rgb16 (LabImage* lab, int cx, int cy, int cw, int ch, Glib::ustring profile) {
+
+    if (cx<0) cx = 0;
+    if (cy<0) cy = 0;
+    if (cx+cw>lab->W) cw = lab->W-cx;
+    if (cy+ch>lab->H) ch = lab->H-cy;
+
+    Image16* image = new Image16 (cw, ch);
+
+    cmsHPROFILE oprof = iccStore.getProfile (profile);
+
+    if (oprof) {
+		#pragma omp parallel for if (multiThread)
+		for (int i=cy; i<cy+ch; i++) {
+			unsigned short* rL = lab->L[i];
+			short* ra = lab->a[i];
+			short* rb = lab->b[i];
+			short* xa = (short*)image->r[i-cy];
+			short* ya = (short*)image->g[i-cy];
+			short* za = (short*)image->b[i-cy];
+			for (int j=cx; j<cx+cw; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+				xa[j-cx] = CLIP(x_);
+				ya[j-cx] = CLIP(y_);
+				za[j-cx] = CLIP(z_);
+			}
+		}
+        cmsHPROFILE iprof = iccStore.getXYZProfile ();
+        lcmsMutex->lock ();
+		cmsHTRANSFORM hTransform = cmsCreateTransform (iprof, TYPE_RGB_16_PLANAR, oprof, TYPE_RGB_16_PLANAR, settings->colorimetricIntent, 0);
+        lcmsMutex->unlock ();
+		cmsDoTransform (hTransform, image->data, image->data, image->planestride/2);
+		cmsDeleteTransform(hTransform);
+	}
+	else {
+		#pragma omp parallel for if (multiThread)
+		for (int i=cy; i<cy+ch; i++) {
+			unsigned short* rL = lab->L[i];
+			short* ra = lab->a[i];
+			short* rb = lab->b[i];
+			for (int j=cx; j<cx+cw; j++) {
+
+                int y_ = rL[j];
+                int x_ = rL[j]+10486+ra[j]*152/chroma_scale+141556;
+                int z_ = rL[j]+10486-rb[j]*380/chroma_scale+369619;
+
+                x_ = CLIPTO(x_,0,369820);
+                y_ = CLIPTO(y_,0,825745);
+
+                y_ = ycache[y_];
+				x_ = xcache[x_];
+				z_ = zcache[z_];
+
+				int R = (25689*x_-13261*y_-4022*z_) >> 13;
+				int G = (-8017*x_+15697*y_+274*z_) >> 13;
+				int B = (590*x_-1877*y_+11517*z_) >> 13;
+
+				image->r[i-cy][j-cx] = gamma2curve[CLIP(R)];
+				image->g[i-cy][j-cx] = gamma2curve[CLIP(G)];
+				image->b[i-cy][j-cx] = gamma2curve[CLIP(B)];
+			}
+		}
+	}
+    return image;
+}
+
+}
--- a/rtengine/ipresize.cc
+++ b/rtengine/ipresize.cc
@ -0,0 +1,293 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <rtengine.h>
+#include <improcfun.h>
+#include <glibmm.h>
+#include <omp.h>
+
+namespace rtengine {
+
+#undef CLIP
+#undef CLIPTO
+#undef CMAXVAL
+
+#define CMAXVAL 0xffff
+#define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
+#define CLIPTO(a,b,c) ((a)>(b)?((a)<(c)?(a):(c)):(b))
+
+void ImProcFunctions::resize (Image16* src, Image16* dst) {
+
+	if(params->resize.method == "Downscale (Better)") {
+        // small-scale algorithm by Ilia
+        // provides much better quality on small scales
+        // calculates mean value over source pixels which current destination pixel covers
+        // works only for scales < 1
+        // for scales ~1 it is analogous to bilinear
+        // possibly, for even less scale factors (< 0.2 possibly) boundary pixels are not needed, omitting them can give a speedup
+        // this algorithm is much slower on small factors than others, because it uses all pixels of the SOURCE image
+        // Ilia Popov ilia_popov@rambler.ru 2010
+
+        double delta = 1.0 / params->resize.scale;
+        double k = params->resize.scale * params->resize.scale;
+
+		#pragma omp parallel for if (multiThread)
+        for(int i = 0; i < dst->height; i++) {
+            // top and bottom boundary coordinates
+            double y0 = i * delta;
+            double y1 = (i + 1) * delta;
+
+            int m0 = y0;
+            m0 = CLIPTO(m0, 0, src->height-1);
+
+            int m1 = y1;
+            m1 = CLIPTO(m1, 0, src->height-1);
+
+            // weights of boundary pixels
+            double wy0 = 1.0 - (y0 - m0);
+            double wy1 = y1 - m1;
+
+            for(int j = 0; j < dst->width; j++) {
+                // left and right boundary coordinates
+                double x0 = j * delta;
+                double x1 = (j + 1) * delta;
+
+                int n0 = x0;
+                n0 = CLIPTO(n0, 0, src->width-1);
+                int n1 = x1;
+                n1 = CLIPTO(n1, 0, src->width-1);
+
+                double wx0 = 1.0 - (x0 - n0);
+                double wx1 = x1 - n1;
+
+                double r = 0;
+                double g = 0;
+                double b = 0;
+
+                // integration
+                // corners
+                r += wy0 * wx0 * src->r[m0][n0] + wy0 * wx1 * src->r[m0][n1] + wy1 * wx0 * src->r[m1][n0] + wy1 * wx1 * src->r[m1][n1];
+                g += wy0 * wx0 * src->g[m0][n0] + wy0 * wx1 * src->g[m0][n1] + wy1 * wx0 * src->g[m1][n0] + wy1 * wx1 * src->g[m1][n1];
+                b += wy0 * wx0 * src->b[m0][n0] + wy0 * wx1 * src->b[m0][n1] + wy1 * wx0 * src->b[m1][n0] + wy1 * wx1 * src->b[m1][n1];
+
+                // top and bottom boundaries
+                for(int n = n0 + 1; n < n1; n++) {
+                    r += wy0 * src->r[m0][n] + wy1 * src->r[m1][n];
+                    g += wy0 * src->g[m0][n] + wy1 * src->g[m1][n];
+                    b += wy0 * src->b[m0][n] + wy1 * src->b[m1][n];
+                }
+
+                // inner rows
+                for(int m = m0 + 1; m < m1; m++) {
+                    // left and right boundaries
+                    r += wx0 * src->r[m][n0] + wx1 * src->r[m][n1];
+                    g += wx0 * src->g[m][n0] + wx1 * src->g[m][n1];
+                    b += wx0 * src->b[m][n0] + wx1 * src->b[m][n1];
+                    // inner pixels
+                    for(int n = n0 + 1; n < n1; n++) {
+                        r += src->r[m][n];
+                        g += src->g[m][n];
+                        b += src->b[m][n];
+                    }
+                }
+                // overall weight is equal to the DST pixel area in SRC coordinates
+                r *= k;
+                g *= k;
+                b *= k;
+
+                dst->r[i][j] = CLIP((int)r);
+                dst->g[i][j] = CLIP((int)g);
+                dst->b[i][j] = CLIP((int)b);
+            }
+        }
+        return;
+    }
+
+    if(params->resize.method == "Downscale (Faster)")
+	{
+        // faster version of algo above, does not take into account border pixels,
+        // which are summed with non-unity weights in slow algo. So, no need
+        // for weights at all
+        // Ilia Popov ilia_popov@rambler.ru 5.04.2010
+
+        double delta = 1.0 / params->resize.scale;
+
+        int p = (int) delta;
+
+        // if actually we are doing upscaling, behave like Nearest
+        if(p == 0)
+            p = 1;
+
+        int q = p/2;
+
+        // may cause problems on 32-bit systems on extremely small factors.
+        // In that case change 1024 to smth less
+        const int divider = 1024;
+
+        // scaling factor after summation
+        int k = divider / (p * p);
+
+		#pragma omp parallel for if (multiThread)
+        for(int i = 0; i < dst->height; i++) {
+            // y coordinate of center of destination pixel
+            double y = (i + 0.5) * delta;
+
+            int m0 = (int) (y) - q;
+            m0 = CLIPTO(m0, 0, src->height-1);
+
+            int m1 = m0 + p;
+            if(m1 > src->height) {
+                m1 = src->height;
+                m0 = m1 - p;
+            }
+            m1 = CLIPTO(m1, 0, src->height);
+
+            for(int j = 0; j < dst->width; j++) {
+                // x coordinate of center of destination pixel
+                double x = (j + 0.5) * delta;
+
+                int n0 = (int) (x) - q;
+                n0 = CLIPTO(n0, 0, src->width-1);
+
+                int n1 = n0 + p;
+                if(n1 > src->width) {
+                    n1 = src->width;
+                    n0 = n1 - p;
+                }
+                n1 = CLIPTO(n1, 0, src->width);
+
+                int r = 0;
+                int g = 0;
+                int b = 0;
+
+                // integration
+                for(int m = m0; m < m1; m++) {
+                    for(int n = n0; n < n1; n++) {
+                        r += src->r[m][n];
+                        g += src->g[m][n];
+                        b += src->b[m][n];
+                    }
+                }
+                dst->r[i][j] = CLIP( r * k / divider);
+                dst->g[i][j] = CLIP( g * k / divider);
+                dst->b[i][j] = CLIP( b * k / divider);
+            }
+        }
+        return;
+    }
+    if (params->resize.method.substr(0,7)=="Bicubic") {
+        double Av = -0.5;
+        if (params->resize.method=="Bicubic (Sharper)")
+            Av = -0.75;
+        else if (params->resize.method=="Bicubic (Softer)")
+            Av = -0.25;
+		#pragma omp parallel for if (multiThread)
+        for (int i=0; i<dst->height; i++) {
+            double wx[4], wy[4];
+            double Dy = i / params->resize.scale;
+            int yc  =  (int) Dy; Dy -= (double)yc;
+            int ys = yc - 1; // smallest y-index used for interpolation
+            // compute vertical weights
+            double t1y = -Av*(Dy-1.0)*Dy;
+            double t2y = (3.0-2.0*Dy)*Dy*Dy;
+            wy[3] = t1y*Dy;
+            wy[2] = t1y*(Dy-1.0) + t2y;
+            wy[1] = -t1y*Dy + 1.0 - t2y;
+            wy[0] = -t1y*(Dy-1.0);
+            for (int j=0; j<dst->width; j++) {
+                double Dx = j / params->resize.scale;
+                int xc  =  (int) Dx; Dx -= (double)xc;
+                int xs = xc - 1; // smallest x-index used for interpolation
+                if (ys >= 0 && ys <src->height-3 && xs >= 0 && xs <= src->width-3) {
+                    // compute horizontal weights
+                    double t1 = -Av*(Dx-1.0)*Dx;
+                    double t2 = (3.0-2.0*Dx)*Dx*Dx;
+                    wx[3] = t1*Dx;
+                    wx[2] = t1*(Dx-1.0) + t2;
+                    wx[1] = -t1*Dx + 1.0 - t2;
+                    wx[0] = -t1*(Dx-1.0);
+                    // compute weighted sum
+                    int r = 0;
+                    int g = 0;
+                    int b = 0;
+                    for (int x=0; x<4; x++)
+                        for (int y=0; y<4; y++) {
+                            double w = wx[x]*wy[y];
+                            r += w*src->r[ys+y][xs+x];
+                            g += w*src->g[ys+y][xs+x];
+                            b += w*src->b[ys+y][xs+x];
+                        }
+                    dst->r[i][j] = CLIP(r);
+                    dst->g[i][j] = CLIP(g);
+                    dst->b[i][j] = CLIP(b);
+                }
+                else {
+                    xc = CLIPTO(xc, 0, src->width-1);
+                    yc = CLIPTO(yc, 0, src->height-1);
+                    int nx = xc + 1;
+                    if (nx>=src->width)
+                        nx = xc;
+                    int ny = yc + 1;
+                    if (ny>=src->height)
+                        ny = yc;
+                    dst->r[i][j] = (1-Dx)*(1-Dy)*src->r[yc][xc] + (1-Dx)*Dy*src->r[ny][xc] + Dx*(1-Dy)*src->r[yc][nx] + Dx*Dy*src->r[ny][nx];
+                    dst->g[i][j] = (1-Dx)*(1-Dy)*src->g[yc][xc] + (1-Dx)*Dy*src->g[ny][xc] + Dx*(1-Dy)*src->g[yc][nx] + Dx*Dy*src->g[ny][nx];
+                    dst->b[i][j] = (1-Dx)*(1-Dy)*src->b[yc][xc] + (1-Dx)*Dy*src->b[ny][xc] + Dx*(1-Dy)*src->b[yc][nx] + Dx*Dy*src->b[ny][nx];
+                }
+            }
+        }
+    }
+    else if (params->resize.method=="Bilinear") {
+		#pragma omp parallel for if (multiThread)
+        for (int i=0; i<dst->height; i++) {
+            int sy = i/params->resize.scale;
+            sy = CLIPTO(sy, 0, src->height-1);
+            double dy = i/params->resize.scale - sy;
+            int ny = sy+1;
+            if (ny>=src->height)
+                ny = sy;
+            for (int j=0; j<dst->width; j++) {
+                int sx = j/params->resize.scale;
+                sx = CLIPTO(sx, 0, src->width-1);
+                double dx = j/params->resize.scale - sx;
+                int nx = sx+1;
+                if (nx>=src->width)
+                    nx = sx;
+                dst->r[i][j] = (1-dx)*(1-dy)*src->r[sy][sx] + (1-dx)*dy*src->r[ny][sx] + dx*(1-dy)*src->r[sy][nx] + dx*dy*src->r[ny][nx];
+                dst->g[i][j] = (1-dx)*(1-dy)*src->g[sy][sx] + (1-dx)*dy*src->g[ny][sx] + dx*(1-dy)*src->g[sy][nx] + dx*dy*src->g[ny][nx];
+                dst->b[i][j] = (1-dx)*(1-dy)*src->b[sy][sx] + (1-dx)*dy*src->b[ny][sx] + dx*(1-dy)*src->b[sy][nx] + dx*dy*src->b[ny][nx];
+            }
+        }
+    }
+    else {
+		#pragma omp parallel for if (multiThread)
+        for (int i=0; i<dst->height; i++) {
+            int sy = i/params->resize.scale;
+            sy = CLIPTO(sy, 0, src->height-1);
+            for (int j=0; j<dst->width; j++) {
+                int sx = j/params->resize.scale;
+                sx = CLIPTO(sx, 0, src->width-1);
+                dst->r[i][j] = src->r[sy][sx];
+                dst->g[i][j] = src->g[sy][sx];
+                dst->b[i][j] = src->b[sy][sx];
+            }
+        }
+    }
+}
+
+}
--- a/rtengine/ipsharpen.cc
+++ b/rtengine/ipsharpen.cc
@ -0,0 +1,201 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <rtengine.h>
+#include <improcfun.h>
+#include <omp.h>
+#include <minmax.h>
+#include <gauss.h>
+#include <bilateral2.h>
+
+namespace rtengine {
+
+#undef CLIP
+#undef CMAXVAL
+#undef ABS
+
+#define CMAXVAL 0xffff
+#define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
+#define ABS(a) ((a)<0?-(a):(a))
+
+void ImProcFunctions::dcdamping (float** aI, unsigned short** aO, float damping, int W, int H) {
+
+	#pragma omp parallel for if (multiThread)
+	for (int i=0; i<H; i++)
+        for (int j=0; j<W; j++) {
+            float I = aI[i][j];
+            float O = (float)aO[i][j];
+            if (O==0.0 || I==0.0) {
+                aI[i][j] = 0.0;
+                continue;
+            }
+            float U = -(O * log(I/O) - I + O) * 2.0 / (damping*damping);
+            U = MIN(U,1.0);
+            U = U*U*U*U*(5.0-U*4.0);
+            aI[i][j] = (O - I) / I * U + 1.0;
+        }
+}
+
+void ImProcFunctions::deconvsharpening (LabImage* lab, unsigned short** b2) {
+
+    if (params->sharpening.enabled==false || params->sharpening.deconvamount<1)
+        return;
+
+    int W = lab->W, H = lab->H;
+
+    float** tmpI = new float*[H];
+    for (int i=0; i<H; i++) {
+        tmpI[i] = new float[W];
+        for (int j=0; j<W; j++)
+            tmpI[i][j] = (float)lab->L[i][j];
+    }
+
+    float** tmp = (float**)b2;
+
+    AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());
+
+    float damping = params->sharpening.deconvdamping / 5.0;
+    bool needdamp = params->sharpening.deconvdamping > 0;
+    for (int k=0; k<params->sharpening.deconviter; k++) {
+
+    	// apply blur function (gaussian blur)
+        gaussHorizontal<float> (tmpI, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+        gaussVertical<float>   (tmp, tmp,  buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+
+    	if (!needdamp) {
+			#pragma omp parallel for if (multiThread)
+            for (int i=0; i<H; i++)
+                for (int j=0; j<W; j++)
+                    if (tmp[i][j]>0)
+                        tmp[i][j] = (float)lab->L[i][j] / tmp[i][j];
+        }
+        else
+			dcdamping (tmp, lab->L, damping, W, H);
+
+        gaussHorizontal<float> (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+        gaussVertical<float>   (tmp, tmp, buffer, W, H, params->sharpening.deconvradius / scale, multiThread);
+
+		#pragma omp parallel for if (multiThread)
+        for (int i=0; i<H; i++)
+            for (int j=0; j<W; j++)
+                tmpI[i][j] = tmpI[i][j] * tmp[i][j];
+    }
+
+	#pragma omp parallel for if (multiThread)
+    for (int i=0; i<H; i++)
+        for (int j=0; j<W; j++)
+            lab->L[i][j] = lab->L[i][j]*(100-params->sharpening.deconvamount) / 100 + (int)CLIP(tmpI[i][j])*params->sharpening.deconvamount / 100;
+
+    for (int i=0; i<H; i++)
+        delete [] tmpI[i];
+    delete [] tmpI;
+}
+
+void ImProcFunctions::sharpening (LabImage* lab, unsigned short** b2) {
+
+    if (params->sharpening.method=="rld") {
+        deconvsharpening (lab, b2);
+        return;
+    }
+
+    if (params->sharpening.enabled==false || params->sharpening.amount<1 || lab->W<8 || lab->H<8)
+        return;
+
+    int W = lab->W, H = lab->H;
+    unsigned short** b3;
+
+    AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());
+
+    if (params->sharpening.edgesonly==false) {
+
+        gaussHorizontal<unsigned short> (lab->L, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+        gaussVertical<unsigned short>   (b2,     b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+    }
+    else {
+        b3 = new unsigned short*[H];
+        for (int i=0; i<H; i++)
+            b3[i] = new unsigned short[W];
+
+		bilateral<unsigned short, unsigned int> (lab->L, (unsigned short**)b3, b2, W, H, params->sharpening.edges_radius / scale, params->sharpening.edges_tolerance, multiThread);
+		gaussHorizontal<unsigned short> (b3, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+		gaussVertical<unsigned short>   (b2, b2, buffer, W, H, params->sharpening.radius / scale, multiThread);
+    }
+    delete buffer;
+
+    unsigned short** base = lab->L;
+    if (params->sharpening.edgesonly)
+        base = b3;
+
+    if (params->sharpening.halocontrol==false) {
+		#pragma omp parallel for if (multiThread)
+    	for (int i=0; i<H; i++)
+            for (int j=0; j<W; j++) {
+                int diff = base[i][j] - b2[i][j];
+                if (ABS(diff)>params->sharpening.threshold) {
+                    int val = lab->L[i][j] + params->sharpening.amount * diff / 100;
+                    lab->L[i][j] = CLIP(val);
+                }
+            }
+    }
+    else
+		sharpenHaloCtrl (lab, b2, base, W, H);
+
+    if (params->sharpening.edgesonly) {
+        for (int i=0; i<H; i++)
+            delete [] b3[i];
+        delete [] b3;
+    }
+}
+
+void ImProcFunctions::sharpenHaloCtrl (LabImage* lab, unsigned short** blurmap, unsigned short** base, int W, int H) {
+
+    int scale = 100 * (100-params->sharpening.halocontrol_amount);
+    unsigned short** nL = base;
+	#pragma omp parallel for if (multiThread)
+    for (int i=2; i<H-2; i++) {
+        int max1 = 0, max2 = 0, min1 = 0, min2 = 0, maxn, minn, np1, np2, np3, min, max;
+        for (int j=2; j<W-2; j++) {
+            int diff = base[i][j] - blurmap[i][j];
+            if (ABS(diff) > params->sharpening.threshold) {
+                // compute maximum/minimum in a delta environment
+                np1 = 2*(nL[i-2][j] + nL[i-2][j+1] + nL[i-2][j+2] + nL[i-1][j] + nL[i-1][j+1] + nL[i-1][j+2] + nL[i][j] + nL[i][j+1] + nL[i][j+2]) / 27 + nL[i-1][j+1] / 3;
+                np2 = 2*(nL[i-1][j] + nL[i-1][j+1] + nL[i-1][j+2] + nL[i][j] + nL[i][j+1] + nL[i][j+2] + nL[i+1][j] + nL[i+1][j+1] + nL[i+1][j+2]) / 27 + nL[i][j+1] / 3;
+                np3 = 2*(nL[i][j] + nL[i][j+1] + nL[i][j+2] + nL[i+1][j] + nL[i+1][j+1] + nL[i+1][j+2] + nL[i+2][j] + nL[i+2][j+1] + nL[i+2][j+2]) / 27 + nL[i+1][j+1] / 3;
+                MINMAX3(np1,np2,np3,maxn,minn);
+                MAX3(max1,max2,maxn,max);
+                MIN3(min1,min2,minn,min);
+                max1 = max2; max2 = maxn;
+                min1 = min2; min2 = minn;
+                if (max < lab->L[i][j])
+                    max = lab->L[i][j];
+                if (min > lab->L[i][j])
+                    min = lab->L[i][j];
+                int val = lab->L[i][j] + params->sharpening.amount * diff / 100;
+                int newL = CLIP(val);
+                // applying halo control
+                if (newL > max)
+                    newL = max + (newL-max) * scale / 10000;
+                else if (newL<min)
+                    newL = min - (min-newL) * scale / 10000;
+                lab->L[i][j] = newL;
+            }
+        }
+    }
+}
+
+}
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@ -0,0 +1,821 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <rtengine.h>
+#include <improcfun.h>
+#include <omp.h>
+
+namespace rtengine {
+
+#undef CMAXVAL
+#undef MAX
+#undef MIN
+#undef CLIP
+#undef CLIPTOC
+
+#define CMAXVAL 0xffff
+#define MAX(a,b) ((a)<(b)?(b):(a))
+#define MIN(a,b) ((a)>(b)?(b):(a))
+#define CLIP(a) ((a)>0?((a)<CMAXVAL?(a):CMAXVAL):0)
+#define CLIPTOC(a,b,c,d) ((a)>=(b)?((a)<=(c)?(a):((c),d=true)):((b),d=true))
+
+extern const Settings* settings;
+
+void ImProcFunctions::vignetting_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
+
+  int oW = sizes.oW;
+  int oH = sizes.oH;
+  int cx = sizes.cx;
+  int cy = sizes.cy;
+
+  double  w2 = (double) oW  / 2.0 - 0.5;
+  double  h2 = (double) oH  / 2.0 - 0.5;
+
+  double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2;
+
+  double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
+  double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
+
+  double mul = (1.0-v) / tanh(b);
+
+  int val;
+  for (int y=row_from; y<row_to; y++) {
+      double y_d = (double) (y + cy) - h2 ;
+      for (int x=0; x<transformed->width; x++) {
+          double x_d = (double) (x + cx) - w2 ;
+          double r = sqrt(x_d*x_d + y_d*y_d);
+          double vign = v + mul * tanh (b*(maxRadius-r) / maxRadius);
+          val = original->r[y][x] / vign;
+          transformed->r[y][x] = CLIP(val);
+          val =  original->g[y][x] / vign;
+          transformed->g[y][x] = CLIP(val);
+          val = original->b[y][x] / vign;
+          transformed->b[y][x] = CLIP(val);
+      }
+  }
+}
+
+void ImProcFunctions::vignetting (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int oW, int oH) {
+
+    STemp sizes;
+    sizes.cx = cx;
+    sizes.cy = cy;
+    sizes.oW = oW;
+    sizes.oH = oH;
+
+    if (settings->dualThreadEnabled) {
+        Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::vignetting_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+        Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::vignetting_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+        thread1->join ();
+        thread2->join ();
+    }
+    else
+        vignetting_ (original, transformed, params, sizes, 0, transformed->height);
+}
+
+#include "cubint.cc"
+void ImProcFunctions::transform_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
+
+  int oW = sizes.oW;
+  int oH = sizes.oH;
+  int cx = sizes.cx;
+  int cy = sizes.cy;
+  int sx = sizes.sx;
+  int sy = sizes.sy;
+
+  double  w2 = (double) oW  / 2.0 - 0.5;
+  double  h2 = (double) oH  / 2.0 - 0.5;
+
+  double cost = cos(params->rotate.degree * 3.14/180.0);
+  double sint = sin(params->rotate.degree * 3.14/180.0);
+
+  double  max_x = (double) (sx + original->width - 1);
+  double  max_y = (double) (sy + original->height - 1);
+  double  min_x = (double) sx;
+  double  min_y = (double) sy;
+
+  const int n2 = 2;
+  const int n = 4;
+
+  int mix  = original->width - 1; // maximum x-index src
+  int miy  = original->height - 1;// maximum y-index src
+  int mix2 = mix +1 - n;
+  int miy2 = miy +1 - n;
+
+  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
+  double radius = sqrt( (double)( oW*oW + oH*oH ) );
+  radius /= (oW<oH) ? oW : oH;
+
+  double a = params->distortion.amount;
+
+  double d = 1.0 - a;
+
+    // magnify image to keep size
+    double rotmagn = 1.0;
+    if (params->rotate.fill) {
+        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
+        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
+    }
+    // 1. check upper and lower border
+    double d1 = rotmagn - a*h2/scale;
+    double d2 = rotmagn - a*w2/scale;
+    double d3 = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
+    d = MIN(d,MIN(d1,MIN(d2,d3)));
+
+    // auxilary variables for vignetting
+    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
+
+    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
+    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
+
+    double mul = (1.0-v) / tanh(b);
+
+    // main cycle
+    double eps = 1e-10;
+    bool calc_r=( (fabs(a)>eps) || (fabs(1.0-v)>eps) );
+    bool do_vign = (fabs(1.0-v)>eps);
+
+    for (int y=row_from; y<row_to; y++) {
+        double y_d = (double) (y + cy) - h2 ;
+        for (int x=0; x<transformed->width; x++) {
+            double x_d = (double) (x + cx) - w2 ;
+
+            double r=0.0;
+            double s = d;//10000.0;
+	    if (calc_r)
+	    {
+	            r=(sqrt(x_d*x_d + y_d*y_d)) / scale;
+	            if (r<radius)
+	            s += a * r ;
+	    }
+
+            double Dx = s*(x_d * cost - y_d * sint) + w2;
+            double Dy = s*(x_d * sint + y_d * cost) + h2;
+
+            if (fabs(Dx)<eps) Dx = 0;
+            if (fabs(Dy)<eps) Dy = 0;
+            if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
+            if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
+
+            bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
+
+            // Convert only valid pixels
+            if (valid) {
+                // Extract integer and fractions of source screen coordinates
+                int xc  =  (int) (Dx); Dx -= (double)xc;
+                int yc  =  (int) (Dy); Dy -= (double)yc;
+                int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
+                int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
+
+                double vignmul = 1.0;
+		if (do_vign) vignmul /= (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
+
+                if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)   // all interpolation pixels inside image
+                    cubint (original, xs, ys, Dx, Dy, &(transformed->r[y][x]), &(transformed->g[y][x]), &(transformed->b[y][x]), vignmul);
+                else { // edge pixels
+                    int y1 = (yc>0) ? yc : 0;
+                    if (y1>miy) y1 = miy;
+                    int y2 = (yc<miy) ? yc+1 : miy;
+                    if (y2<0) y2 = 0;
+                    int x1 = (xc>0) ? xc : 0;
+                    if (x1>mix) x1 = mix;
+                    int x2 = (xc<mix) ? xc+1 : mix;
+                    if (x2<0) x2 = 0;
+                    int r = vignmul*(original->r[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->r[y1][x2]*Dx*(1.0-Dy) + original->r[y2][x1]*(1.0-Dx)*Dy + original->r[y2][x2]*Dx*Dy);
+                    int g = vignmul*(original->g[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->g[y1][x2]*Dx*(1.0-Dy) + original->g[y2][x1]*(1.0-Dx)*Dy + original->g[y2][x2]*Dx*Dy);
+                    int b = vignmul*(original->b[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->b[y1][x2]*Dx*(1.0-Dy) + original->b[y2][x1]*(1.0-Dx)*Dy + original->b[y2][x2]*Dx*Dy);
+                    transformed->r[y][x] = CLIP(r);
+                    transformed->g[y][x] = CLIP(g);
+                    transformed->b[y][x] = CLIP(b);
+                }
+            }
+            else {
+                // not valid (source pixel x,y not inside source image, etc.)
+                transformed->r[y][x] = 0;
+                transformed->g[y][x] = 0;
+                transformed->b[y][x] = 0;
+            }
+        }
+    }
+}
+
+void ImProcFunctions::simpltransform_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
+
+  int oW = sizes.oW;
+  int oH = sizes.oH;
+  int cx = sizes.cx;
+  int cy = sizes.cy;
+  int sx = sizes.sx;
+  int sy = sizes.sy;
+
+  double  w2 = (double) oW  / 2.0 - 0.5;
+  double  h2 = (double) oH  / 2.0 - 0.5;
+
+  double cost = cos(params->rotate.degree * 3.14/180.0);
+  double sint = sin(params->rotate.degree * 3.14/180.0);
+
+  double  max_x = (double) (sx + original->width - 1);
+  double  max_y = (double) (sy + original->height - 1);
+  double  min_x = (double) sx;
+  double  min_y = (double) sy;
+
+  const int n2 = 2;
+  const int n = 2;
+
+  int mix  = original->width - 1; // maximum x-index src
+  int miy  = original->height - 1;// maximum y-index src
+  int mix2 = mix +1 - n;
+  int miy2 = miy +1 - n;
+
+  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
+  double radius = sqrt( (double)( oW*oW + oH*oH ) );
+  radius /= (oW<oH) ? oW : oH;
+
+  double a = params->distortion.amount;
+
+  double d = 1.0 - a;
+
+
+    // magnify image to keep size
+    double rotmagn = 1.0;
+    if (params->rotate.fill) {
+        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
+        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
+    }
+    // 1. check upper and lower border
+    double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
+    double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
+    double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
+    double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
+    double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
+    double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
+    double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
+    double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
+    double d1g = rotmagn - a*h2/scale;
+    double d2g = rotmagn - a*w2/scale;
+    double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
+    double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
+
+    d = MIN(dg,MIN(dr,db));
+
+    // auxilary variables for vignetting
+    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
+
+    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
+    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
+
+    double mul = (1.0-v) / tanh(b);
+
+    // main cycle
+    double eps = 1e-10;
+    bool calc_r=( (fabs(a)>eps) || (fabs(1.0-v)>eps) );
+    bool do_vign = (fabs(1.0-v)>eps);
+
+    for (int y=row_from; y<row_to; y++) {
+        double y_d = (double) (y + cy) - h2 ;
+        for (int x=0; x<transformed->width; x++) {
+            double x_d = (double) (x + cx) - w2 ;
+
+            double r=0.0;
+            double s = d;//10000.0;
+	    if (calc_r)
+	    {
+	            r=(sqrt(x_d*x_d + y_d*y_d)) / scale;
+	            if (r<radius)
+	            s += a * r ;
+	    }
+
+            double Dx = s*(x_d * cost - y_d * sint) + w2;
+            double Dy = s*(x_d * sint + y_d * cost) + h2;
+
+            if (fabs(Dx)<eps) Dx = 0;
+            if (fabs(Dy)<eps) Dy = 0;
+            if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
+            if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
+
+            bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
+
+            // Convert only valid pixels
+            if (valid) {
+                // Extract integer and fractions of source screen coordinates
+                int xc  =  (int) (Dx); Dx -= (double)xc;
+                int yc  =  (int) (Dy); Dy -= (double)yc;
+                int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
+                int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
+
+                double vignmul = 1.0;
+		if (do_vign) vignmul /= (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
+
+                if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2 && yc < miy-1) {   // all interpolation pixels inside image
+
+                    int r = vignmul*(original->r[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->r[yc][xc+1]*Dx*(1.0-Dy) + original->r[yc+1][xc]*(1.0-Dx)*Dy + original->r[yc+1][xc+1]*Dx*Dy);
+                    int g = vignmul*(original->g[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->g[yc][xc+1]*Dx*(1.0-Dy) + original->g[yc+1][xc]*(1.0-Dx)*Dy + original->g[yc+1][xc+1]*Dx*Dy);
+                    int b = vignmul*(original->b[yc][xc]*(1.0-Dx)*(1.0-Dy) + original->b[yc][xc+1]*Dx*(1.0-Dy) + original->b[yc+1][xc]*(1.0-Dx)*Dy + original->b[yc+1][xc+1]*Dx*Dy);
+                    transformed->r[y][x] = CLIP(r);
+                    transformed->g[y][x] = CLIP(g);
+                    transformed->b[y][x] = CLIP(b);
+                }
+                else { // edge pixels
+                    int y1 = (yc>0) ? yc : 0;
+                    if (y1>miy) y1 = miy;
+                    int y2 = (yc<miy) ? yc+1 : miy;
+                    if (y2<0) y2 = 0;
+                    int x1 = (xc>0) ? xc : 0;
+                    if (x1>mix) x1 = mix;
+                    int x2 = (xc<mix) ? xc+1 : mix;
+                    if (x2<0) x2 = 0;
+                    int r = vignmul*(original->r[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->r[y1][x2]*Dx*(1.0-Dy) + original->r[y2][x1]*(1.0-Dx)*Dy + original->r[y2][x2]*Dx*Dy);
+                    int g = vignmul*(original->g[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->g[y1][x2]*Dx*(1.0-Dy) + original->g[y2][x1]*(1.0-Dx)*Dy + original->g[y2][x2]*Dx*Dy);
+                    int b = vignmul*(original->b[y1][x1]*(1.0-Dx)*(1.0-Dy) + original->b[y1][x2]*Dx*(1.0-Dy) + original->b[y2][x1]*(1.0-Dx)*Dy + original->b[y2][x2]*Dx*Dy);
+                    transformed->r[y][x] = CLIP(r);
+                    transformed->g[y][x] = CLIP(g);
+                    transformed->b[y][x] = CLIP(b);
+                }
+            }
+            else {
+                // not valid (source pixel x,y not inside source image, etc.)
+                transformed->r[y][x] = 0;
+                transformed->g[y][x] = 0;
+                transformed->b[y][x] = 0;
+            }
+        }
+    }
+}
+
+
+#include "cubintch.cc"
+void ImProcFunctions::transform_sep_ (Image16* original, Image16* transformed, const ProcParams* params, STemp sizes, int row_from, int row_to) {
+
+  int oW = sizes.oW;
+  int oH = sizes.oH;
+  int cx = sizes.cx;
+  int cy = sizes.cy;
+  int sx = sizes.sx;
+  int sy = sizes.sy;
+
+  double  w2 = (double) oW  / 2.0 - 0.5;
+  double  h2 = (double) oH  / 2.0 - 0.5;
+
+  double cost = cos(params->rotate.degree * 3.14/180.0);
+  double sint = sin(params->rotate.degree * 3.14/180.0);
+
+  double  max_x = (double) (sx + original->width - 1);
+  double  max_y = (double) (sy + original->height - 1);
+  double  min_x = (double) sx;
+  double  min_y = (double) sy;
+
+  const int n2 = 2;
+  const int n = 4;
+
+  int mix  = original->width - 1; // maximum x-index src
+  int miy  = original->height - 1;// maximum y-index src
+  int mix2 = mix +1 - n;
+  int miy2 = miy +1 - n;
+
+  double scale = (oW>oH) ? (double)oW / 2.0 : (double)oH / 2.0 ;
+  double radius = sqrt( (double)( oW*oW + oH*oH ) );
+  radius /= (oW<oH) ? oW : oH;
+
+  double a = params->distortion.amount;
+    double d = 1.0 - a;
+
+    double cdist[3];
+    cdist[0] = params->cacorrection.red;
+    cdist[1] = 0.0;
+    cdist[2] = params->cacorrection.blue;
+
+    // magnify image to keep size
+    double rotmagn = 1.0;
+    if (params->rotate.fill) {
+        double beta = atan((double)MIN(oH,oW)/MAX(oW,oH));
+        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
+    }
+    // 1. check upper and lower border
+    double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
+    double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
+    double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
+    double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
+    double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
+    double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
+    double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
+    double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
+    double d1g = rotmagn - a*h2/scale;
+    double d2g = rotmagn - a*w2/scale;
+    double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
+    double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
+
+    d = MIN(dg,MIN(dr,db));
+
+    unsigned short** chorig[3];
+    chorig[0] = original->r;
+    chorig[1] = original->g;
+    chorig[2] = original->b;
+
+    unsigned short** chtrans[3];
+    chtrans[0] = transformed->r;
+    chtrans[1] = transformed->g;
+    chtrans[2] = transformed->b;
+
+
+    // auxilary variables for vignetting
+    double maxRadius = sqrt( (double)( oW*oW + oH*oH ) ) / 2 / scale;
+
+    double v = 1.0 - params->vignetting.amount * 3.0 / 400.0;
+    double b = 1.0 + params->vignetting.radius * 7.0 / 100.0;
+
+    double mul = (1.0-v) / tanh(b);
+
+    // main cycle
+    double eps = 1e-10;
+    for (int y=row_from; y<row_to; y++) {
+        double y_d = (double) (y + cy) - h2 ;
+        for (int x=0; x<transformed->width; x++) {
+            double x_d = (double) (x + cx) - w2 ;
+
+            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
+            double s = 10000.0;
+            if (r<radius)
+	            s = a * r + d;
+
+            double vignmul = 1.0 / (v + mul * tanh (b*(maxRadius-s*r) / maxRadius));
+
+            for (int c=0; c<3; c++) {
+
+                double Dx = (s + cdist[c]) * (x_d * cost - y_d * sint) + w2;
+                double Dy = (s + cdist[c]) * (x_d * sint + y_d * cost) + h2;
+
+                if (fabs(Dx)<eps) Dx = 0;
+                if (fabs(Dy)<eps) Dy = 0;
+                if (fabs(Dx-max_x)<eps) Dx = nextafter(max_x,0);
+                if (fabs(Dy-max_y)<eps) Dy = nextafter(max_y,0);
+
+                bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
+
+                // Convert only valid pixels
+                if (valid) {
+                    // Extract integer and fractions of source screen coordinates
+                    int xc  =  (int) (Dx); Dx -= (double)xc;
+                    int yc  =  (int) (Dy); Dy -= (double)yc;
+                    int ys = yc +1 - n2 - sy; // smallest y-index used for interpolation
+                    int xs = xc +1 - n2 - sx; // smallest x-index used for interpolation
+
+                    if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)  // all interpolation pixels inside image
+                        cubintch (chorig[c], xs, ys, Dx, Dy, &(chtrans[c][y][x]), vignmul);
+                    else {// edge pixels, linear interpolation
+                        int y1 = (yc>0) ? yc : 0;
+                        if (y1>miy) y1 = miy;
+                        int y2 = (yc<miy) ? yc+1 : miy;
+                        if (y2<0) y2 = 0;
+                        int x1 = (xc>0) ? xc : 0;
+                        if (x1>mix) x1 = mix;
+                        int x2 = (xc<mix) ? xc+1 : mix;
+                        if (x2<0) x2 = 0;
+                        int val = vignmul*(chorig[c][y1][x1]*(1.0-Dx)*(1.0-Dy) + chorig[c][y1][x2]*Dx*(1.0-Dy) + chorig[c][y2][x1]*(1.0-Dx)*Dy + chorig[c][y2][x2]*Dx*Dy);
+                        chtrans[c][y][x] = CLIP(val);
+                    }
+                }
+                else // not valid (source pixel x,y not inside source image, etc.)
+                    chtrans[c][y][x] = 0;
+            }
+        }
+    }
+}
+
+bool ImProcFunctions::transCoord (const ProcParams* params, int W, int H, std::vector<Coord2D> &src, std::vector<Coord2D> &red,  std::vector<Coord2D> &green, std::vector<Coord2D> &blue) {
+
+    bool clipresize = true;
+    bool clipped = false;
+
+    red.clear ();
+    green.clear ();
+    blue.clear ();
+    bool needstransform  = 0;// fabs(params->rotate.degree)>1e-15 || fabs(params->distortion.amount)>1e-15 || fabs(params->cacorrection.red)>1e-15 || fabs(params->cacorrection.blue)>1e-15;
+    if (!needstransform) {
+        if (clipresize) {
+            // Apply resizing
+            if (fabs(params->resize.scale-1.0)>=1e-7) {
+                for (int i=0; i<src.size(); i++) {
+                    red.push_back   (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
+                    green.push_back (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
+                    blue.push_back  (Coord2D (src[i].x / params->resize.scale, src[i].y / params->resize.scale));
+                }
+                for (int i=0; i<src.size(); i++) {
+                    red[i].x = CLIPTOC(red[i].x,0,W-1,clipped);
+                    red[i].y = CLIPTOC(red[i].y,0,H-1,clipped);
+                    green[i].x = CLIPTOC(green[i].x,0,W-1,clipped);
+                    green[i].y = CLIPTOC(green[i].y,0,H-1,clipped);
+                    blue[i].x = CLIPTOC(blue[i].x,0,W-1,clipped);
+                    blue[i].y = CLIPTOC(blue[i].y,0,H-1,clipped);
+                }
+            }
+            else
+                for (int i=0; i<src.size(); i++) {
+                    red.push_back   (Coord2D (src[i].x, src[i].y));
+                    green.push_back (Coord2D (src[i].x, src[i].y));
+                    blue.push_back  (Coord2D (src[i].x, src[i].y));
+                }
+        }
+        return clipped;
+    }
+    double rW = W*params->resize.scale;
+    double rH = H*params->resize.scale;
+    double  w2 = (double) rW  / 2.0 - 0.5;
+    double  h2 = (double) rH  / 2.0 - 0.5;
+    double cost = cos(params->rotate.degree * 3.14/180.0);
+    double sint = sin(params->rotate.degree * 3.14/180.0);
+
+    double scale = (rW>rH) ? rW / 2.0 : rH / 2.0 ;
+    double radius = sqrt ((double)(rW*rW + rH*rH ));
+    radius /= (rW<rH) ? rW : rH;
+    double a = params->distortion.amount;
+    double d = 1.0 - a;
+
+    // magnify image to keep size
+    double rotmagn = 1.0;
+    if (params->rotate.fill) {
+        double beta = atan(MIN(rH,rW)/MAX(rW,rH));
+        rotmagn = sin(beta) / sin(fabs(params->rotate.degree) * 3.14/180.0 + beta);
+    }
+    if (params->cacorrection.red==0 && params->cacorrection.blue==0) {
+        // 1. check upper and lower border
+        double d1 = rotmagn - a*h2/scale;
+        double d2 = rotmagn - a*w2/scale;
+        double d3 = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
+        d = MIN(d,MIN(d1,MIN(d2,d3)));
+
+        for (int i=0; i<src.size(); i++) {
+            double y_d = src[i].y - h2 ;
+            double x_d = src[i].x - w2 ;
+            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
+            double s = 10000.0;
+            if (r<radius)
+                s = a * r + d;
+            red.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
+            green.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
+            blue.push_back (Coord2D(s*(x_d * cost - y_d * sint) + w2, s*(x_d * sint + y_d * cost) + h2));
+        }
+    }
+    else {
+        double cdist[3];
+        cdist[0] = params->cacorrection.red;
+        cdist[1] = 0.0;
+        cdist[2] = params->cacorrection.blue;
+
+        // 1. check upper and lower border
+        double d1r = rotmagn - a*h2/scale - params->cacorrection.red;
+        double d2r = rotmagn - a*w2/scale - params->cacorrection.red;
+        double d3r = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.red;
+        double dr = MIN(d,MIN(d1r,MIN(d2r,d3r)));
+        double d1b = rotmagn - a*h2/scale - params->cacorrection.blue;
+        double d2b = rotmagn - a*w2/scale - params->cacorrection.blue;
+        double d3b = rotmagn - a*sqrt(h2*h2+w2*w2) / scale - params->cacorrection.blue;
+        double db = MIN(d,MIN(d1b,MIN(d2b,d3b)));
+        double d1g = rotmagn - a*h2/scale;
+        double d2g = rotmagn - a*w2/scale;
+        double d3g = rotmagn - a*sqrt(h2*h2+w2*w2) / scale;
+        double dg = MIN(d,MIN(d1g,MIN(d2g,d3g)));
+
+        d = MIN(dg,MIN(dr,db));
+
+        for (int i=0; i<src.size(); i++) {
+            double y_d = src[i].y - h2 ;
+            double x_d = src[i].x - w2 ;
+            double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
+            double s = 10000.0;
+            if (r<radius)
+                s = a * r + d;
+            src[i].x = s*(x_d * cost - y_d * sint) + w2;
+            src[i].y  = s*(x_d * sint + y_d * cost) + h2;
+
+            red.push_back (Coord2D((s+cdist[0])*(x_d * cost - y_d * sint) + w2, (s+cdist[0])*(x_d * sint + y_d * cost) + h2));
+            green.push_back (Coord2D((s+cdist[1])*(x_d * cost - y_d * sint) + w2, (s+cdist[1])*(x_d * sint + y_d * cost) + h2));
+            blue.push_back (Coord2D((s+cdist[2])*(x_d * cost - y_d * sint) + w2, (s+cdist[2])*(x_d * sint + y_d * cost) + h2));
+        }
+    }
+
+    if (clipresize) {
+        if (fabs(params->resize.scale-1.0)>=1e-7) {
+            for (int i=0; i<src.size(); i++) {
+                red[i].x /= params->resize.scale;
+                red[i].y /= params->resize.scale;
+                green[i].x /= params->resize.scale;
+                green[i].y /= params->resize.scale;
+                blue[i].x /= params->resize.scale;
+                blue[i].y /= params->resize.scale;
+            }
+        }
+        for (int i=0; i<src.size(); i++) {
+            red[i].x = CLIPTOC(red[i].x,0,W-1,clipped);
+            red[i].y = CLIPTOC(red[i].y,0,H-1,clipped);
+            green[i].x = CLIPTOC(green[i].x,0,W-1,clipped);
+            green[i].y = CLIPTOC(green[i].y,0,H-1,clipped);
+            blue[i].x = CLIPTOC(blue[i].x,0,W-1,clipped);
+            blue[i].y = CLIPTOC(blue[i].y,0,H-1,clipped);
+        }
+    }
+    return clipped;
+}
+
+bool ImProcFunctions::transCoord (const ProcParams* params, int W, int H, int x, int y, int w, int h, int& xv, int& yv, int& wv, int& hv) {
+
+    int x1 = x, y1 = y;
+    int x2 = x1 + w - 1;
+    int y2 = y1 + h - 1;
+
+    std::vector<Coord2D> corners (8);
+    corners[0].set (x1, y1);
+    corners[1].set (x1, y2);
+    corners[2].set (x2, y2);
+    corners[3].set (x2, y1);
+    corners[4].set ((x1+x2)/2, y1);
+    corners[5].set ((x1+x2)/2, y2);
+    corners[6].set (x1, (y1+y2)/2);
+    corners[7].set (x2, (y1+y2)/2);
+
+    std::vector<Coord2D> r, g, b;
+
+    bool result = transCoord (params, W, H, corners, r, g, b);
+
+    std::vector<Coord2D> transCorners;
+    transCorners.insert (transCorners.end(), r.begin(), r.end());
+    transCorners.insert (transCorners.end(), g.begin(), g.end());
+    transCorners.insert (transCorners.end(), b.begin(), b.end());
+
+    double x1d = transCorners[0].x;
+    for (int i=1; i<transCorners.size(); i++)
+        if (transCorners[i].x<x1d)
+            x1d = transCorners[i].x;
+   int x1v = (int)(x1d);
+
+    double y1d = transCorners[0].y;
+    for (int i=1; i<transCorners.size(); i++)
+        if (transCorners[i].y<y1d)
+            y1d = transCorners[i].y;
+    int y1v = (int)(y1d);
+
+    double x2d = transCorners[0].x;
+    for (int i=1; i<transCorners.size(); i++)
+        if (transCorners[i].x>x2d)
+            x2d = transCorners[i].x;
+    int x2v = (int)ceil(x2d);
+
+    double y2d = transCorners[0].y;
+    for (int i=1; i<transCorners.size(); i++)
+        if (transCorners[i].y>y2d)
+            y2d = transCorners[i].y;
+    int y2v = (int)ceil(y2d);
+
+    xv = x1v;
+    yv = y1v;
+    wv = x2v - x1v + 1;
+    hv = y2v - y1v + 1;
+
+    return result;
+}
+
+void ImProcFunctions::transform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH) {
+
+    STemp sizes;
+    sizes.cx = 0;//cx;
+    sizes.cy = 0;//cy;
+    sizes.oW = oW;
+    sizes.oH = oH;
+    sizes.sx = 0;//sx;
+    sizes.sy = 0;//sy;
+
+    if (params->cacorrection.red==0 && params->cacorrection.blue==0) {
+        if (settings->dualThreadEnabled) {
+            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+            thread1->join ();
+            thread2->join ();
+        }
+        else
+            transform_ (original, transformed, params, sizes, 0, transformed->height);
+    }
+    else {
+        if (settings->dualThreadEnabled) {
+            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_sep_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::transform_sep_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+            thread1->join ();
+            thread2->join ();
+        }
+        else
+            transform_sep_ (original, transformed, params, sizes, 0, transformed->height);
+    }
+}
+
+void ImProcFunctions::simpltransform (Image16* original, Image16* transformed, const ProcParams* params, int cx, int cy, int sx, int sy, int oW, int oH) {
+
+    STemp sizes;
+    sizes.cx = 0;//cx;
+    sizes.cy = 0;//cy;
+    sizes.oW = oW;
+    sizes.oH = oH;
+    sizes.sx = 0;//sx;
+    sizes.sy = 0;//sy;
+
+    if (settings->dualThreadEnabled) {
+        Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::simpltransform_), original, transformed, params, sizes, 0, transformed->height/2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+        Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::mem_fun(*this, &ImProcFunctions::simpltransform_), original, transformed, params, sizes, transformed->height/2, transformed->height), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
+        thread1->join ();
+        thread2->join ();
+    }
+    else
+        simpltransform_ (original, transformed, params, sizes, 0, transformed->height);
+}
+/*void ImProcFunctions::transform (Image16* original, Image16* transformed, const ProcParams* params, int ox, int oy) {
+
+  if (!transformed)
+    return;
+
+  int oW = W, oH = H, tW = W, tH = H;
+
+  double  w2 = (double) tW / 2.0 - 0.5;
+  double  h2 = (double) tH / 2.0 - 0.5;
+  double  sw2 = (double) oW  / 2.0 - 0.5;
+  double  sh2 = (double) oH  / 2.0 - 0.5;
+
+  double cost = cos(params->rotate_fine * 3.14/180.0);
+  double sint = sin(params->rotate_fine * 3.14/180.0);
+
+  double  max_x = (double) oW;
+  double  max_y = (double) oH;
+  double  min_x =  0.0;
+  double  min_y =  0.0;
+
+  const int n2 = 2;
+  const int n = 4;
+
+  int mix  = oW - 1; // maximum x-index src
+  int miy  = oH - 1;// maximum y-index src
+  int mix2 = mix +1 - n;
+  int miy2 = miy +1 - n;
+
+  double scale = (tW>tH) ? (double)tW / 2.0 : (double)tH / 2.0 ;
+  double radius = sqrt( (double)( tW*tW + tH*tH ) );
+  radius /= (tW<tH) ? tW : tH;
+
+  double a = params->lens_distortion;
+
+  for (int y=0; y<transformed->height; y++) {
+    double y_d = (double) y + oy - h2 ;
+
+    for (int x=0; x<transformed->width; x++) {
+      double x_d = (double) x + ox - w2 ;
+
+      double r = (sqrt(x_d*x_d + y_d*y_d)) / scale;
+      double s = 10000.0;
+      if (r<radius)
+	        s = a * r + 1.0 - a;
+
+      double Dx = s*(x_d * cost - y_d * sint) + sw2;
+      double Dy = s*(x_d * sint + y_d * cost) + sh2;
+
+      bool valid = !((Dx >= max_x)   || (Dy >= max_y) || (Dx < min_x) || (Dy < min_y));
+
+      // Convert only valid pixels
+      if (valid) {
+        // Extract integer and fractions of source screen coordinates
+        int xc  =  (int) floor (Dx) ; Dx -= (double)xc;
+        int yc  =  (int) floor (Dy) ; Dy -= (double)yc;
+        int ys = yc +1 - n2 ; // smallest y-index used for interpolation
+        int xs = xc +1 - n2 ; // smallest x-index used for interpolation
+
+        unsigned short sr[2][2], sg[2][2], sb[2][2];
+
+        if (ys >= 0 && ys <= miy2 && xs >= 0 && xs <= mix2)   // all interpolation pixels inside image
+          cubint (original, xs, ys, Dx, Dy, &(transformed->r[y][x]), &(transformed->g[y][x]), &(transformed->b[y][x]));
+        else { // edge pixels
+          transformed->r[y][x] = 0;
+          transformed->g[y][x] = 0;
+          transformed->b[y][x] = 0;
+        }
+      }
+      else {
+        // not valid (source pixel x,y not inside source image, etc.)
+        transformed->r[y][x] = 0;
+        transformed->g[y][x] = 0;
+        transformed->b[y][x] = 0;
+      }
+    }
+  }
+}*/
+
+}
+
--- a/rtengine/labimage.cc
+++ b/rtengine/labimage.cc
@ -0,0 +1,42 @@
+#include <labimage.h>
+namespace rtengine {
+
+LabImage::LabImage (int w, int h) : W(w), H(h), fromImage(false) {
+
+    L = new unsigned short*[H];
+    for (int i=0; i<H; i++)
+        L[i] = new unsigned short[W];
+
+    a = new short*[H];
+    for (int i=0; i<H; i++)
+        a[i] = new short[W];
+
+    b = new short*[H];
+    for (int i=0; i<H; i++)
+        b[i] = new short[W];
+}
+
+LabImage::LabImage (Image16* im) {
+
+    W = im->width;
+    H = im->height;
+    L = im->r;
+    a = (short**) im->g;
+    b = (short**) im->b;
+    fromImage = true;
+}
+
+LabImage::~LabImage () {
+
+    if (!fromImage) {
+        for (int i=0; i<H; i++) {
+            delete [] L[i];
+            delete [] a[i];
+            delete [] b[i];
+        }
+        delete [] L;
+        delete [] a;
+        delete [] b;
+    }
+}
+}
--- a/rtengine/bilateral2.cc
+++ b/rtengine/bilateral2.cc
@ -16,20 +16,26 @@
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <bilateral2.h>
+#ifndef _LABIMAGE_H_
+#define _LABIMAGE_H_

-void bilateral_unsigned (unsigned short** src, unsigned short** dst, unsigned short** buffer, Dim dim, double sigma, double sens) {
+#include <image16.h>

-    bilateral<unsigned short, unsigned int> (src, dst, buffer, dim, sigma, sens);
+namespace rtengine {
+
+class LabImage {
+    private:
+        bool fromImage;
+
+    public:
+        int W, H;
+        unsigned short** L;
+        short** a;
+        short** b;
+
+     LabImage (int w, int h);
+     LabImage (Image16* im);
+    ~LabImage ();
+};
 }
-
-void bilateral_signed (short** src, short** dst, short** buffer, Dim dim, double sigma, double sens) {
-
-    bilateral<short, int> (src, dst, buffer, dim, sigma, sens);
-}
-
-void bilateral_box_unsigned (unsigned short** src, unsigned short** dst, int W, int H, int sigmar, double sigmas, bilateralparams row) {
-
-    bilateral<unsigned short> (src, dst, W, H, sigmar, sigmas, row.row_from, row.row_to);
-}
-
+#endif
--- a/rtengine/rtthumbnail.cc
+++ b/rtengine/rtthumbnail.cc
@ -206,7 +206,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
        gmi = 1024.0 * gm / mul_lum;
        bmi = 1024.0 * bm / mul_lum;  
    }
-    // resize to requested with and perform coarse transformation
+    // resize to requested width and perform coarse transformation
    int rwidth;
    if (params.coarse.rotate==90 || params.coarse.rotate==270) {
        rwidth = rheight;
@ -269,7 +269,9 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    int fw = baseImg->width;
    int fh = baseImg->height;

-    ImProcFunctions ipf;
+    ImProcFunctions ipf (&params, false);
+    ipf.setScale (sqrt(double(fw*fw+fh*fh))/sqrt(double(thumbImg->width*thumbImg->width+thumbImg->height*thumbImg->height))*scale);
+
    unsigned int* hist16 = new unsigned int [65536];
    ipf.firstAnalysis (baseImg, &params, hist16, isRaw ? 2.2 : 0.0);

@ -299,7 +301,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
            for (int i=0; i<fh; i++)
                buffer[i] = new unsigned short[fw];
        }
-        shmap = new SHMap (fw, fh);
+        shmap = new SHMap (fw, fh, false);
        double radius = sqrt (double(fw*fw+fh*fh)) / 2.0;
        double shradius = radius / 1800.0 * params.sh.radius;
        shmap->update (baseImg, buffer, shradius, ipf.lumimul, params.sh.hq);
@ -321,7 +323,7 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    CurveFactory::complexCurve (br, bl/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, logDefGain, isRaw ? 2.2 : 0, true, params.toneCurve.curve, hist16, curve, NULL, 16);

    LabImage* labView = new LabImage (baseImg);
-    ipf.rgbProc (baseImg, labView, &params, curve, shmap);
+    ipf.rgbProc (baseImg, labView, curve, shmap);

    if (shmap)
        delete shmap;
@ -341,12 +343,11 @@ IImage8* Thumbnail::processImage (const procparams::ProcParams& params, int rhei
    delete [] hist16;

    // color processing
-    ipf.colorCurve (labView, labView, &params);
+    ipf.colorCurve (labView, labView);

    // obtain final image
    Image8* readyImg = new Image8 (fw, fh);
    ipf.lab2rgb (labView, readyImg);
-    ipf.release ();
    delete baseImg;

    // calculate scale
@ -453,8 +454,7 @@ void Thumbnail::getSpotWB (const procparams::ProcParams& params, int xp, int yp,
        fw = thumbImg->height;
        fh = thumbImg->width;
    }
-    ImProcFunctions ipf;  
-    ipf.transCoord (&params, fw, fh, points, red, green, blue);
+    ImProcFunctions::transCoord (&params, fw, fh, points, red, green, blue);
    int tr = TR_NONE;
    if (params.coarse.rotate==90)  tr |= TR_R90;
    if (params.coarse.rotate==180) tr |= TR_R180;
--- a/rtengine/shmap.cc
+++ b/rtengine/shmap.cc
@ -29,7 +29,7 @@ namespace rtengine {

 extern const Settings* settings;

-SHMap::SHMap (int w, int h) : W(w), H(h) {
+SHMap::SHMap (int w, int h, bool multiThread) : W(w), H(h), multiThread(multiThread) {

    map = new unsigned short*[H];
    for (int i=0; i<H; i++)
@ -52,51 +52,27 @@ void SHMap::update (Image16* img, unsigned short** buffer, double radius, double
 			map[i][j] = CLIP(val);
 		}

-//MyTime t1,t2;
-//t1.set ();
-
    if (!hq) {
+    	AlignedBuffer<double>* buffer = new AlignedBuffer<double> (MAX(W,H)*omp_get_max_threads());

-        AlignedBuffer<double>* buffer1 = new AlignedBuffer<double> (MAX(W,H)*5);
-        AlignedBuffer<double>* buffer2 = new AlignedBuffer<double> (MAX(W,H)*5);
+    	gaussHorizontal<unsigned short> (map, map, buffer, W, H, radius, multiThread);
+		gaussVertical<unsigned short>   (map, map, buffer, W, H, radius, multiThread);

-        // blur
-        if (settings->dualThreadEnabled) {
-            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussHorizontal_unsigned), map, map, buffer1, W, 0, H/2, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussHorizontal_unsigned), map, map, buffer2, W, H/2, H, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            thread1->join ();
-            thread2->join ();
-            thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussVertical_unsigned), map, map, buffer1, H, 0, W/2, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(gaussVertical_unsigned), map, map, buffer2, H, W/2, W, radius), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            thread1->join ();
-            thread2->join ();
-        }
-        else {
-            gaussHorizontal_unsigned (map, map, buffer1, W, 0, H, radius);
-            gaussVertical_unsigned (map, map, buffer1, H, 0, W, radius);
-        }    
-
-        delete buffer1;
-        delete buffer2;
+        delete buffer;
    }
    else {
-        if (settings->dualThreadEnabled) {
-            bilateralparams r1, r2;
-            r1.row_from = 0;
-            r1.row_to = H/2;
-            r2.row_from = H/2;
-            r2.row_to = H;
-            Glib::Thread *thread1 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(bilateral_box_unsigned), map, buffer, W, H, 8000, radius, r1), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            Glib::Thread *thread2 = Glib::Thread::create(sigc::bind(sigc::ptr_fun(bilateral_box_unsigned), map, buffer, W, H, 8000, radius, r2), 0, true, true, Glib::THREAD_PRIORITY_NORMAL);
-            thread1->join ();
-            thread2->join ();
-        }
-        else {
-            bilateralparams r1;
-            r1.row_from = 0;
-            r1.row_to = H;
-            bilateral_box_unsigned (map, buffer, W, H, 8000, radius, r1);
-        }
+		#pragma omp parallel if (multiThread)
+    	{
+    		int tid = omp_get_thread_num();
+    		int nthreads = omp_get_num_threads();
+    		int blk = H/nthreads;
+
+    		if (tid<nthreads-1)
+    			bilateral<unsigned short> (map, buffer, W, H, 8000, radius, tid*blk, (tid+1)*blk);
+    		else
+    			bilateral<unsigned short> (map, buffer, W, H, 8000, radius, tid*blk, H);
+		}
+        // anti-alias filtering the result
        for (int i=0; i<H; i++)
            for (int j=0; j<W; j++)
                if (i>0 && j>0 && i<H-1 && j<W-1)
@ -105,9 +81,6 @@ void SHMap::update (Image16* img, unsigned short** buffer, double radius, double
                    map[i][j] = buffer[i][j];
    }

-//    t2.set ();
-//    printf ("shmap: %d\n", t2.etime (t1));
-
    // update average, minimum, maximum
    double _avg = 0;
    int n = 1;
--- a/rtengine/shmap.h
+++ b/rtengine/shmap.h
@ -29,8 +29,9 @@ class SHMap {
        int W, H;
        unsigned short** map;
        unsigned short   max, min, avg;
+        bool multiThread;
        
-     SHMap (int w, int h);
+     SHMap (int w, int h, bool multiThread);
    ~SHMap ();

    void update (Image16* img, unsigned short** buffer, double radius, double lumi[3], bool hq);
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@ -69,7 +69,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    int fw, fh;
    imgsrc->getFullSize (fw, fh, tr);

-    ImProcFunctions ipf;
+    ImProcFunctions ipf (&params, true);
    
    Image16* baseImg;
    PreviewProps pp (0, 0, fw, fh, 1);
@ -83,7 +83,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
        fw *= params.resize.scale;
        fh *= params.resize.scale;
        baseImg = new Image16 (fw, fh);
-        ipf.resize (oorig, baseImg, params.resize);
+        ipf.resize (oorig, baseImg);
        delete oorig;
    }
    if (pl) 
@ -117,7 +117,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p

    SHMap* shmap = NULL;
    if (params.sh.enabled) {
-        shmap = new SHMap (fw, fh);
+        shmap = new SHMap (fw, fh, true);
        double radius = sqrt (double(fw*fw+fh*fh)) / 2.0;
        double shradius = radius / 1800.0 * params.sh.radius;
        shmap->update (baseImg, (unsigned short**)buffer, shradius, ipf.lumimul, params.sh.hq);
@ -138,7 +138,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    CurveFactory::complexCurve (br, bl/65535.0, params.toneCurve.hlcompr, params.toneCurve.shcompr, params.toneCurve.brightness, params.toneCurve.contrast, imgsrc->getDefGain(), imgsrc->getGamma(), true, params.toneCurve.curve, hist16, curve, NULL);

    LabImage* labView = new LabImage (baseImg);
-    ipf.rgbProc (baseImg, labView, &params, curve, shmap);
+    ipf.rgbProc (baseImg, labView, curve, shmap);

    if (shmap)
        delete shmap;
@ -155,15 +155,15 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
    // luminance processing
    CurveFactory::complexCurve (0.0, 0.0, 0.0, 0.0, params.lumaCurve.brightness, params.lumaCurve.contrast, 0.0, 0.0, false, params.lumaCurve.curve, hist16, curve, NULL);
    ipf.luminanceCurve (labView, labView, curve, 0, fh);
-    ipf.lumadenoise (labView, &params, 1, buffer);
-    ipf.sharpening (labView, &params, 1, (unsigned short**)buffer);
+    ipf.lumadenoise (labView, buffer);
+    ipf.sharpening (labView, (unsigned short**)buffer);

    delete [] curve;
    delete [] hist16;

    // color processing
-    ipf.colorCurve (labView, labView, &params);
-    ipf.colordenoise (labView, &params, 1, buffer);
+    ipf.colorCurve (labView, labView);
+    ipf.colordenoise (labView, buffer);

    for (int i=0; i<fh; i++)
        delete [] buffer[i];
@ -182,7 +182,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
        ch = params.crop.h;
    }
    readyImg = ipf.lab2rgb16 (labView, cx, cy, cw, ch, params.icm.output);
-    ipf.release ();

    if (pl) 
        pl->setProgress (1.0);