diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index f1d331acd..0b4b90a31 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -25,7 +25,6 @@
 #include <math.h>
 #include <fftw3.h>
 #include "../rtgui/threadutils.h"
-
 #include "rtengine.h"
 #include "improcfun.h"
 #include "LUT.h"
@@ -36,19 +35,12 @@
 #include "mytime.h"
 #include "sleef.c"
 #include "opthelper.h"
+#include "cplx_wavelet_dec.h"
 
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
-#include "cplx_wavelet_dec.h"
-
-//#define MIN(a,b) ((a) < (b) ? (a) : (b))
-//#define MAX(a,b) ((a) > (b) ? (a) : (b))
-//#define LIM(x,min,max) MAX(min,MIN(x,max))
-//#define ULIM(x,y,z) ((y) < (z) ? LIM(x,y,z) : LIM(x,z,y))
-//#define CLIP(x) LIM(x,0,65535)
-
 #define TS 64		// Tile size
 #define offset 25	// shift between tiles
 #define fTS ((TS/2+1))	// second dimension of Fourier tiles
@@ -139,11 +131,10 @@ extern const Settings* settings;
 // Median calculation using quicksort
 float fq_sort2(float arr[], int n) 
 {
-    int low, high ;
-    int median;
-    int middle, ll, hh;
+    int low = 0;
+    int high = n-1;
+    int median = (low + high) / 2;
 
-    low = 0 ; high = n-1 ; median = (low + high) / 2;
     for (;;) {
         if (high <= low)
             return arr[median] ;
@@ -153,49 +144,47 @@ float fq_sort2(float arr[], int n)
            return arr[median] ;
         }
 
-    middle = (low + high) / 2;
-    if (arr[middle] > arr[high]) ELEM_FLOAT_SWAP(arr[middle], arr[high]) ;
-    if (arr[low] > arr[high]) ELEM_FLOAT_SWAP(arr[low], arr[high]) ;
-    if (arr[middle] > arr[low]) ELEM_FLOAT_SWAP(arr[middle], arr[low]) ;
+		int middle = (low + high) / 2;
+		if (arr[middle] > arr[high]) ELEM_FLOAT_SWAP(arr[middle], arr[high]) ;
+		if (arr[low] > arr[high]) ELEM_FLOAT_SWAP(arr[low], arr[high]) ;
+		if (arr[middle] > arr[low]) ELEM_FLOAT_SWAP(arr[middle], arr[low]) ;
 
-    ELEM_FLOAT_SWAP(arr[middle], arr[low+1]) ;
-    ll = low + 1;
-    hh = high;
+		ELEM_FLOAT_SWAP(arr[middle], arr[low+1]) ;
+		int ll = low + 1;
+		int hh = high;
 
-    for (;;) {
-        do ll++; while (arr[low] > arr[ll]) ;
-        do hh--; while (arr[hh]  > arr[low]) ;
+		for (;;) {
+			do ll++; while (arr[low] > arr[ll]) ;
+			do hh--; while (arr[hh]  > arr[low]) ;
 
-        if (hh < ll)
-        break;
+			if (hh < ll)
+			break;
 
-        ELEM_FLOAT_SWAP(arr[ll], arr[hh]) ;
-   }
+			ELEM_FLOAT_SWAP(arr[ll], arr[hh]) ;
+	   }
 
-    ELEM_FLOAT_SWAP(arr[low], arr[hh]) ;
+		ELEM_FLOAT_SWAP(arr[low], arr[hh]) ;
 
-    if (hh <= median)
-        low = ll;
-        if (hh >= median)
-        high = hh - 1;
+		if (hh <= median)
+			low = ll;
+			if (hh >= median)
+			high = hh - 1;
     }
  }
 
 float media(float *elements, int N)
 {
-  int i,j,min;
-  float temp;
 
   //   Order elements (only half of them)
-   for (i = 0; i < (N >> 1) + 1; ++i)
+   for (int i = 0; i < (N >> 1) + 1; ++i)
    {
       //   Find position of minimum element
-      min = i;
-      for (j = i + 1; j < N; ++j)
+      int min = i;
+      for (int j = i + 1; j < N; ++j)
          if (elements[j] < elements[min])
             min = j;
       //   Put found minimum element in its place
-      temp = elements[i];
+      float temp = elements[i];
       elements[i] = elements[min];
       elements[min] = temp;
    }
@@ -242,7 +231,7 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt
 }
 
 int denoiseNestedLevels = 1;
-
+enum nrquality {QUALITY_STANDARD, QUALITY_HIGH};
 
 SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi)
 {
@@ -250,7 +239,6 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
 	MyTime t1e,t2e;
 	t1e.set();
 //#endif
-
 	if (dnparams.luma==0 && dnparams.chroma==0  && !dnparams.median && !noiseLCurve && !noiseCCurve) {
 		//nothing to do; copy src to dst or do nothing in case src == dst
 		if(src != dst)
@@ -261,85 +249,107 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
 		}
 			
 		return;
-	}
+	}
+	
 	static MyMutex FftwMutex;
 	MyMutex::MyLock lock(FftwMutex);
-	int hei,wid;
+	
+	const nrquality nrQuality = (dnparams.smethod == "shal") ? QUALITY_STANDARD : QUALITY_HIGH;//shrink method
+	const float qhighFactor = (nrQuality == QUALITY_HIGH) ? 1.f/(float) settings->nrhigh : 1.0f;
+	const bool useNoiseCCurve = (noiseCCurve && noiseCCurve.getSum() > 5.f );
+	const bool useNoiseLCurve = (noiseLCurve && noiseLCurve.getSum() >= 7.f );
+
+	int hei;
 	float** lumcalc;
-	float** acalc;
-	float** bcalc;
-	if(noiseLCurve || noiseCCurve) {
+	float** ccalc;
+
+	bool ponder=false;
+	float ponderCC=1.f;
+	if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PON") {ponder=true;ponderCC=0.5f;}
+	if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PRE") {ponderCC=0.5f;}
+	if(settings->leveldnautsimpl==0 && params->dirpyrDenoise.Cmethod=="PREV") {ponderCC=0.5f;}
+
+	const bool denoiseMethodRgb = (dnparams.dmethod == "RGB");
+
+	// init luma noisevarL
+	const float noiseluma=(float) dnparams.luma;
+	const float noisevarL = (useNoiseLCurve && (denoiseMethodRgb || !isRAW)) ? (float) (SQR(((noiseluma+1.0)/125.0)*(10.+ (noiseluma+1.0)/25.0))) : (float) (SQR((noiseluma/125.0)*(1.0+ noiseluma/25.0)));
+
+	if(useNoiseLCurve || useNoiseCCurve) {
 		hei=calclum->height;
-		wid=calclum->width;
+		int wid=calclum->width;
 		TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
 
-		double wpi[3][3] = {
-			{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
-			{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
-			{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
+		const float wpi[3][3] = {
+			{static_cast<float>(wprofi[0][0]),static_cast<float>(wprofi[0][1]),static_cast<float>(wprofi[0][2])},
+			{static_cast<float>(wprofi[1][0]),static_cast<float>(wprofi[1][1]),static_cast<float>(wprofi[1][2])},
+			{static_cast<float>(wprofi[2][0]),static_cast<float>(wprofi[2][1]),static_cast<float>(wprofi[2][2])}
 			};
-		lumcalc = new float*[(hei+1)/2];
-			for (int i=0; i<(hei+1)/2; i++)
-				lumcalc[i] = new float[(wid+1)/2];
-		acalc = new float*[(hei+1)/2];
-			for (int i=0; i<(hei+1)/2; i++)
-				acalc[i] = new float[(wid+1)/2];
-		bcalc = new float*[(hei+1)/2];
-			for (int i=0; i<(hei+1)/2; i++)
-				bcalc[i] = new float[(wid+1)/2];
+		lumcalc = new float*[(hei)];
+			for (int i=0; i<(hei); i++)
+				lumcalc[i] = new float[(wid)];
+		ccalc = new float*[(hei)];
+			for (int i=0; i<(hei); i++)
+				ccalc[i] = new float[(wid)];
+
+		float cn100Precalc;
+		if(useNoiseCCurve)
+			cn100Precalc = SQR(1.f + ponderCC*(4.f*noiseCCurve[100.f / 60.f]));
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif		
-		for(int ii=0;ii<hei;ii+=2){
-			for(int jj=0;jj<wid;jj+=2){
+		for(int ii=0;ii<hei;ii++){
+			for(int jj=0;jj<wid;jj++){
 				float LLum,AAum,BBum;
 			
 				float RL = calclum->r(ii,jj);
 				float GL = calclum->g(ii,jj);
 				float BL = calclum->b(ii,jj);
-							// determine luminance for noisecurve
+				// determine luminance and chrominance for noisecurves
 				float XL,YL,ZL;
 				Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi);
-				Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);							
-				lumcalc[ii>>1][jj>>1]=LLum;
-				acalc[ii>>1][jj>>1]=AAum;
-				bcalc[ii>>1][jj>>1]=BBum;
+				Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);
+				if(useNoiseLCurve) {
+					float epsi = 0.01f;
+					if(LLum<2.f)
+						LLum = 2.f;//avoid divided by zero
+					if(LLum>32768.f)
+						LLum = 32768.f;	// not strictly necessary
+					float kinterm = epsi + noiseLCurve[xdivf(LLum,15)*500.f];
+					kinterm *= 100.f;
+					kinterm += noiseluma;
+					lumcalc[ii][jj] = SQR((kinterm/125.f)*(1.f+kinterm/25.f));
+				}
+				if(useNoiseCCurve) {
+					float cN = sqrtf(SQR(AAum)+SQR(BBum));
+					if(cN > 100)
+						ccalc[ii][jj] = SQR(1.f + ponderCC*(4.f*noiseCCurve[cN / 60.f]));
+					else
+						ccalc[ii][jj] = cn100Precalc;
+				}
 			}
-		}
+		}
+
 	delete calclum;
 	calclum = NULL;
-}		
-
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-	/*if (plistener) {
-	 plistener->setProgressStr ("Denoise...");
-	 plistener->setProgress (0.0);
-	 }*/
-
-//		volatile double progress = 0.0;
-
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
+	}
+
 	const short int imheight=src->height, imwidth=src->width;
-	Qhigh=1.0f;
-	if(dnparams.smethod=="shalbi")
-		Qhigh=1.f/(float) settings->nrhigh;
+
 	if (dnparams.luma!=0 || dnparams.chroma!=0 || dnparams.methodmed=="Lab" || dnparams.methodmed=="Lonly" ) {
-		const bool perf = (dnparams.dmethod=="RGB");
 		// gamma transform for input data
 		float gam = dnparams.gamma;
 		float gamthresh = 0.001f;
 		if(!isRAW) {//reduce gamma under 1 for Lab mode ==> TIF and JPG
-			if(gam <1.9f)
-				gam=1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
+			if(gam < 1.9f)
+				gam = 1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
 			else if (gam >= 1.9f && gam <= 3.f)
-				gam=(1.4f/1.1f)*gam - 1.41818f;
+				gam = (1.4f/1.1f)*gam - 1.41818f;
 		}
 		float gamslope = exp(log((double)gamthresh)/gam)/gamthresh;
 
 		LUTf gamcurve(65536,LUT_CLIP_BELOW);
-		if(perf) {
+		if(denoiseMethodRgb) {
 			for (int i=0; i<65536; i++) {
 				gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
 			}
@@ -355,7 +365,7 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
 		float igamslope = 1.f/gamslope;
 
 		LUTf igamcurve(65536,LUT_CLIP_BELOW);
-		if(perf) {
+		if(denoiseMethodRgb) {
 			for (int i=0; i<65536; i++) {
 				igamcurve[i] = (Color::gamma((float)i/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 			}
@@ -366,43 +376,58 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
 		}
 
 		const float gain = pow (2.0f, float(expcomp));
-		float incr=1.f;
-		float noisevar_Ldetail = SQR((float)(SQR(100.-dnparams.Ldetail) + 50.*(100.-dnparams.Ldetail)) * TS * 0.5f * incr);
-		bool enhance_denoise = dnparams.enhance;
-		int gamlab = settings->denoiselabgamma;//gamma lab essentialy for Luminance detail
-		if(gamlab > 2)
-			gamlab=2;
+		float noisevar_Ldetail = SQR((float)(SQR(100.-dnparams.Ldetail) + 50.*(100.-dnparams.Ldetail)) * TS * 0.5f);
+
 		if(settings->verbose)
-			printf("Denoise Lab=%i\n",gamlab);
-
+			printf("Denoise Lab=%i\n",settings->denoiselabgamma);
+
+		// To avoid branches in loops we access the gammatabs by pointers
+		// modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10
+		// we can put other as gamma g=2.6 slope=11, etc.
+		// but noting to do with real gamma !!!: it's only for data Lab # data RGB
+		// finally I opted fot gamma55 and with options we can change
+
+		LUTf *denoisegamtab;
+		LUTf *denoiseigamtab;
+		switch(settings->denoiselabgamma) {
+			case 0:	 denoisegamtab = &(Color::gammatab_26_11);
+					 denoiseigamtab = &(Color::igammatab_26_11);
+					 break;
+			case 1:  denoisegamtab = &(Color::gammatab_4);
+					 denoiseigamtab = &(Color::igammatab_4);
+					 break;
+			default: denoisegamtab = &(Color::gammatab_55);
+					 denoiseigamtab = &(Color::igammatab_55);
+					 break;
+		}
+
 		array2D<float> tilemask_in(TS,TS);
 		array2D<float> tilemask_out(TS,TS);
 
-		const int border = MAX(2,TS/16);
-		
-
+		const int border = MAX(2,TS/16);
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-	for (int i=0; i<TS; i++) {
-		float i1 = abs((i>TS/2 ? i-TS+1 : i));
-		float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-		float vmask2 = (i1<2*border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-		for (int j=0; j<TS; j++) {
-			float j1 = abs((j>TS/2 ? j-TS+1 : j));
-			tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-			tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
+		for (int i=0; i<TS; i++) {
+			float i1 = abs((i>TS/2 ? i-TS+1 : i));
+			float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
+			float vmask2 = (i1<2*border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
+			for (int j=0; j<TS; j++) {
+				float j1 = abs((j>TS/2 ? j-TS+1 : j));
+				tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
+				tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
 
+			}
 		}
-	}
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	// begin tile processing of image
 
 	//output buffer
-	Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
-	for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
-
+	Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
+	for (int n=0; n<3*imwidth*imheight; n++)
+		dsttmp->data[n] = 0;
+
 int tilesize;
 int overlap;
 if(settings->leveldnti ==0) {
@@ -451,12 +476,12 @@ if(settings->leveldnti ==1) {
 	fftwf_free ( Lbloxtmp );
 	fftwf_free ( fLbloxtmp );
 
+#ifndef _OPENMP
 	int numthreads = 1;
-
-#ifdef _OPENMP
+#else
 	// Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
 	int numtiles = numtiles_W * numtiles_H;
-	numthreads = MIN(numtiles,omp_get_max_threads());
+	int numthreads = MIN(numtiles,omp_get_max_threads());
 	if(options.rgbDenoiseThreadLimit > 0)
 		numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit);
 	denoiseNestedLevels = omp_get_max_threads() / numthreads;
@@ -483,17 +508,17 @@ if(settings->leveldnti ==1) {
 	TMatrix wiprof = iccStore->workingSpaceInverseMatrix (params->icm.working);
 	//inverse matrix user select
 	const float wip[3][3] = {
-		{wiprof[0][0],wiprof[0][1],wiprof[0][2]},
-		{wiprof[1][0],wiprof[1][1],wiprof[1][2]},
-		{wiprof[2][0],wiprof[2][1],wiprof[2][2]}
+		{static_cast<float>(wiprof[0][0]),static_cast<float>(wiprof[0][1]),static_cast<float>(wiprof[0][2])},
+		{static_cast<float>(wiprof[1][0]),static_cast<float>(wiprof[1][1]),static_cast<float>(wiprof[1][2])},
+		{static_cast<float>(wiprof[2][0]),static_cast<float>(wiprof[2][1]),static_cast<float>(wiprof[2][2])}
 	};
 
 	TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
 
 	const float wp[3][3] = {
-		{wprof[0][0],wprof[0][1],wprof[0][2]},
-		{wprof[1][0],wprof[1][1],wprof[1][2]},
-		{wprof[2][0],wprof[2][1],wprof[2][2]}
+		{static_cast<float>(wprof[0][0]),static_cast<float>(wprof[0][1]),static_cast<float>(wprof[0][2])},
+		{static_cast<float>(wprof[1][0]),static_cast<float>(wprof[1][1]),static_cast<float>(wprof[1][2])},
+		{static_cast<float>(wprof[2][0]),static_cast<float>(wprof[2][1]),static_cast<float>(wprof[2][2])}
 	};
 
 
@@ -502,20 +527,15 @@ if(settings->leveldnti ==1) {
 #endif
 	{
 	float resid=0.f;
-	float nbresid=0;
+	float nbresid=0.f;
 	float maxredresid=0.f;
 	float maxblueresid=0.f;
 	float residred=0.f;
 	float residblue=0.f;
 	
-	int pos;
-	float** noisevarlum = new float*[(tileheight+1)/2];
-	for (int i=0; i<(tileheight+1)/2; i++)
-		noisevarlum[i] = new float[(tilewidth+1)/2];
-	float** noisevarchrom = new float*[(tileheight+1)/2];
-	for (int i=0; i<(tileheight+1)/2; i++)
-		noisevarchrom[i] = new float[(tilewidth+1)/2];
-
+	int pos;
+	float* noisevarlum = new float[((tileheight+1)/2)*((tilewidth+1)/2)];
+	float* noisevarchrom = new float[((tileheight+1)/2)*((tilewidth+1)/2)];
 
 #ifdef _OPENMP
 #pragma omp for schedule(dynamic) collapse(2)
@@ -524,31 +544,13 @@ if(settings->leveldnti ==1) {
 	for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
 		for (int tileleft=0; tileleft<imwidth ; tileleft+=tileWskip) {
 			//printf("titop=%d tileft=%d\n",tiletop/tileHskip, tileleft/tileWskip);
-			pos= (tiletop/tileHskip)*numtiles_W + tileleft/tileWskip ;
-	//	    printf("posw=%d\n", pos);
-	/*		if(kall==0) {//with Dcrop
-				//int trafx, int trafy, int trafw, int trafh, // trafx trafy = begin window trafw trafh = width and heigh window
-				//int widIm, int heiIm : width and height full image
-				int centerwCrop=trafx+trafw/2;//center crop window
-				int centerhCrop=trafy+trafh/2;
-				int poscalculate; 
-		//	for(int wcr=0;wcr< ;wcr++) {
-		//	for(int hcr=0;hcr<numtiles_H;hcr++) {
-		//	    int posca= hcr*numtiles_W + wcr;
-		
-			}
-			*/
+			pos = (tiletop/tileHskip)*numtiles_W + tileleft/tileWskip ;
 			int tileright = MIN(imwidth,tileleft+tilewidth);
 			int tilebottom = MIN(imheight,tiletop+tileheight);
 			int width  = tileright-tileleft;
 			int height = tilebottom-tiletop;
-			bool ponder=false;
-			float ponderCC=1.f;
-			if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PON") {ponder=true;ponderCC=0.5f;}
-			if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PRE") {ponderCC=0.5f;}
-			if(settings->leveldnautsimpl==0 && params->dirpyrDenoise.Cmethod=="PREV") {ponderCC=0.5f;}
-			
-			float realred2, realred, realblue, realblue2;
+			int width2 = (width+1)/2;
+			float realred, realblue;
 			float interm_med =(float) dnparams.chroma/10.0;
 			float intermred, intermblue;
 			if(dnparams.redchro > 0.) intermred=(dnparams.redchro/10.); else intermred= (float) dnparams.redchro/7.0;//increase slower than linear for more sensit
@@ -561,74 +563,50 @@ if(settings->leveldnti ==1) {
 			const float noisevarab_b = SQR(realblue);
 		//	printf("Ch=%f red=%f blu=%f\n",interm_med, intermred,intermblue );
 
+			
+			//pixel weight
+			array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
+
+			//residual between input and denoised L channel
+			array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
+
 			//input L channel
 			array2D<float> Lin(width,height);
 			//wavelet denoised image
 			LabImage * labdn = new LabImage(width,height);
-			float* mad_LL = new float [height*width];
-
-			float* mad_aa = new float [height*width];
-			float* mad_bb = new float [height*width];
-
-			//residual between input and denoised L channel
-			array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
-			//pixel weight
-			array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
-			// init luma noisevarL
-			float noiseluma=(float) dnparams.luma;
-			if(perf && noiseLCurve)
-				noiseluma += 1.f;
-			float noisevarL	 = (float) (SQR((noiseluma/125.0)*(1.+ noiseluma/25.0)));
-		//	printf("nova=%f\n",noisevarL);
-			//TODO: implement using AlignedBufferMP
-			//fill tile from image; convert RGB to "luma/chroma"
+
+			//fill tile from image; convert RGB to "luma/chroma"
+			const float maxNoiseVarab = max(noisevarab_b,noisevarab_r);
 			if (isRAW) {//image is raw; use channel differences for chroma channels
 
-				if(!perf){//lab mode
+				if(!denoiseMethodRgb){//lab mode
 						//modification Jacques feb 2013 and july 2014
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
-				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+				for (int i=tiletop; i<tilebottom; i++) {
 					int i1 = i - tiletop;
-					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+					for (int j=tileleft; j<tileright; j++) {
 						int j1 = j - tileleft;
 						float R_ = gain*src->r(i,j);
 						float G_ = gain*src->g(i,j);
 						float B_ = gain*src->b(i,j);
 
-						//modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10
-						//we can put other as gamma g=2.6 slope=11, etc.
-						// but noting to do with real gamma !!!: it's only for data Lab # data RGB
-						//finally I opted fot gamma55 and with options we can change
-						if (gamlab == 0) {// options 12/2013
-							R_ = Color::igammatab_26_11[R_];
-							G_ = Color::igammatab_26_11[G_];
-							B_ = Color::igammatab_26_11[B_];
-						}
-						else if (gamlab == 1) {
-						//other new gamma 4 5 
-							R_ = Color::igammatab_4[R_];
-							G_ = Color::igammatab_4[G_];
-							B_ = Color::igammatab_4[B_];
-						}
-						else if (gamlab == 2) {							
-						//new gamma 5.5 10 better for detail luminance..it is a compromise...which depends on the image (distribution BL, ML, HL ...)
-							R_ = Color::igammatab_55[R_];
-							G_ = Color::igammatab_55[G_];
-							B_ = Color::igammatab_55[B_];
-						}
+						R_ = (*denoiseigamtab)[R_];
+						G_ = (*denoiseigamtab)[G_];
+						B_ = (*denoiseigamtab)[B_];
+
 						//apply gamma noise	standard (slider)
 						R_ = R_<65535.0f ? gamcurve[R_] : (Color::gammanf(R_/65535.f, gam)*32768.0f);
 						G_ = G_<65535.0f ? gamcurve[G_] : (Color::gammanf(G_/65535.f, gam)*32768.0f);
 						B_ = B_<65535.0f ? gamcurve[B_] : (Color::gammanf(B_/65535.f, gam)*32768.0f);
 						
 						//true conversion xyz=>Lab
-						float L,a,b;
 						float X,Y,Z;
 						Color::rgbxyz(R_,G_,B_,X,Y,Z,wp);
 
 						//convert to Lab
+						float L,a,b;
 						Color::XYZ2Lab(X, Y, Z, L, a, b);
 
 						labdn->L[i1][j1] = L;
@@ -637,28 +615,9 @@ if(settings->leveldnti ==1) {
 					
 						Lin[i1][j1] = L;
 
-						if(((i1|j1)&1) == 0) {
-							if(noiseLCurve) {
-								float kN = lumcalc[i>>1][j>>1]; //with no gamma and take into account working profile
-								float epsi = 0.01f;
-								if(kN<2.f)
-									kN = 2.f;//avoid divided by zero
-								if(kN>32768.f)
-									kN = 32768.f;	// not strictly necessary							
-								float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f];
-								float ki = kinterm*100.f;
-								ki += noiseluma;
-								noisevarlum[i1>>1][j1>>1] = SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							if(noiseCCurve) {
-								float aN = acalc[i>>1][j>>1];
-								float bN = bcalc[i>>1][j>>1];
-								float cN = sqrtf(SQR(aN)+SQR(bN));
-								if(cN < 100.f)
-									cN = 100.f;//avoid divided by zero
-								float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];//C=f(C)
-								noisevarchrom[i1>>1][j1>>1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							}
+						if(((i1|j1)&1) == 0) {
+							noisevarlum[(i1>>1)*width2+(j1>>1)] = useNoiseLCurve ? lumcalc[i>>1][j>>1] : noisevarL;
+							noisevarchrom[(i1>>1)*width2+(j1>>1)] = useNoiseCCurve ? maxNoiseVarab*ccalc[i>>1][j>>1] : 1.f;
 						}
 						//end chroma
 
@@ -669,9 +628,9 @@ if(settings->leveldnti ==1) {
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
-				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+				for (int i=tiletop; i<tilebottom; i++) {
 					int i1 = i - tiletop;
-					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+					for (int j=tileleft; j<tileright; j++) {
 						int j1 = j - tileleft;
 
 						float X = gain*src->r(i,j);
@@ -681,36 +640,17 @@ if(settings->leveldnti ==1) {
 						X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 						Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
 						Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-						if(((i1|j1)&1) == 0) {
-							if(noiseLCurve) {
-							//	float noiseluma=(float) dnparams.luma;	
-								float kN = lumcalc[i>>1][j>>1];
-								float epsi = 0.01f;
-								if (kN<2.f) 
-									kN = 2.f;
-								if (kN>32768.f)
-									kN = 32768.f;
-								float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f];
-								float ki = kinterm*100.f;
-								ki += noiseluma;
-								noisevarlum[i1>>1][j1>>1] = SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							if(noiseCCurve) {
-								float aN = acalc[i>>1][j>>1];
-								float bN = bcalc[i>>1][j>>1];
-								float cN = sqrtf(SQR(aN)+SQR(bN));
-								if(cN < 100.f)
-									cN = 100.f;//avoid divided by zero
-								float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];
-								noisevarchrom[i1>>1][j1>>1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							}
-						}
 						//end chroma					
 						labdn->L[i1][j1] = Y;
 						labdn->a[i1][j1] = (X-Y);
 						labdn->b[i1][j1] = (Y-Z);
 
 						Lin[i1][j1] = Y;
+
+						if(((i1|j1)&1) == 0) {
+							noisevarlum[(i1>>1)*width2+(j1>>1)] = useNoiseLCurve ? lumcalc[i>>1][j>>1] : noisevarL;
+							noisevarchrom[(i1>>1)*width2+(j1>>1)] = useNoiseCCurve ? maxNoiseVarab*ccalc[i>>1][j>>1] : 1.f;
+						}
 					}
 				}
 
@@ -720,12 +660,12 @@ if(settings->leveldnti ==1) {
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
-				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+				for (int i=tiletop; i<tilebottom; i++) {
 					int i1 = i - tiletop;
-					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+					for (int j=tileleft; j<tileright; j++) {
 						int j1 = j - tileleft;
 						float L,a,b;
-						float rLum=src->r(i,j) ;//for luminance denoise curve
+						float rLum=src->r(i,j) ;//for denoise curves
 						float gLum=src->g(i,j) ;
 						float bLum=src->b(i,j) ;
 						
@@ -746,15 +686,21 @@ if(settings->leveldnti ==1) {
 
 						//convert Lab
 						Color::XYZ2Lab(X, Y, Z, L, a, b);
-						float Llum,alum,blum;
+						labdn->L[i1][j1] = L;
+						labdn->a[i1][j1] = a;
+						labdn->b[i1][j1] = b;
+
+						Lin[i1][j1] = L;
+
 						if(((i1|j1)&1) == 0) {
-							if(noiseLCurve || noiseCCurve) {
+							float Llum,alum,blum;
+							if(useNoiseLCurve || useNoiseCCurve) {
 								float XL,YL,ZL;
 								Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
 								Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
 							}
 					
-							if(noiseLCurve) {
+							if(useNoiseLCurve) {
 								float kN = Llum;
 								float epsi=0.01f;
 								if(kN<2.f) kN=2.f;
@@ -762,33 +708,26 @@ if(settings->leveldnti ==1) {
 								float kinterm=epsi + noiseLCurve[xdivf(kN,15)*500.f];
 								float ki=kinterm*100.f;
 								ki+=noiseluma;
-								noiseluma += 1.f;
-								noisevarlum[i1>>1][j1>>1]=SQR((ki/125.f)*(1.f+ki/25.f));
-							}	
-							if(noiseCCurve) {
+								noisevarlum[(i1>>1)*width2+(j1>>1)]=SQR((ki/125.f)*(1.f+ki/25.f));
+							} else {
+								noisevarlum[(i1>>1)*width2+(j1>>1)] = noisevarL;
+							}
+							if(useNoiseCCurve) {
 								float aN=alum;
 								float bN=blum;
 								float cN=sqrtf(SQR(aN)+SQR(bN));
 								if(cN < 100.f)
 									cN=100.f;//avoid divided by zero ???
 								float Cinterm=1.f + ponderCC*4.f*noiseCCurve[cN/60.f];
-								noisevarchrom[i1>>1][j1>>1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							}
+								noisevarchrom[(i1>>1)*width2+(j1>>1)]= maxNoiseVarab*SQR(Cinterm);
+							} else {
+								noisevarchrom[(i1>>1)*width2+(j1>>1)] = 1.f;
+							}
 						}
-						labdn->L[i1][j1] = L;
-						labdn->a[i1][j1] = a;
-						labdn->b[i1][j1] = b;
-
-						Lin[i1][j1] = L;
 					}
 				}
 			}
 
-			//initial impulse denoise, removed in Issue 2557
-//				if (dnparams.luma>0.01) {
-//					impulse_nr (labdn, float(MIN(50.0,dnparams.luma))/20.0f);
-//				}
-
 			int datalen = labdn->W * labdn->H;
 
 			//now perform basic wavelet denoise
@@ -808,27 +747,24 @@ if(settings->leveldnti ==1) {
 				execwavelet=true;
 			if(settings->leveldnautsimpl==1 && (dnparams.Cmethod=="AUT" || dnparams.Cmethod=="PRE"))
 				autoch=true;
-			if(settings->leveldnautsimpl==0 && dnparams.C2method=="AUTO" || dnparams.C2method=="PREV")
+			if(settings->leveldnautsimpl==0 && (dnparams.C2method=="AUTO" || dnparams.C2method=="PREV"))
 				autoch=true;
 			
 			if(execwavelet) {//gain time if user choose only median  sliders L <=1  slider chrom master < 1
-			{ // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
-			wavelet_decomposition* Ldecomp;
-			wavelet_decomposition* adecomp;
-			wavelet_decomposition* bdecomp;
-			int schoice=0;//shrink method
-			if(dnparams.smethod=="shal") schoice=0;
-			else if(dnparams.smethod=="shalbi") schoice=2;
-
-			int levwav=5;
-			float maxreal = max(realred, realblue);
-			//increase the level of wavelet if user increase much or very much sliders
-			if( maxreal < 8.f) levwav=5;
-			else if( maxreal < 10.f)levwav=6;
-			else if( maxreal < 15.f)levwav=7;
-			else levwav=8;//maximum ==> I have increase Maxlevel in cplx_wavelet_dec.h from 8 to 9
-			if(schoice==2) levwav+=settings->nrwavlevel;//increase level for enhanced mode
-			if(levwav>8) levwav=8;
+				wavelet_decomposition* Ldecomp;
+				wavelet_decomposition* adecomp;
+				wavelet_decomposition* bdecomp;
+				
+				int levwav=5;
+				float maxreal = max(realred, realblue);
+				//increase the level of wavelet if user increase much or very much sliders
+				if( maxreal < 8.f) levwav=5;
+				else if( maxreal < 10.f)levwav=6;
+				else if( maxreal < 15.f)levwav=7;
+				else levwav=8;//maximum ==> I have increase Maxlevel in cplx_wavelet_dec.h from 8 to 9
+				if(nrQuality == QUALITY_HIGH)
+					levwav += settings->nrwavlevel;//increase level for enhanced mode
+				if(levwav>8) levwav=8;
 			
 			//	if (settings->verbose) printf("levwavelet=%i  noisevarA=%f noisevarB=%f \n",levwav, noisevarab_r, noisevarab_b );
 #ifdef _OPENMP
@@ -839,33 +775,36 @@ if(settings->leveldnti ==1) {
 #pragma omp section
 #endif
 {
-			Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 1/*subsampling*/ );
+				Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 1/*subsampling*/ );
 }
 #ifdef _OPENMP
 #pragma omp section
 #endif
 {
-			adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 );
+				adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 );
 }
 #ifdef _OPENMP
 #pragma omp section
 #endif
 {
-			bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
+				bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
 }
 }
-			if(schoice==0) WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );//enhance mode
-			if(schoice==2) {
-				WaveletDenoiseAll_BiShrink(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );//enhance mode
-				WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut ,redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );
+				if(nrQuality==QUALITY_STANDARD)
+					WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, noisevarab_r, noisevarab_b,labdn, useNoiseLCurve, useNoiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, autoch, denoiseMethodRgb );//enhance mode
+				else /*if(nrQuality==QUALITY_HIGH)*/ {
+					WaveletDenoiseAll_BiShrink(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, noisevarab_r, noisevarab_b,labdn, useNoiseLCurve, useNoiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, autoch, denoiseMethodRgb );//enhance mode
+					WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, noisevarab_r, noisevarab_b,labdn, useNoiseLCurve, useNoiseCCurve, chaut ,redaut, blueaut, maxredaut, maxblueaut, autoch, denoiseMethodRgb );
 				}
-			float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
-			//kall=0 call by Dcrop
-			resid=0.f;residred=0.f;residblue=0.f;
-			if(kall==0) Noise_residual(*Ldecomp, *adecomp, *bdecomp,  width, height, chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue, resid, residblue, residred, maxredresid, maxblueresid, nbresid);
-			//printf("NoiRESID=%3.1f maxR=%3.1f maxB=%3.1f red=%3.1f  blue=%3.1f\n",chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue);	
-			nresi=chresid;
-			highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
+					//kall=0 call by Dcrop
+				if(kall==0) {
+					float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
+					resid=0.f;residred=0.f;residblue=0.f;
+					Noise_residual(*Ldecomp, *adecomp, *bdecomp,  width, height, chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue, resid, residblue, residred, maxredresid, maxblueresid, nbresid, denoiseMethodRgb);
+					//printf("NoiRESID=%3.1f maxR=%3.1f maxB=%3.1f red=%3.1f  blue=%3.1f\n",chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue);	
+					nresi=chresid;
+					highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
+				}
 #ifdef _OPENMP
 #pragma omp parallel sections num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
@@ -874,34 +813,26 @@ if(settings->leveldnti ==1) {
 #pragma omp section
 #endif
 {
-			Ldecomp->reconstruct(labdn->data);
-			delete Ldecomp;
+				Ldecomp->reconstruct(labdn->data);
+				delete Ldecomp;
 }
 #ifdef _OPENMP
 #pragma omp section
 #endif
 {
-			adecomp->reconstruct(labdn->data+datalen);
-			delete adecomp;
+				adecomp->reconstruct(labdn->data+datalen);
+				delete adecomp;
 }
 #ifdef _OPENMP
 #pragma omp section
 #endif
 {
-			bdecomp->reconstruct(labdn->data+2*datalen);
-			delete bdecomp;
+				bdecomp->reconstruct(labdn->data+2*datalen);
+				delete bdecomp;
 }
 }
 			}
-			}
-			//TODO: at this point wavelet coefficients storage can be freed
-			//Issue 1680: Done now
 
-			//second impulse denoise, removed in Issue 2557
-//				if (dnparams.luma>0.01) {
-//					impulse_nr (labdn, MIN(50.0f,(float)dnparams.luma)/20.0f);
-//				}
-			//PF_correct_RT(dst, dst, defringe.radius, defringe.threshold);
 
 			
 			int metchoice=0;
@@ -911,7 +842,7 @@ if(settings->leveldnti ==1) {
 			else if(dnparams.methodmed=="Lpab") metchoice=4;
 			
 			//median on Luminance Lab only
-		if( (metchoice==1 || metchoice==2 || metchoice==3 || metchoice==4)   && dnparams.median) {
+		if( (metchoice==1 || metchoice==2 || metchoice==3 || metchoice==4) && dnparams.median) {
 		//printf("Lab et Lonly \n");
 			float** tmL;
 			int wid=labdn->W;
@@ -977,7 +908,7 @@ if(settings->leveldnti ==1) {
 #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
 					for (int i=1; i<hei-1; i++) {
-						float pp[9],results[5],temp;
+						float pp[9],temp;
 						if(methmedL == 0) 
 							for (int j=1; j<wid-1; j++) {
 								med2(labdn->L[i][j] ,labdn->L[i-1][j], labdn->L[i+1][j] ,labdn->L[i][j+1],labdn->L[i][j-1], tmL[i][j]);//3x3 soft
@@ -1249,14 +1180,13 @@ if(settings->leveldnti ==1) {
 			const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
 			const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
 
-			//const int nrtiles = numblox_W*numblox_H;
 			// end of tiling calc
 			{
 
 			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 			// Main detail recovery algorithm: Block loop
-				//DCT block data storage
+			//DCT block data storage
 
 
 #ifdef _OPENMP
@@ -1283,7 +1213,6 @@ if(settings->leveldnti ==1) {
 
 				int top = (vblk-blkrad)*offset;
 				float * datarow = pBuf +blkrad*offset;
-
 				for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
 					int row = top + i;
 					int rr = row;
@@ -1307,15 +1236,27 @@ if(settings->leveldnti ==1) {
 					//now fill this row of the blocks with Lab high pass data
 					for (int hblk=0; hblk<numblox_W; hblk++) {
 						int left = (hblk-blkrad)*offset;
-						int indx = (hblk)*TS;//index of block in malloc
-
-						for (int j=0; j<TS; j++) {
-							Lblox[(indx + i)*TS+j] = tilemask_in[i][j]*datarow[left+j];// luma data
-							if (top+i>=0 && top+i<height && left+j>=0 && left+j<width) {
+						int indx = (hblk)*TS;//index of block in malloc
+						if(top+i>=0 && top+i<height) {
+							int j;
+							for (j=0; j<min((-left),TS); j++) {
+								Lblox[(indx + i)*TS+j] = tilemask_in[i][j]*datarow[left+j];// luma data
+							}
+							for (; j<min(TS,width-left); j++) {
+								Lblox[(indx + i)*TS+j] = tilemask_in[i][j]*datarow[left+j];// luma data
 								totwt[top+i][left+j] += tilemask_in[i][j]*tilemask_out[i][j];
+							}
+							for (; j<TS; j++) {
+								Lblox[(indx + i)*TS+j] = tilemask_in[i][j]*datarow[left+j];// luma data
+							}
+						} else {
+							for (int j=0; j<TS; j++) {
+								Lblox[(indx + i)*TS+j] = tilemask_in[i][j]*datarow[left+j];// luma data
 							}
-						}
-					}
+						}
+
+					}
+
 				}//end of filling block row
 
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -1389,29 +1330,28 @@ if(settings->leveldnti ==1) {
 			}
 			//convert back to RGB and write to destination array
 			if (isRAW) {
-			if(!perf) {//Lab mode
-					realred /= 100.f;
-					realblue /= 100.f;
+			if(!denoiseMethodRgb) {//Lab mode
+				realred /= 100.f;
+				realblue /= 100.f;
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
 #endif
 				for (int i=tiletop; i<tilebottom; i++){
 					int i1 = i-tiletop;
-					float X,Y,Z,L,a,b;
-
 					for (int j=tileleft; j<tileright; j++) {
 						int j1=j-tileleft;
 						//modification Jacques feb 2013
 						//true conversion Lab==>xyz
-						L = labdn->L[i1][j1];
-						a = labdn->a[i1][j1];
-						b = labdn->b[i1][j1];
+						float L = labdn->L[i1][j1];
+						float a = labdn->a[i1][j1];
+						float b = labdn->b[i1][j1];
 						float c_h=SQR(a)+SQR(b);
 						if(c_h>9000000.f){
-							a *= 1.f + Qhigh*realred;
-							b *= 1.f + Qhigh*realblue;
+							a *= 1.f + qhighFactor*realred;
+							b *= 1.f + qhighFactor*realblue;
 						}
 						//convert XYZ
+						float X,Y,Z;
 						Color::Lab2XYZ(L, a, b, X, Y, Z);
 						//apply inverse gamma noise
 						float r_,g_,b_;
@@ -1420,22 +1360,12 @@ if(settings->leveldnti ==1) {
 						r_ = r_<32768.f ? igamcurve[r_] : (Color::gammanf(r_/32768.f, igam) * 65535.f);
 						g_ = g_<32768.f ? igamcurve[g_] : (Color::gammanf(g_/32768.f, igam) * 65535.f);
 						b_ = b_<32768.f ? igamcurve[b_] : (Color::gammanf(b_/32768.f, igam) * 65535.f);
-						//readapt arbitrary gamma (inverse from beginning)
-						if (gamlab == 0) {
-							r_ = Color::gammatab_26_11[r_];
-							g_ = Color::gammatab_26_11[g_];
-							b_ = Color::gammatab_26_11[b_];
-						}
-						else if (gamlab == 1) {
-							r_ = Color::gammatab_4[r_];
-							g_ = Color::gammatab_4[g_];
-							b_ = Color::gammatab_4[b_];
-						}
-						else if (gamlab == 2) {
-							r_ = Color::gammatab_55[r_];
-							g_ = Color::gammatab_55[g_];
-							b_ = Color::gammatab_55[b_];
-						}
+
+						//readapt arbitrary gamma (inverse from beginning)
+						r_ = (*denoisegamtab)[r_];
+						g_ = (*denoisegamtab)[g_];
+						b_ = (*denoisegamtab)[b_];
+
 						float factor = Vmask[i1]*Hmask[j1];
 
 						dsttmp->r(i,j) += factor*r_;
@@ -1448,17 +1378,16 @@ if(settings->leveldnti ==1) {
 				else {//RGB mode
 				for (int i=tiletop; i<tilebottom; i++){
 					int i1 = i-tiletop;
-					float X,Y,Z;
 					for (int j=tileleft; j<tileright; j++) {
 						int j1=j-tileleft;
 						float c_h=sqrt(SQR(labdn->a[i1][j1])+SQR(labdn->b[i1][j1]));
 						if(c_h>3000.f){
-							labdn->a[i1][j1]*=1.f + Qhigh*realred/100.f;
-							labdn->b[i1][j1]*=1.f + Qhigh*realblue/100.f;
+							labdn->a[i1][j1]*=1.f + qhighFactor*realred/100.f;
+							labdn->b[i1][j1]*=1.f + qhighFactor*realblue/100.f;
 						}
-						Y = labdn->L[i1][j1];
-						X = (labdn->a[i1][j1]) + Y;
-						Z = Y - (labdn->b[i1][j1]);
+						float Y = labdn->L[i1][j1];
+						float X = (labdn->a[i1][j1]) + Y;
+						float Z = Y - (labdn->b[i1][j1]);
 						
 
 						X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
@@ -1478,19 +1407,19 @@ if(settings->leveldnti ==1) {
 			} else {
 				for (int i=tiletop; i<tilebottom; i++){
 					int i1 = i-tiletop;
-					float X,Y,Z,L,a,b;
 					for (int j=tileleft; j<tileright; j++) {
 						int j1=j-tileleft;
 						//modification Jacques feb 2013
-						L = labdn->L[i1][j1];
-						a = labdn->a[i1][j1];
-						b = labdn->b[i1][j1];
+						float L = labdn->L[i1][j1];
+						float a = labdn->a[i1][j1];
+						float b = labdn->b[i1][j1];
 						float c_h=sqrt(SQR(a)+SQR(b));
 						if(c_h>3000.f){
-							a*=1.f + Qhigh*realred/100.f;
-							b*=1.f + Qhigh*realblue/100.f;
+							a*=1.f + qhighFactor*realred/100.f;
+							b*=1.f + qhighFactor*realblue/100.f;
 						}
 						
+						float X,Y,Z;
 						Color::Lab2XYZ(L, a, b, X, Y, Z);
 
 						float factor = Vmask[i1]*Hmask[j1];
@@ -1512,20 +1441,14 @@ if(settings->leveldnti ==1) {
 			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 			delete labdn;
-			delete [] mad_LL;
-			delete [] mad_aa;
-			delete [] mad_bb;
 
 		}//end of tile row
 	}//end of tile loop
-	for (int i=0; i<(tileheight+1)/2; i++)
-		delete [] noisevarlum[i];
 	delete [] noisevarlum;
-	for (int i=0; i<(tileheight+1)/2; i++)
-		delete [] noisevarchrom[i];
 	delete [] noisevarchrom;
 
 	}
+
 	for(int i=0;i<denoiseNestedLevels*numthreads;i++) {
 		fftwf_free(LbloxArray[i]);
 		fftwf_free(fLbloxArray[i]);
@@ -1551,7 +1474,7 @@ fftwf_destroy_plan( plan_forward_blox[0] );
 fftwf_destroy_plan( plan_backward_blox[0] );
 fftwf_destroy_plan( plan_forward_blox[1] );
 fftwf_destroy_plan( plan_backward_blox[1] );
-fftwf_cleanup();
+fftwf_cleanup();
 }
 
 
@@ -1605,15 +1528,16 @@ for(int iteration=1;iteration<=dnparams.passes;iteration++){
 	} else {
 #pragma omp for	
 		for (int i=2; i<hei-2; i++) {
-			float pp[25],temp;
-			if(methmed == 3)			
+			float pp[25];
+			if(methmed == 3) {
+				float temp;
 				for (int j=2; j<wid-2; j++) {
 					med5(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),source->r(i-1,j-1),source->r(i-1,j+1),source->r(i+1,j-1),source->r(i+1,j+1),
 					source->r(i-2,j),source->r(i+2,j),source->r(i,j+2),source->r(i,j-2),source->r(i-2,j-2),source->r(i-2,j+2),source->r(i+2,j-2),source->r(i+2,j+2),	
 					source->r(i-2,j+1),source->r(i+2,j+1),source->r(i-1,j+2),source->r(i-1,j-2),source->r(i-2,j-1),source->r(i+2,j-1),source->r(i+1,j+2),source->r(i+1,j-2),	
 					tm[i][j]);//5x5
-				}
-			else
+				}
+			} else
 				for (int j=2; j<wid-2; j++) {
 					pp[0]=source->r(i,j);pp[1]=source->r(i-1,j); pp[2]=source->r(i+1,j);pp[3]=source->r(i,j+1);pp[4]=source->r(i,j-1);pp[5]=source->r(i-1,j-1);pp[6]=source->r(i-1,j+1);
 					pp[7]=source->r(i+1,j-1);pp[8]=source->r(i+1,j+1);pp[9]=source->r(i+2,j);pp[10]=source->r(i-2,j);pp[11]=source->r(i,j+2);pp[12]=source->r(i,j-2);
@@ -1647,15 +1571,16 @@ for(int iteration=1;iteration<=dnparams.passes;iteration++){
 	} else {
 #pragma omp for	
 		for (int i=2; i<hei-2; i++) {
-			float pp[25],temp;
-			if(methmed == 3)							
+			float pp[25];
+			if(methmed == 3) {
+				float temp;
 				for (int j=2; j<wid-2; j++) {
 					med5(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),source->b(i-1,j-1),source->b(i-1,j+1),source->b(i+1,j-1),source->b(i+1,j+1),
 					source->b(i-2,j),source->b(i+2,j),source->b(i,j+2),source->b(i,j-2),source->b(i-2,j-2),source->b(i-2,j+2),source->b(i+2,j-2),source->b(i+2,j+2),	
 					source->b(i-2,j+1),source->b(i+2,j+1),source->b(i-1,j+2),source->b(i-1,j-2),source->b(i-2,j-1),source->b(i+2,j-1),source->b(i+1,j+2),source->b(i+1,j-2),	
 					tm[i][j]);//5x5
-				}
-			else
+				}
+			} else
 				for (int j=2; j<wid-2; j++) {
 					pp[0]=source->b(i,j);pp[1]=source->b(i-1,j); pp[2]=source->b(i+1,j);pp[3]=source->b(i,j+1);pp[4]=source->b(i,j-1);pp[5]=source->b(i-1,j-1);pp[6]=source->b(i-1,j+1);
 					pp[7]=source->b(i+1,j-1);pp[8]=source->b(i+1,j+1);pp[9]=source->b(i+2,j);pp[10]=source->b(i-2,j);pp[11]=source->b(i,j+2);pp[12]=source->b(i,j-2);
@@ -1691,15 +1616,16 @@ for(int iteration=1;iteration<=dnparams.passes;iteration++){
 	} else {
 #pragma omp for	
 		for (int i=2; i<hei-2; i++) {
-			float pp[25],temp;
-			if(methmed == 3)											
+			float pp[25];
+			if(methmed == 3) {
+				float temp;
 				for (int j=2; j<wid-2; j++) {
 					med5(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),source->g(i-1,j-1),source->g(i-1,j+1),source->g(i+1,j-1),source->g(i+1,j+1),
 					source->g(i-2,j),source->g(i+2,j),source->g(i,j+2),source->g(i,j-2),source->g(i-2,j-2),source->g(i-2,j+2),source->g(i+2,j-2),source->g(i+2,j+2),	
 					source->g(i-2,j+1),source->g(i+2,j+1),source->g(i-1,j+2),source->g(i-1,j-2),source->g(i-2,j-1),source->g(i+2,j-1),source->g(i+1,j+2),source->g(i+1,j-2),	
 					tm[i][j]);//5x5
-				}
-			else
+				}
+			} else
 				for (int j=2; j<wid-2; j++) {
 					pp[0]=source->g(i,j);pp[1]=source->g(i-1,j); pp[2]=source->g(i+1,j);pp[3]=source->g(i,j+1);pp[4]=source->g(i,j-1);pp[5]=source->g(i-1,j-1);pp[6]=source->g(i-1,j+1);
 					pp[7]=source->g(i+1,j-1);pp[8]=source->g(i+1,j+1);pp[9]=source->g(i+2,j);pp[10]=source->g(i-2,j);pp[11]=source->g(i,j+2);pp[12]=source->g(i,j-2);
@@ -1725,16 +1651,13 @@ for(int iteration=1;iteration<=dnparams.passes;iteration++){
 		
 }		
 	//end median
-	if(noiseLCurve || noiseCCurve) {
-			for (int i=0; i<(hei+1)/2; i++)
+	if(noiseLCurve || useNoiseCCurve) {
+			for (int i=0; i<(hei); i++)
 				delete [] lumcalc[i];
 			delete [] lumcalc;
-			for (int i=0; i<(hei+1)/2; i++)
-				delete [] acalc[i];
-			delete [] acalc;
-			for (int i=0; i<(hei+1)/2; i++)
-				delete [] bcalc[i];
-			delete [] bcalc;
+			for (int i=0; i<(hei); i++)
+				delete [] ccalc[i];
+			delete [] ccalc;
 			
 			}
 
@@ -1764,7 +1687,7 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 		__m128	noisevar_Ldetailv = _mm_set1_ps( noisevar_Ldetail );
 		__m128	onev = _mm_set1_ps( 1.0f );
 		for (int n=0; n<TS*TS; n+=4) {		//for DCT
-			tempv  = onev - xexpf( -SQRV( LVFU(nbrwt[n]))/noisevar_Ldetailv);
+			tempv  = onev - xexpf( -SQRV( LVF(nbrwt[n]))/noisevar_Ldetailv);
 			_mm_storeu_ps( &fLblox[blkstart+n], LVFU(fLblox[blkstart+n]) * tempv );
 		}//output neighbor averaged result
 #else
@@ -1909,10 +1832,10 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b,  int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid)
+void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b,  int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid, bool denoiseMethodRgb)
 {
 		int maxlvl = WaveletCoeffs_L.maxlevel();
-		const float eps = 0.01f;
+
 
 		float madaC,madbC;
 		for (int lvl=0; lvl<maxlvl; lvl++) {
@@ -1923,10 +1846,14 @@ void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wav
 
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			for (int dir=1; dir<4; dir++) {
-				madaC = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
-				madbC = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
-
+			for (int dir=1; dir<4; dir++) {
+				if(denoiseMethodRgb) {
+					madaC = SQR(MadRgb(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
+					madbC = SQR(MadRgb(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
+				} else {
+					madaC = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
+					madbC = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
+				}
 				resid +=(madaC+madbC);
 				residred+=madaC;
 				residblue+=madbC;
@@ -1944,7 +1871,7 @@ void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wav
 }
 
 SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
-													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf)
+													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, const bool useNoiseLCurve, const bool useNoiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, bool autoch, bool denoiseMethodRgb)
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
 		const float eps = 0.01f;
@@ -1990,7 +1917,7 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 
 			for (int dir=1; dir<4; dir++) {
-				if(!perf) {
+				if(!denoiseMethodRgb) {
 					madL[lvl][dir-1] = SQR(Mad(WavCoeffs_L[dir], Wlvl_L*Hlvl_L));
 					mada[lvl][dir-1] = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
 					madb[lvl][dir-1] = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
@@ -2018,51 +1945,37 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-			int callby=1;
 			if (lvl==maxlvl-1) {
 				ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, buffer, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
-						  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb,noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, perf, mada[lvl], madb[lvl], madL[lvl], true);
+						  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, noisevar_abr, noisevar_abb, noi, useNoiseLCurve, useNoiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, autoch, denoiseMethodRgb, mada[lvl], madb[lvl], madL[lvl], true);
 
 			} else {
 				//simple wavelet shrinkage
 				float * sfave = buffer[0]+32;
 				float * sfaved = buffer[3]+128;
 				float * blurBuffer = buffer[1]+64;
+
 				for (int dir=1; dir<4; dir++) {
 					float mad_Lr = madL[lvl][dir-1];
-					float mad_ar = noisevar_abr*mada[lvl][dir-1];
-					float mad_br = noisevar_abb*madb[lvl][dir-1];
-
-					if (!noiseCCurve || noiseCCurve.getSum() < 5.f ){
-						for (int i=0; i<Hlvl_ab; i++) {
-							for (int j=0; j<Wlvl_ab; j++) {
-								mad_aa[i*Wlvl_ab+j]=mad_ar*(noisevar_abr);
-								mad_bb[i*Wlvl_ab+j]=mad_br*(noisevar_abb);
-							}//noisevarchrom	
-						}
-					}	
+					float mad_ar = useNoiseCCurve ? noisevar_abr*mada[lvl][dir-1] : SQR(noisevar_abr)*mada[lvl][dir-1];
+					float mad_br = useNoiseCCurve ? noisevar_abb*madb[lvl][dir-1] : SQR(noisevar_abb)*madb[lvl][dir-1] ;
 
-					if (noiseCCurve  && noiseCCurve.getSum() > 5.f ){
-						for (int i=0; i<Hlvl_ab; i++) {
-							for (int j=0; j<Wlvl_ab; j++) {
-									mad_aa[i*Wlvl_ab+j]=mad_ar*(noisevarchrom[i][j]);
-									mad_bb[i*Wlvl_ab+j]=mad_br*(noisevarchrom[i][j]);
-							}//noisevarchrom
-						}
-					}
 					if (noisevar_abr>0.001f  || noisevar_abb>0.001f) {
 					
 #ifdef __SSE2__
-						__m128 onev = _mm_set1_ps(1.f);
+						__m128 onev = _mm_set1_ps(1.f);
+						__m128 mad_arv = _mm_set1_ps(mad_ar);
+						__m128 mad_brv = _mm_set1_ps(mad_br);
 						__m128 rmad_Lm9v = onev / _mm_set1_ps(mad_Lr * 9.f);
 						__m128 mad_av;
 						__m128 mad_bv;
 						__m128 mag_Lv, mag_av, mag_bv;
 						__m128 tempav, tempbv;
 						int coeffloc_ab;
-						for (coeffloc_ab=0; coeffloc_ab<Hlvl_ab*Wlvl_ab-3; coeffloc_ab+=4) {
-							mad_av = LVFU(mad_aa[coeffloc_ab]);
-							mad_bv = LVFU(mad_bb[coeffloc_ab]);
+						for (coeffloc_ab=0; coeffloc_ab<Hlvl_ab*Wlvl_ab-3; coeffloc_ab+=4) {
+							__m128 nvcv = LVFU(noisevarchrom[coeffloc_ab]);
+							mad_av = nvcv*mad_arv;
+							mad_bv = nvcv*mad_brv;
 						
 							tempav = LVFU(WavCoeffs_a[dir][coeffloc_ab]);
 							tempbv = LVFU(WavCoeffs_b[dir][coeffloc_ab]);
@@ -2078,8 +1991,8 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 							float mag_L = SQR(WavCoeffs_L[dir][coeffloc_ab ]);
 							float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
 							float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
-							WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
-							WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
+							WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/(noisevarchrom[coeffloc_ab]*mad_ar))-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
+							WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/(noisevarchrom[coeffloc_ab]*mad_br))-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
 						}//now chrominance coefficients are denoised
 #else
 						for (int i=0; i<Hlvl_ab; i++) {
@@ -2090,8 +2003,8 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 								float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
 								float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
 
-								WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
-								WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
+								WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/(noisevarchrom[coeffloc_ab]*mad_ar))-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
+								WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/(noisevarchrom[coeffloc_ab]*mad_br))-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
 
 							}
 						}//now chrominance coefficients are denoised
@@ -2099,37 +2012,23 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 					}
 
 
-					float levelFactor = mad_Lr*5.f/(lvl+1);
 					if (noisevar_L>0.00001f) {
-						if (!noiseLCurve || noiseLCurve.getSum() < 7.f ) {
-							for (int i=0; i<Hlvl_L; i++)
-								for (int j=0; j<Wlvl_L; j++) {
-									int coeffloc_L = i*Wlvl_L+j;
-									mad_LL[coeffloc_L] = noisevar_L*levelFactor;//noisevarlum
-								}
-							}
-						else { 	if (noiseLCurve && noiseLCurve.getSum() >= 7.f) {	
-							for (int i=0; i<Hlvl_L; i++)
-								for (int j=0; j<Wlvl_L; j++) {
-									int coeffloc_L = i*Wlvl_L+j;
-									mad_LL[coeffloc_L] = noisevarlum[i][j]*levelFactor;//noisevarlum
-								}
-							}
-						}
+						float levelFactor = mad_Lr*5.f/(lvl+1);
 #ifdef __SSE2__
 						__m128 mad_Lv;
 						__m128 ninev = _mm_set1_ps( 9.0f );
 						__m128 epsv = _mm_set1_ps(eps);
 						__m128 mag_Lv;
+						__m128 levelFactorv = _mm_set1_ps(levelFactor);
 						int coeffloc_L;
 						for (coeffloc_L=0; coeffloc_L<Hlvl_L*Wlvl_L-3; coeffloc_L+=4) {
-							mad_Lv = LVFU(mad_LL[coeffloc_L]);
+							mad_Lv = LVFU(noisevarlum[coeffloc_L]) * levelFactorv;
 							mag_Lv = SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L]));
 							_mm_storeu_ps(&sfave[coeffloc_L], mag_Lv / ( mag_Lv + mad_Lv * xexpf(-mag_Lv/(mad_Lv*ninev) )+ epsv));
 						}	
 						for (; coeffloc_L<Hlvl_L*Wlvl_L; coeffloc_L++) {
 							float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
-							sfave[coeffloc_L] = mag_L/(mag_L+mad_LL[coeffloc_L]*xexpf(-mag_L/(9.f*mad_LL[coeffloc_L]))+eps);
+							sfave[coeffloc_L] = mag_L/(mag_L+levelFactor*noisevarlum[coeffloc_L]*xexpf(-mag_L/(9.f*levelFactor*noisevarlum[coeffloc_L]))+eps);
 						}
 #else
 						for (int i=0; i<Hlvl_L; i++)
@@ -2137,7 +2036,7 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 
 								int coeffloc_L = i*Wlvl_L+j;
 								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
-								sfave[coeffloc_L] = mag_L/(mag_L+mad_LL[coeffloc_L]*xexpf(-mag_L/(9.f*mad_LL[coeffloc_L]))+eps);
+								sfave[coeffloc_L] = mag_L/(mag_L+levelFactor*noisevarlum[coeffloc_L]*xexpf(-mag_L/(9.f*levelFactor*noisevarlum[coeffloc_L]))+eps);
 							}
 #endif
 						boxblur(sfave, sfaved, blurBuffer, lvl+2, lvl+2, Wlvl_L, Hlvl_L);//increase smoothness by locally averaging shrinkage
@@ -2177,7 +2076,7 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 
 
 	void ImProcFunctions::WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
-											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut,float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf)//mod JD
+											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, const bool useNoiseLCurve, const bool useNoiseCCurve, float &chaut,float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, bool autoch, bool denoiseMethodRgb)//mod JD
 
 	{
 
@@ -2189,7 +2088,6 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			if(WaveletCoeffs_L.level_H(lvl) > maxHL)
 				maxHL = WaveletCoeffs_L.level_H(lvl);
 		}
-
 
 #ifdef _OPENMP
 #pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
@@ -2223,12 +2121,8 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 
-			int callby=0;
-			if(schoice==0) callby=0;
-			if(schoice==2) callby=1;
-			
 			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, buffer, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
-					  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, perf);
+					  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, noisevar_abr, noisevar_abb, noi, useNoiseLCurve, useNoiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, autoch, denoiseMethodRgb);
 
 		}
 		for(int i=4;i>=0;i--)
@@ -2239,8 +2133,8 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, float **buffer, int level, 
-									int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float * mad_LL, float * mad_aa, float * mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut,
-									float &maxredaut, float &maxblueaut, int callby, bool autoch, bool perf, float * madaa, float * madab, float * madaL, bool madCalculated )
+									int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, const bool useNoiseLCurve, const bool useNoiseCCurve, float &chaut, float &redaut, float &blueaut,
+									float &maxredaut, float &maxblueaut, bool autoch, bool denoiseMethodRgb, float * madaa, float * madab, float * madaL, bool madCalculated )
 
 									{
 		//simple wavelet shrinkage
@@ -2265,7 +2159,7 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 				madb = madab[dir-1];
 				madL = madaL[dir-1] ;
 			} else {
-				if(!perf) {
+				if(!denoiseMethodRgb) {
 					madL = SQR(Mad(WavCoeffs_L[dir], W_L*H_L));
 					mada = SQR(Mad(WavCoeffs_a[dir], W_ab*H_ab));
 					madb = SQR(Mad(WavCoeffs_b[dir], W_ab*H_ab));
@@ -2275,35 +2169,24 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 					madb = SQR(MadRgb(WavCoeffs_b[dir], W_ab*H_ab));
 				}
 			}
-		
-			if (!noiseCCurve || noiseCCurve.getSum() < 5.f ){
-				for (int i=0; i<H_ab; i++) {
-					for (int j=0; j<W_ab; j++) {
-						mad_aa[i*W_ab+j]=mada*(noisevar_abr);
-						mad_bb[i*W_ab+j]=madb*(noisevar_abb);
-					}//noisevarchrom	
-				}
-			}	
-			if (noiseCCurve  && noiseCCurve.getSum() > 5.f ){
-				for (int i=0; i<H_ab; i++) {
-					for (int j=0; j<W_ab; j++) {
-						mad_aa[i*W_ab+j]=mada*(noisevarchrom[i][j]);
-						mad_bb[i*W_ab+j]=madb*(noisevarchrom[i][j]);
-					}//noisevarchrom
-				}
-			}
-			
+
 			if (noisevar_abr>0.001f  ||  noisevar_abb>0.001f ) {
+				mada = useNoiseCCurve ? mada : mada * noisevar_abr;
+				madb = useNoiseCCurve ? madb : madb * noisevar_abb;
 #ifdef __SSE2__
-				__m128 onev = _mm_set1_ps(1.f);
-				__m128 rmadLm9v = onev / _mm_set1_ps(madL * 9.f);
+				__m128 onev = _mm_set1_ps(1.f);
+				__m128 mad_arv = _mm_set1_ps(mada);
+				__m128 mad_brv = _mm_set1_ps(madb);
+
+				__m128 rmadLm9v = onev / _mm_set1_ps(madL * 9.f);
 				__m128 mad_av ;
 				__m128 mad_bv;
 				__m128 mag_Lv, mag_av, mag_bv;
 				int coeffloc_ab;
 				for (coeffloc_ab=0; coeffloc_ab<H_ab*W_ab-3; coeffloc_ab+=4) {
-					mad_av = LVFU(mad_aa[coeffloc_ab]);
-					mad_bv = LVFU(mad_bb[coeffloc_ab]);
+					__m128 nvcv = LVFU(noisevarchrom[coeffloc_ab]);
+					mad_av = nvcv*mad_arv;
+					mad_bv = nvcv*mad_brv;
 
 					mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]);
 					mag_av = SQRV(LVFU(WavCoeffs_a[dir][coeffloc_ab]));
@@ -2317,10 +2200,9 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 					float mag_L = SQR(WavCoeffs_L[dir][coeffloc_ab]);
 					float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
 					float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
-					sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
-					sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
+					sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/(noisevarchrom[coeffloc_ab]*mada))-(mag_L/(9.f*madL))));
+					sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/(noisevarchrom[coeffloc_ab]*madb))-(mag_L/(9.f*madL))));
 				}//now chrominance coefficients are denoised
-
 #else
 				for (int i=0; i<H_ab; i++) {
 					for (int j=0; j<W_ab; j++) {
@@ -2329,15 +2211,14 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ]);
 						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
 						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
-						sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
-						sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
+						sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/(noisevarchrom[coeffloc_ab]*mada))-(mag_L/(9.f*madL))));
+						sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/(noisevarchrom[coeffloc_ab]*madb))-(mag_L/(9.f*madL))));
 					}
 				}//now chrominance coefficients are denoised
 #endif
 
 				boxblur(sfavea, sfavead, blurBuffer, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
 				boxblur(sfaveb, sfavebd, blurBuffer, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
-
 #ifdef __SSE2__
 				__m128 epsv = _mm_set1_ps(eps);
 				__m128 sfav, sfbv;
@@ -2378,42 +2259,29 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 			}
 			
 			if (noisevar_L>0.00001f) {
-				if (!noiseLCurve || noiseLCurve.getSum() < 7.f ){//under 7 quasi no action
-					for (int i=0; i<H_L; i++) {
-						for (int j=0; j<W_L; j++) {
-							mad_LL[i*W_L+j]=madL*(noisevar_L)*5/(level+1);
-						}//noisevarlum
-					}
-				}
-				else if (noiseLCurve && noiseLCurve.getSum() >= 7.f) {
-					float precalc = madL * 5.f / (float)(level+1);
-					for (int i=0; i<H_L; i++) {
-						for (int j=0; j<W_L; j++) {
-							mad_LL[i*W_L+j]=precalc*(noisevarlum[i][j]);
-						}//noisevarlum
-					}
-				}
+				float levelFactor = madL * 5.f / (float)(level+1);
 #ifdef __SSE2__
 				__m128	magv;
+				__m128 levelFactorv = _mm_set1_ps(levelFactor);
 				__m128  mad_Lv;
 				__m128	ninev = _mm_set1_ps( 9.0f );
 				__m128 	epsv = _mm_set1_ps( eps );
 				int i;
 				for (i=0; i<W_L*H_L-3; i+=4) {
-					mad_Lv = LVFU(mad_LL[i]);
+					mad_Lv = LVFU(noisevarlum[i]) * levelFactorv;
 					magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
 					_mm_storeu_ps( &sfave[i], magv / (magv + mad_Lv*xexpf(-magv/(ninev * mad_Lv)) + epsv));
 				}
 				// few remaining pixels
 				for (; i<W_L*H_L; i++) {
 					float mag = SQR(WavCoeffs_L[dir][i]);
-					sfave[i] = mag/(mag+mad_LL[i]*xexpf(-mag/(9*mad_LL[i]))+eps);
+					sfave[i] = mag/(mag+levelFactor*noisevarlum[i]*xexpf(-mag/(9*levelFactor*noisevarlum[i]))+eps);
 				}
 #else
 				for (int i=0; i<W_L*H_L; i++) {
 
 					float mag = SQR(WavCoeffs_L[dir][i]);
-					float shrinkfactor = mag/(mag+mad_LL[i]*xexpf(-mag/(9*mad_LL[i]))+eps);
+					float shrinkfactor = mag/(mag+levelFactor*noisevarlum[i]*xexpf(-mag/(9*levelFactor*noisevarlum[i]))+eps);
 					sfave[i] = shrinkfactor;
 				}
 #endif
@@ -2449,7 +2317,7 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 
 
 SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
-									int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float * mad_aa, float * mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut,
+									int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut,
 									float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,  bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
 									float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread) {
 
@@ -2465,7 +2333,7 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float **
 		float red_yel = 0.f;
 		float skin_c = 0.f;
 		int nsk = 0;
-		float nsk_nc = 0.f;
+
 		for (int i=0; i<H_ab; i++) {
 			for (int j=0; j<W_ab; j++) {
 				chro += noisevarchrom[i][j];
@@ -2541,7 +2409,7 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float **
 
 
 void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a,
-											wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,int schoice, bool autoch, 
+											wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,int schoice, bool autoch, 
 											float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread ){
 
 	int maxlvl = levwav;
@@ -2556,7 +2424,7 @@ void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &
 		float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 
 		ShrinkAll_info(WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_ab, Hlvl_ab,
-		  skip_ab, noisevarlum,  noisevarchrom, noisevarhue, width, height, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, 
+		  skip_ab, noisevarlum,  noisevarchrom, noisevarhue, width, height, noisevar_abr, noisevar_abb, noi, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, 
 		 autoch, schoice, lvl, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc, maxchred, maxchblue, minchred, minchblue, nb, chau, chred, chblue, perf, multiThread );
 
 	}
@@ -2592,9 +2460,9 @@ void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int l
 		reducdelta = (float)settings->nrhigh;
 
 	chaut = (chaut*Nb-maxmax)/(Nb-1);//suppress maximum for chaut calcul
-	if (redyel > 5000.f || skinc > 1000.f && nsknc < 0.4f  && chromina > 3000.f)
+	if ((redyel > 5000.f || skinc > 1000.f) && nsknc < 0.4f  && chromina > 3000.f)
 		chaut *= 0.45f;//reduct action in red zone, except skin for high / med chroma
-	else if (redyel>12000.f || skinc > 1200.f && nsknc < 0.3f && chromina > 3000.f)
+	else if ((redyel>12000.f || skinc > 1200.f) && nsknc < 0.3f && chromina > 3000.f)
 		chaut *= 0.3f;
 	
 	if (mode==0 || mode==2) {//Preview or Auto multizone
@@ -2664,7 +2532,7 @@ void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int l
 }
 	
 SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb,  float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &nresi, float &highresi, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread)
-{
+{
 	if ((settings->leveldnautsimpl==1 && dnparams.Cmethod == "MAN") || (settings->leveldnautsimpl==0 && dnparams.C2method == "MANU")) {
 		//nothing to do
 		return;
@@ -2678,11 +2546,12 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 	wid=provicalc->width;
 	TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
 
-	double wpi[3][3] = {
-		{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
-		{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
-		{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
-		};
+	const float wpi[3][3] = {
+		{static_cast<float>(wprofi[0][0]),static_cast<float>(wprofi[0][1]),static_cast<float>(wprofi[0][2])},
+		{static_cast<float>(wprofi[1][0]),static_cast<float>(wprofi[1][1]),static_cast<float>(wprofi[1][2])},
+		{static_cast<float>(wprofi[2][0]),static_cast<float>(wprofi[2][1]),static_cast<float>(wprofi[2][2])}
+		};
+
 	lumcalc = new float*[hei];
 	for (int i=0; i<hei; i++)
 		lumcalc[i] = new float[wid];
@@ -2742,14 +2611,11 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-
-	//DCT block data storage
 	TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
-
-	double wp[3][3] = {
-		{wprof[0][0],wprof[0][1],wprof[0][2]},
-		{wprof[1][0],wprof[1][1],wprof[1][2]},
-		{wprof[2][0],wprof[2][1],wprof[2][2]}
+	const float wp[3][3] = {
+		{static_cast<float>(wprof[0][0]),static_cast<float>(wprof[0][1]),static_cast<float>(wprof[0][2])},
+		{static_cast<float>(wprof[1][0]),static_cast<float>(wprof[1][1]),static_cast<float>(wprof[1][2])},
+		{static_cast<float>(wprof[2][0]),static_cast<float>(wprof[2][1]),static_cast<float>(wprof[2][2])}
 	};
 
 	float chau=0.f;
@@ -2761,6 +2627,18 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 	float minchblue=100000000.f;
 	int nb=0;
 	int comptlevel=0;
+
+	// To avoid branches in loops we access the gammatabs by pointers
+	LUTf *denoiseigamtab;
+	switch(settings->denoiselabgamma) {
+		case 0:	 denoiseigamtab = &(Color::igammatab_26_11);
+				 break;
+		case 1:	 denoiseigamtab = &(Color::igammatab_4);
+				 break;
+		default: denoiseigamtab = &(Color::igammatab_55);
+				 break;
+	}
+
 
 	for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
 		for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
@@ -2770,18 +2648,16 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 			int width  = tileright-tileleft;
 			int height = tilebottom-tiletop;
 			LabImage * labdn = new LabImage(width,height);
-			float** noisevarlum = new float*[height];
-			for (int i=0; i<height; i++)
-				noisevarlum[i] = new float[width];
+			float** noisevarlum = new float*[(height+1)/2];
+			for (int i=0; i<(height+1)/2; i++)
+				noisevarlum[i] = new float[(width+1)/2];
 						
-			float* mad_aa = new float [height*width];
-			float* mad_bb = new float [height*width];
-			float** noisevarchrom = new float*[height];
-			for (int i=0; i<height; i++)
-				noisevarchrom[i] = new float[width];
-			float** noisevarhue = new float*[height];
-			for (int i=0; i<height; i++)
-				noisevarhue[i] = new float[width];
+			float** noisevarchrom = new float*[(height+1)/2];
+			for (int i=0; i<(height+1)/2; i++)
+				noisevarchrom[i] = new float[(width+1)/2];
+			float** noisevarhue = new float*[(height+1)/2];
+			for (int i=0; i<(height+1)/2; i++)
+				noisevarhue[i] = new float[(width+1)/2];
 			
 			// init luma noisevarL
 			float noisevarab_b, noisevarab_r;
@@ -2805,61 +2681,59 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 				realblue = 0.001f;
 			//TODO: implement using AlignedBufferMP
 			//fill tile from image; convert RGB to "luma/chroma"
-			float noiseluma=(float) dnparams.luma;
-			noiseluma += 1.f;
-			float noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));
+
 			if (isRAW) {//image is raw; use channel differences for chroma channels
 #ifdef _OPENMP
 #pragma omp parallel for if(multiThread)
 #endif
-					for (int i=tiletop; i<tilebottom; i++) {
+					for (int i=tiletop; i<tilebottom; i+=2) {
 						int i1 = i - tiletop;
 #ifdef __SSE2__
 						__m128 aNv,bNv;
 						__m128 c100v = _mm_set1_ps(100.f);
 						int j;
-						for (j=tileleft; j<tileright-3; j+=4) {
+						for (j=tileleft; j<tileright-7; j+=8) {
 							int j1 = j - tileleft;
-							aNv = LVFU(acalc[i][j]);
-							bNv = LVFU(bcalc[i][j]);
-							_mm_storeu_ps(&noisevarhue[i1][j1], xatan2f(bNv,aNv));
-							_mm_storeu_ps(&noisevarchrom[i1][j1], _mm_max_ps(c100v,_mm_sqrt_ps(SQRV(aNv)+SQRV(bNv))));
+							aNv = LVFU(acalc[i>>1][j>>1]);
+							bNv = LVFU(bcalc[i>>1][j>>1]);
+							_mm_storeu_ps(&noisevarhue[i1>>1][j1>>1], xatan2f(bNv,aNv));
+							_mm_storeu_ps(&noisevarchrom[i1>>1][j1>>1], _mm_max_ps(c100v,_mm_sqrt_ps(SQRV(aNv)+SQRV(bNv))));
 						}
-						for (; j<tileright; j++) {
+						for (; j<tileright; j+=2) {
 							int j1 = j - tileleft;
-							float aN = acalc[i][j];
-							float bN = bcalc[i][j];
+							float aN = acalc[i>>1][j>>1];
+							float bN = bcalc[i>>1][j>>1];
 							float cN = sqrtf(SQR(aN)+SQR(bN));
-							noisevarhue[i1][j1] = xatan2f(bN,aN);
+							noisevarhue[i1>>1][j1>>1] = xatan2f(bN,aN);
 							if(cN < 100.f)
 								cN=100.f;//avoid divided by zero
-							noisevarchrom[i1][j1] = cN;
+							noisevarchrom[i1>>1][j1>>1] = cN;
 						}
 #else
-						for (int j=tileleft; j<tileright; j++) {
+						for (int j=tileleft; j<tileright; j+=2) {
 							int j1 = j - tileleft;
-							float aN = acalc[i][j];
-							float bN = bcalc[i][j];
+							float aN = acalc[i>>1][j>>1];
+							float bN = bcalc[i>>1][j>>1];
 							float cN = sqrtf(SQR(aN)+SQR(bN));
 							float hN = xatan2f(bN,aN);
 							if(cN < 100.f)
 								cN = 100.f;//avoid divided by zero
-							noisevarchrom[i1][j1] = cN;
-							noisevarhue[i1][j1] = hN;
+							noisevarchrom[i1>>1][j1>>1] = cN;
+							noisevarhue[i1>>1][j1>>1] = hN;
 						}
 #endif
 					}
 #ifdef _OPENMP
 #pragma omp parallel for if(multiThread)
 #endif
-					for (int i=tiletop; i<tilebottom; i++) {
+					for (int i=tiletop; i<tilebottom; i+=2) {
 						int i1 = i - tiletop;
-						for (int j=tileleft; j<tileright; j++) {
+						for (int j=tileleft; j<tileright; j+=2) {
 							int j1 = j - tileleft;
-							float Llum = lumcalc[i][j];
+							float Llum = lumcalc[i>>1][j>>1];
 							Llum = Llum < 2.f ? 2.f : Llum; //avoid divided by zero ?
 							Llum = Llum > 32768.f ? 32768.f : Llum; // not strictly necessary							
-							noisevarlum[i1][j1]= Llum;
+							noisevarlum[i1>>1][j1>>1]= Llum;
 						}
 					}
 				if (!perf){//lab mode, modification Jacques feb 2013 and july 2014
@@ -2875,24 +2749,10 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 						float G_ = gain*src->g(i,j);
 						float B_ = gain*src->b(i,j);
 
-						//finally I opted fot gamma55 and with options we can change
-						if (gamlab == 0) {// options 12/2013
-							R_ = Color::igammatab_26_11[R_];
-							G_ = Color::igammatab_26_11[G_];
-							B_ = Color::igammatab_26_11[B_];
-						}
-						else if (gamlab == 1) {
-							//other new gamma 4 5 
-							R_ = Color::igammatab_4[R_];
-							G_ = Color::igammatab_4[G_];
-							B_ = Color::igammatab_4[B_];
-						}
-						else if (gamlab == 2) {							
-							//new gamma 5.5 10 better for detail luminance..it is a compromise...which depends on the image (distribution BL, ML, HL ...)
-							R_ = Color::igammatab_55[R_];
-							G_ = Color::igammatab_55[G_];
-							B_ = Color::igammatab_55[B_];
-						}
+						R_ = (*denoiseigamtab)[R_];
+						G_ = (*denoiseigamtab)[G_];
+						B_ = (*denoiseigamtab)[B_];
+
 						//apply gamma noise	standard (slider)
 						R_ = R_<65535.0f ? gamcurve[R_] : (Color::gamman((double)R_/65535.0, gam)*32768.0f);
 						G_ = G_<65535.0f ? gamcurve[G_] : (Color::gamman((double)G_/65535.0, gam)*32768.0f);
@@ -2958,25 +2818,24 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 
 						//convert Lab
 						Color::XYZ2Lab(X, Y, Z, L, a, b);
-						float Llum,alum,blum;
-						float XL,YL,ZL;
-						Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
-						Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
-						
-						float kN=Llum;
-						float epsi=0.01f;
 
-						if(kN<2.f) kN=2.f;
-						if(kN>32768.f) kN=32768.f;								
-						noisevarlum[i1][j1]=kN;
-						float aN=alum;
-						float bN=blum;
-						float hN=xatan2f(bN,aN);
-						float cN=sqrt(SQR(aN)+SQR(bN));
-						if(cN < 100.f) cN=100.f;//avoid divided by zero
-						noisevarchrom[i1][j1]=cN;
-						noisevarhue[i1][j1]=hN;
-
+						if(((i1|j1)&1) == 0) {
+							float Llum,alum,blum;
+							float XL,YL,ZL;
+							Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
+							Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
+							float kN=Llum;
+							if(kN<2.f) kN=2.f;
+							if(kN>32768.f) kN=32768.f;
+							noisevarlum[i1>>1][j1>>1]=kN;
+							float aN=alum;
+							float bN=blum;
+							float hN=xatan2f(bN,aN);
+							float cN=sqrt(SQR(aN)+SQR(bN));
+							if(cN < 100.f) cN=100.f;//avoid divided by zero
+							noisevarchrom[i1>>1][j1>>1]=cN;
+							noisevarhue[i1>>1][j1>>1]=hN;
+						}
 						labdn->a[i1][j1] = a;
 						labdn->b[i1][j1] = b;
 					}
@@ -3018,27 +2877,24 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
 }
 }
 			bool autoch = dnparams.autochroma;
-			if(comptlevel==0) WaveletDenoiseAll_info(levwav, *adecomp, *bdecomp, noisevarlum, noisevarchrom, noisevarhue, width, height, mad_aa, mad_bb, noisevarab_r, noisevarab_b, labdn, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, schoice, autoch, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc,maxchred, maxchblue, minchred, minchblue, nb,chau ,chred, chblue, perf, multiThread);//enhance mode
+			if(comptlevel==0) WaveletDenoiseAll_info(levwav, *adecomp, *bdecomp, noisevarlum, noisevarchrom, noisevarhue, width, height, noisevarab_r, noisevarab_b, labdn, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, schoice, autoch, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc,maxchred, maxchblue, minchred, minchblue, nb,chau ,chred, chblue, perf, multiThread);//enhance mode
 			comptlevel+=1;
-			float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
+			float chresid,chmaxredresid,chmaxblueresid;	
 			nresi=chresid;
 			highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
 			delete adecomp;
 			delete bdecomp;
 			delete labdn;
-			for (int i=0; i<height; i++)
+			for (int i=0; i<(height+1)/2; i++)
 				delete [] noisevarlum[i];
 			delete [] noisevarlum;
-			for (int i=0; i<height; i++)
+			for (int i=0; i<(height+1)/2; i++)
 				delete [] noisevarchrom[i];
 			delete [] noisevarchrom;
-			for (int i=0; i<height; i++)
+			for (int i=0; i<(height+1)/2; i++)
 				delete [] noisevarhue[i];
 			delete [] noisevarhue;
 			
-			delete [] mad_aa;
-			delete [] mad_bb;
-
 		}//end of tile row
 	}//end of tile loop
 
diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index d1dd9ad4c..c1f50d7a1 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -745,27 +745,27 @@ template<class T, class A> SSEFUNCTION void boxabsblur (T* src, A* dst, int radx
 		__m128	tempv,lenv,lenp1v,lenm1v,rlenv;
 		for (int col = 0; col < W-3; col+=4) {
 			lenv = leninitv;
-			tempv = LVFU(temp[0*W+col]);
+			tempv = LVF(temp[0*W+col]);
 			for (int i=1; i<=rady; i++) {
-				tempv = tempv + LVFU(temp[i*W+col]);
+				tempv = tempv + LVF(temp[i*W+col]);
 			}
 			tempv = tempv / lenv;
-			_mm_storeu_ps( &dst[0*W+col], tempv );
+			STVF(dst[0*W+col], tempv);
 			for (int row=1; row<=rady; row++) {
 				lenp1v = lenv + onev;
-				tempv = (tempv*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v;
-				_mm_storeu_ps( &dst[row*W+col],tempv);
+				tempv = (tempv*lenv + LVF(temp[(row+rady)*W+col]))/lenp1v;
+				STVF(dst[row*W+col],tempv);
 				lenv = lenp1v;
 			}
 			rlenv = onev / lenv;
 			for (int row = rady+1; row < H-rady; row++) {
-				tempv = tempv + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))*rlenv;
-				_mm_storeu_ps( &dst[row*W+col], tempv);
+				tempv = tempv + (LVF(temp[(row+rady)*W+col])- LVF(temp[(row-rady-1)*W+col]))*rlenv;
+				STVF(dst[row*W+col], tempv);
 			}
 			for (int row=H-rady; row<H; row++) {
 				lenm1v = lenv - onev;
-				tempv = (tempv*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v;
-				_mm_storeu_ps( &dst[row*W+col], tempv);
+				tempv = (tempv*lenv - LVF(temp[(row-rady-1)*W+col]))/lenm1v;
+				STVF(dst[row*W+col], tempv);
 				lenv = lenm1v;
 			}
 	}
diff --git a/rtengine/cplx_wavelet_dec.h b/rtengine/cplx_wavelet_dec.h
index 4547fb3bd..a39f09fce 100644
--- a/rtengine/cplx_wavelet_dec.h
+++ b/rtengine/cplx_wavelet_dec.h
@@ -37,7 +37,7 @@ namespace rtengine {
 
 	private:
 
-		static const int maxlevels = 9;//should be greater than any conceivable order of decimation
+		static const int maxlevels = 10;//should be greater than any conceivable order of decimation
 
 		int lvltot, subsamp;
 		size_t m_w, m_h;//dimensions
@@ -53,7 +53,7 @@ namespace rtengine {
 	public:
 
 		template<typename E>
-		wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling);
+		wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling, int skipcrop = 1);
 
 		~wavelet_decomposition();
 
@@ -72,11 +72,6 @@ namespace rtengine {
 			return wavelet_decomp[level]->height();
 		}
 
-		int level_pad(int level) const
-		{
-			return wavelet_decomp[level]->padding();
-		}
-
 		int level_stride(int level) const
 		{
 			return wavelet_decomp[level]->stride();
@@ -96,7 +91,7 @@ namespace rtengine {
 	};
 
 	template<typename E>
-	wavelet_decomposition::wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling)
+	wavelet_decomposition::wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling, int skipcrop)
 	: lvltot(0), subsamp(subsampling), m_w(width), m_h(height), coeff0(NULL)
 	{
 		//initialize wavelet filters
@@ -116,21 +111,20 @@ namespace rtengine {
 		// after coefficient rotation, data structure is:
 		// wavelet_decomp[scale][channel={lo,hi1,hi2,hi3}][pixel_array]
 
-		int padding = 0;//pow(2, maxlvl);//must be a multiple of two
 		lvltot=0;
 		E *buffer[2];
 		buffer[0] = new E[(m_w/2+1)*(m_h/2+1)];
 		buffer[1] = new E[(m_w/2+1)*(m_h/2+1)];
 		int bufferindex = 0;
 
-		wavelet_decomp[lvltot] = new wavelet_level<internal_type>(src, buffer[bufferindex^1], lvltot/*level*/, subsamp, padding/*padding*/, m_w, m_h, \
-																  wavfilt_anal, wavfilt_anal, wavfilt_len, wavfilt_offset);
+		wavelet_decomp[lvltot] = new wavelet_level<internal_type>(src, buffer[bufferindex^1], lvltot/*level*/, subsamp, m_w, m_h, \
+																  wavfilt_anal, wavfilt_anal, wavfilt_len, wavfilt_offset, skipcrop);
 		while(lvltot < maxlvl-1) {
 			lvltot++;
 			bufferindex ^= 1;
-			wavelet_decomp[lvltot] = new wavelet_level<internal_type>(buffer[bufferindex], buffer[bufferindex^1]/*lopass*/, lvltot/*level*/, subsamp, 0/*no padding*/, \
+			wavelet_decomp[lvltot] = new wavelet_level<internal_type>(buffer[bufferindex], buffer[bufferindex^1]/*lopass*/, lvltot/*level*/, subsamp, \
 																	  wavelet_decomp[lvltot-1]->width(), wavelet_decomp[lvltot-1]->height(), \
-																	  wavfilt_anal, wavfilt_anal, wavfilt_len, wavfilt_offset);
+																	  wavfilt_anal, wavfilt_anal, wavfilt_len, wavfilt_offset, skipcrop);
 		}
 		coeff0 = buffer[bufferindex^1];
 		delete[] buffer[bufferindex];
diff --git a/rtengine/cplx_wavelet_level.h b/rtengine/cplx_wavelet_level.h
index 874476c9a..dabd8c8ff 100644
--- a/rtengine/cplx_wavelet_level.h
+++ b/rtengine/cplx_wavelet_level.h
@@ -31,9 +31,6 @@ namespace rtengine {
 	class wavelet_level
 	{
 
-		// size of padded border
-		size_t m_pad;
-
 		// level of decomposition
 		int lvl;
 
@@ -49,19 +46,27 @@ namespace rtengine {
 
 		// load a row/column of input data, possibly with padding
 
-		void AnalysisFilterHaarVertical (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen, int row);
-		void AnalysisFilterHaarHorizontal (T * srcbuffer, T * dstLo, T * dstHi, int srclen, int row);
-		void SynthesisFilterHaarHorizontal (T * srcLo, T * srcHi, T * dst, int dstlen);
-		void SynthesisFilterHaarVertical (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen);
+		void AnalysisFilterHaarVertical (const T * const srcbuffer, T * dstLo, T * dstHi, const int width, const int height, const int row);
+		void AnalysisFilterHaarHorizontal (const T * const srcbuffer, T * dstLo, T * dstHi, const int width, const int row);
+		void SynthesisFilterHaarHorizontal (const T * const srcLo, const T * const srcHi, T * dst, const int width, const int height);
+		void SynthesisFilterHaarVertical (const T * const srcLo, const T * const srcHi, T * dst, const int width, const int height);
 
 		void AnalysisFilterSubsampHorizontal (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen, int m_w2, int row);
+							 const int taps, const int offset, const int srcwidth, const int dstwidth, const int row);
+#ifdef __SSE2__
+		void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float (*filterLo)[4], float (*filterHi)[4],
+							 const int taps, const int offset, const int width, const int height, const int row);
+#else
 		void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen, int row);
+							 int const taps, const int offset, const int width, const int height, const int row);
+#endif
 		void SynthesisFilterSubsampHorizontal (T * srcLo, T * srcHi, T * dst,
-							  float *filterLo, float *filterHi, int taps, int offset, int dstlen);
-		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen);
-
+							  float *filterLo, float *filterHi, const int taps, const int offset, const int scrwidth, const int dstwidth, const int height);
+#ifdef __SSE2__
+		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float (*filterLo)[4], float (*filterHi)[4], const int taps, const int offset, const int width, const int srcheight, const int dstheight);
+#else
+		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, const int taps, const int offset, const int width, const int srcheight, const int dstheight);
+#endif
 	public:
 
 		T ** wavcoeffs;
@@ -72,18 +77,20 @@ namespace rtengine {
 		size_t m_w2, m_h2;
 
 		template<typename E>
-		wavelet_level(E * src, E * dst, int level, int subsamp, int padding, size_t w, size_t h, float *filterV, float *filterH, int len, int offset)
-		: m_w(w), m_h(h), m_w2(w), m_h2(h), m_pad(padding), wavcoeffs(NULL), lvl(level), skip(1<<level), subsamp_out((subsamp>>level)&1)
+		wavelet_level(E * src, E * dst, int level, int subsamp, size_t w, size_t h, float *filterV, float *filterH, int len, int offset, int skipcrop)
+		: lvl(level), subsamp_out((subsamp>>level)&1), skip(1<<level), wavcoeffs(NULL), m_w(w), m_h(h), m_w2(w), m_h2(h)
 		{
 			if (subsamp) {
 				skip = 1;
 				for (int n=0; n<level; n++) {
 					skip *= 2-((subsamp>>n)&1);
-				}
+				}
+				skip /= skipcrop;
+				if(skip < 1) skip=1;
+
 			}
-			m_w2 = (subsamp_out ? ((w+1+2*skip*padding)/2) : (w+2*skip*padding));
-			m_h2 = (subsamp_out ? ((h+1+2*skip*padding)/2) : (h+2*skip*padding));
-			m_pad= skip*padding;
+			m_w2 = (subsamp_out ? (w+1)/2 : w);
+			m_h2 = (subsamp_out ? (h+1)/2 : h);
 			
 			wavcoeffs = create((m_w2)*(m_h2));
 			decompose_level(src, dst, filterV, filterH, len, offset);
@@ -115,11 +122,6 @@ namespace rtengine {
 			return m_h2;
 		}
 
-		size_t padding() const
-		{
-			return m_pad/skip;
-		}
-
 		size_t stride() const
 		{
 			return skip;
@@ -151,33 +153,33 @@ namespace rtengine {
 	}
 
 	template<typename T>
-	void wavelet_level<T>::AnalysisFilterHaarHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int srclen, int row) {
+	void wavelet_level<T>::AnalysisFilterHaarHorizontal (const T * const RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, const int width, const int row) {
 		/* Basic convolution code
 		 * Applies a Haar filter 
 		*/							
-			for(int i = 0; i < (srclen - skip); i++) {
-				dstLo[row*srclen+i] = (srcbuffer[i] + srcbuffer[i+skip]);
-				dstHi[row*srclen+i] = (srcbuffer[i] - srcbuffer[i+skip]);
+			for(int i = 0; i < (width - skip); i++) {
+				dstLo[row*width+i] = (srcbuffer[i] + srcbuffer[i+skip]);
+				dstHi[row*width+i] = (srcbuffer[i] - srcbuffer[i+skip]);
 			}
-			for(size_t i = max(srclen-skip,skip); i < (srclen); i++) {
-				dstLo[row*srclen+i] = (srcbuffer[i] + srcbuffer[i-skip]);
-				dstHi[row*srclen+i] = (srcbuffer[i] - srcbuffer[i-skip]);
+			for(size_t i = max(width-skip,skip); i < (width); i++) {
+				dstLo[row*width+i] = (srcbuffer[i] + srcbuffer[i-skip]);
+				dstHi[row*width+i] = (srcbuffer[i] - srcbuffer[i-skip]);
 			}
 	}
 
-	template<typename T> void wavelet_level<T>::AnalysisFilterHaarVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen, int row) {
+	template<typename T> void wavelet_level<T>::AnalysisFilterHaarVertical (const T * const RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, const int width, const int height, const int row) {
 	/* Basic convolution code
 	 * Applies a Haar filter 
 	*/
-		if(row < (srclen - skip)) {
-			for(int j=0;j<pitch;j++) {
-				dstLo[j] = 0.25f*(srcbuffer[row*pitch+j] + srcbuffer[(row+skip)*pitch+j]);
-				dstHi[j] = 0.25f*(srcbuffer[row*pitch+j] - srcbuffer[(row+skip)*pitch+j]);
+		if(row < (height - skip)) {
+			for(int j=0;j<width;j++) {
+				dstLo[j] = 0.25f*(srcbuffer[row*width+j] + srcbuffer[(row+skip)*width+j]);
+				dstHi[j] = 0.25f*(srcbuffer[row*width+j] - srcbuffer[(row+skip)*width+j]);
 			}
-		} else if(row>=max(srclen-skip,skip)) {
-			for(int j=0;j<pitch;j++) {
-				dstLo[j] = 0.25f*(srcbuffer[row*pitch+j] + srcbuffer[(row-skip)*pitch+j]);
-				dstHi[j] = 0.25f*(srcbuffer[row*pitch+j] - srcbuffer[(row-skip)*pitch+j]);
+		} else if(row>=max(height-skip,skip)) {
+			for(int j=0;j<width;j++) {
+				dstLo[j] = 0.25f*(srcbuffer[row*width+j] + srcbuffer[(row-skip)*width+j]);
+				dstHi[j] = 0.25f*(srcbuffer[row*width+j] - srcbuffer[(row-skip)*width+j]);
 			}
 		}
 	}
@@ -185,43 +187,43 @@ namespace rtengine {
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
 	
-	template<typename T> void wavelet_level<T>::SynthesisFilterHaarHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int dstlen) {
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarHorizontal (const T * const RESTRICT srcLo, const T * const RESTRICT srcHi, T * RESTRICT dst, const int width, const int height) {
 
 		/* Basic convolution code
 		 * Applies a Haar filter 
 		 *
 		 */
 
-		for (int k=0; k<m_h2; k++) {
-			for(size_t i = (m_pad); i < (m_pad+skip); i++) {
-				dst[k*m_w+(i-m_pad)] = (srcLo[k*m_w2+i] + srcHi[k*m_w2+i]);			
+		for (int k=0; k<height; k++) {
+			for(size_t i = 0; i < skip; i++) {
+				dst[k*width+i] = (srcLo[k*width+i] + srcHi[k*width+i]);			
 			}
-			for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
-				dst[k*m_w+(i-m_pad)] = 0.5f*(srcLo[k*m_w2+i] + srcHi[k*m_w2+i] + srcLo[k*m_w2+i-skip] - srcHi[k*m_w2+i-skip]);			
+			for(size_t i = skip; i < width; i++) {
+				dst[k*width+i] = 0.5f*(srcLo[k*width+i] + srcHi[k*width+i] + srcLo[k*width+i-skip] - srcHi[k*width+i-skip]);			
 			}
 		}
 	}
 
-	template<typename T> void wavelet_level<T>::SynthesisFilterHaarVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int pitch, int dstlen) {
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarVertical (const T * const RESTRICT srcLo, const T * const RESTRICT srcHi, T * RESTRICT dst, const int width, const int height) {
 
 		/* Basic convolution code
 		 * Applies a Haar filter 
 		 *
 		 */
 
-		for(size_t i = (m_pad); i < (m_pad+skip); i++) {
-			for(int j=0;j<pitch;j++)
-				dst[pitch*(i-m_pad)+j] = (srcLo[i*pitch+j] + srcHi[i*pitch+j]);			
+		for(size_t i = 0; i < skip; i++) {
+			for(int j=0;j<width;j++)
+				dst[width*i+j] = (srcLo[i*width+j] + srcHi[i*width+j]);			
 		}
-		for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
-			for(int j=0;j<pitch;j++)
-				dst[pitch*(i-m_pad)+j] = 0.5f*(srcLo[i*pitch+j] + srcHi[i*pitch+j] + srcLo[(i-skip)*pitch+j] - srcHi[(i-skip)*pitch+j]);			
+		for(size_t i = skip; i < height; i++) {
+			for(int j=0;j<width;j++)
+				dst[width*i+j] = 0.5f*(srcLo[i*width+j] + srcHi[i*width+j] + srcLo[(i-skip)*width+j] - srcHi[(i-skip)*width+j]);			
 		}
 	}
 
 	template<typename T>
-	void wavelet_level<T>::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *filterHi, 
-												int taps, int offset, int pitch, int srclen, int m_w2, int row) {
+	void wavelet_level<T>::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *RESTRICT filterHi, 
+												const int taps, const int offset, const int srcwidth, const int dstwidth, const int row) {
 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
 		 * aligning the 'offset' element of the filter with
@@ -229,62 +231,125 @@ namespace rtengine {
 		 * Output is subsampled by two
 		 */
 		// calculate coefficients
-			for(int i = 0; i < srclen; i+=2) {
-				float lo = 0.f, hi = 0.f;
-				if (LIKELY(i>skip*taps && i<srclen-skip*taps)) {//bulk
-					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-						float src = srcbuffer[i-l];
-						lo += filterLo[j] * src;//lopass channel
-						hi += filterHi[j] * src;//hipass channel
-					}
-				} else {
-					for (int j=0; j<taps; j++) {
-						int arg = max(0,min(i+skip*(offset-j),srclen-1));//clamped BC's
-						lo += filterLo[j] * srcbuffer[arg];//lopass channel
-						hi += filterHi[j] * srcbuffer[arg];//hipass channel
-					}
-				}
-				dstLo[row*m_w2+((i/2))] = lo;
-				dstHi[row*m_w2+((i/2))] = hi;
-			}
-	}
-
-	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, 
-													int taps, int offset, int pitch, int srclen, int row) {
-
-			/* Basic convolution code
-			 * Applies an FIR filter 'filter' with filter length 'taps', 
-			 * aligning the 'offset' element of the filter with
-			 * the input pixel, and skipping 'skip' pixels between taps 
-			 * Output is subsampled by two
-			 */
-
-			// calculate coefficients
-			if (LIKELY(row>skip*taps && row<srclen-skip*taps)) {//bulk
-				for (int k=0; k<pitch; k++) {
-					float lo = 0.f, hi = 0.f;
-					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-						lo += filterLo[j] * srcbuffer[(row-l)*pitch+k];//lopass channel
-						hi += filterHi[j] * srcbuffer[(row-l)*pitch+k];//hipass channel
-					}
-					dstLo[k] = lo;
-					dstHi[k] = hi;
-				}
-			} else {//boundary
-				for (int k=0; k<pitch; k++) {
-					float lo = 0.f, hi = 0.f;
-					for (int j=0; j<taps; j++) {
-						int arg = max(0,min(row+skip*(offset-j),srclen-1))*pitch+k;//clamped BC's
-						lo += filterLo[j] * srcbuffer[arg];//lopass channel
-						hi += filterHi[j] * srcbuffer[arg];//hipass channel
-					}
-					dstLo[k] = lo;
-					dstHi[k] = hi;
+		for(int i = 0; i < srcwidth; i+=2) {
+			float lo = 0.f, hi = 0.f;
+			if (LIKELY(i>skip*taps && i<srcwidth-skip*taps)) {//bulk
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					float src = srcbuffer[i-l];
+					lo += filterLo[j] * src;//lopass channel
+					hi += filterHi[j] * src;//hipass channel
+				}
+			} else {
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(i+skip*(offset-j),srcwidth-1));//clamped BC's
+					lo += filterLo[j] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j] * srcbuffer[arg];//hipass channel
 				}
 			}
+			dstLo[row*dstwidth+((i/2))] = lo;
+			dstHi[row*dstwidth+((i/2))] = hi;
+		}
 	}
 
-	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int dstlen) {
+#ifdef __SSE2__
+	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float (* RESTRICT filterLo)[4], float (* RESTRICT filterHi)[4],
+													const int taps, const int offset, const int width, const int height, const int row) {
+
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		if (LIKELY(row>skip*taps && row<height-skip*taps)) {//bulk
+			int k;
+			for (k=0; k<width-3; k+=4) {
+				__m128 lov = _mm_setzero_ps();
+				__m128 hiv = _mm_setzero_ps();
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					__m128 srcv = LVFU(srcbuffer[(row-l)*width+k]);
+					lov += LVF(filterLo[j][0]) * srcv;//lopass channel
+					hiv += LVF(filterHi[j][0]) * srcv;//hipass channel
+				}
+				STVF(dstLo[k], lov);
+				STVF(dstHi[k], hiv);
+			}
+			for (; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					lo += filterLo[j][0] * srcbuffer[(row-l)*width+k];//lopass channel
+					hi += filterHi[j][0] * srcbuffer[(row-l)*width+k];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		} else {//boundary
+			int k;
+			for (k=0; k<width-3; k+=4) {
+				__m128 lov = _mm_setzero_ps();
+				__m128 hiv = _mm_setzero_ps();
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					__m128 srcv = LVFU(srcbuffer[arg]);
+					lov += LVF(filterLo[j][0]) * srcv;//lopass channel
+					hiv += LVF(filterHi[j][0]) * srcv;//hipass channel
+				}
+				STVF(dstLo[k], lov);
+				STVF(dstHi[k], hiv);
+			}
+			for (; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					lo += filterLo[j][0] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j][0] * srcbuffer[arg];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		}
+	}
+#else
+	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, 
+													const int taps, const int offset, const int width, const int height, const int row) {
+
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		if (LIKELY(row>skip*taps && row<height-skip*taps)) {//bulk
+			for (int k=0; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					lo += filterLo[j] * srcbuffer[(row-l)*width+k];//lopass channel
+					hi += filterHi[j] * srcbuffer[(row-l)*width+k];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		} else {//boundary
+			for (int k=0; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					lo += filterLo[j] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j] * srcbuffer[arg];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		}
+	}
+#endif
+
+
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, const int taps, const int offset, const int srcwidth, const int dstwidth, const int height) {
 
 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
@@ -294,78 +359,157 @@ namespace rtengine {
 		 */
 
 		// calculate coefficients
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
 		int shift = skip*(taps-offset-1);//align filter with data
 
-		for (int k=0; k<m_h2; k++) {
-			for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
+		for (int k=0; k<height; k++) {
+			int i;	
+			for(i=0; i<=min(skip*taps,dstwidth); i++) {
 				float tot=0.f;
 				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
 				int i_src = (i+shift)/2;
 				int begin = (i+shift)%2;
-				if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
-					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						tot += ((filterLo[j] * srcLo[k*m_w2+i_src-l] + filterHi[j] * srcHi[k*m_w2+i_src-l]));
-					}
-				} else {//boundary
-					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						int arg = max(0,min((i_src-l),srclen-1));//clamped BC's
-						tot += ((filterLo[j] * srcLo[k*m_w2+arg] + filterHi[j] * srcHi[k*m_w2+arg]));
-					}
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					int arg = max(0,min((i_src-l),srcwidth-1));//clamped BC's
+					tot += ((filterLo[j] * srcLo[k*srcwidth+arg] + filterHi[j] * srcHi[k*srcwidth+arg]));
 				}
-				dst[k*m_w+(i-m_pad)] = tot;
+				dst[k*dstwidth+i] = tot;
+			}
+			for(; i<min(dstwidth-skip*taps,dstwidth); i++) {
+				float tot=0.f;
+				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+				int i_src = (i+shift)/2;
+				int begin = (i+shift)%2;
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					tot += ((filterLo[j] * srcLo[k*srcwidth+i_src-l] + filterHi[j] * srcHi[k*srcwidth+i_src-l]));
+				}
+				dst[k*dstwidth+i] = tot;
+			}
+			for(; i < dstwidth; i++) {
+				float tot=0.f;
+				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+				int i_src = (i+shift)/2;
+				int begin = (i+shift)%2;
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					int arg = max(0,min((i_src-l),srcwidth-1));//clamped BC's
+					tot += ((filterLo[j] * srcLo[k*srcwidth+arg] + filterHi[j] * srcHi[k*srcwidth+arg]));
+				}
+				dst[k*dstwidth+i] = tot;
 			}
 		}
 	}
+
+#ifdef __SSE2__
+	template<typename T> SSEFUNCTION void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float (* RESTRICT filterLo)[4], float (* RESTRICT filterHi)[4], const int taps, const int offset, const int width, const int srcheight, const int dstheight)
+	 {
 
-	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int pitch, int dstlen) {
-
-			/* Basic convolution code
-			 * Applies an FIR filter 'filter' with filter length 'taps', 
-			 * aligning the 'offset' element of the filter with
-			 * the input pixel, and skipping 'skip' pixels between taps 
-			 * Output is subsampled by two
-			 */
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
 
 		// calculate coefficients
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
 		int shift=skip*(taps-offset-1);//align filter with data
-
-		for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
+		__m128 fourv = _mm_set1_ps(4.f);
+		for(size_t i = 0; i < dstheight; i++) {
 			int i_src = (i+shift)/2;
 			int begin = (i+shift)%2;
 			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
-			if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
-				for (int k=0; k<pitch; k++) {
-					float tot = 0.f;
+			if (LIKELY(i>skip*taps && i<(dstheight-skip*taps))) {//bulk
+				int k;
+				for (k=0; k<width-3; k+=4) {
+					__m128 totv = _mm_setzero_ps();
 					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						tot += ((filterLo[j] * srcLo[(i_src-l)*pitch+k] + filterHi[j] * srcHi[(i_src-l)*pitch+k]));
+						totv += ((LVF(filterLo[j][0]) * LVFU(srcLo[(i_src-l)*width+k]) + LVF(filterHi[j][0]) * LVFU(srcHi[(i_src-l)*width+k])));
 					}
-					dst[pitch*(i-m_pad)+k] = 4.f * tot;
+					_mm_storeu_ps(&dst[width*i+k], fourv * totv);
 				}
-			} else {//boundary
-				for (int k=0; k<pitch; k++) {
+				for (; k<width; k++) {
 					float tot = 0.f;
 					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						int arg = max(0,min((i_src-l),srclen-1))*pitch+k;//clamped BC's
-						tot += ((filterLo[j] * srcLo[arg] + filterHi[j] * srcHi[arg]));
+						tot += ((filterLo[j][0] * srcLo[(i_src-l)*width+k] + filterHi[j][0] * srcHi[(i_src-l)*width+k]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			} else {//boundary
+				int k;
+				for (k=0; k<width-3; k+=4) {
+					__m128 totv = _mm_setzero_ps();
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						totv += ((LVF(filterLo[j][0]) * LVFU(srcLo[arg]) + LVF(filterHi[j][0]) * LVFU(srcHi[arg])));
 					}
-					dst[pitch*(i-m_pad)+k] = 4.f * tot;
+					_mm_storeu_ps(&dst[width*i+k], fourv * totv);
+				}
+				for (; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						tot += ((filterLo[j][0] * srcLo[arg] + filterHi[j][0] * srcHi[arg]));
+					}
+					dst[width*i+k] = 4.f * tot;
 				}
 			}
 		}
 	}
+#else
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, const int taps, const int offset, const int width, const int srcheight, const int dstheight)
+	 {
 
-	template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		int shift=skip*(taps-offset-1);//align filter with data
+
+		for(size_t i = 0; i < dstheight; i++) {
+			int i_src = (i+shift)/2;
+			int begin = (i+shift)%2;
+			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+			if (LIKELY(i>skip*taps && i<(dstheight-skip*taps))) {//bulk
+				for (int k=0; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						tot += ((filterLo[j] * srcLo[(i_src-l)*width+k] + filterHi[j] * srcHi[(i_src-l)*width+k]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			} else {//boundary
+				for (int k=0; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						tot += ((filterLo[j] * srcLo[arg] + filterHi[j] * srcHi[arg]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			}
+		}
+	}
+#endif
+
+#ifdef __SSE2__
+	template<typename T> template<typename E> SSEFUNCTION void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
 
 		T tmpLo[m_w] ALIGNED64;
 		T tmpHi[m_w] ALIGNED64;
 		/* filter along rows and columns */
 		if(subsamp_out) {
+			float filterVarray[2*taps][4] ALIGNED64;
+			for(int i=0;i<2*taps;i++) {
+				for(int j=0;j<4;j++) {
+					filterVarray[i][j] = filterV[i];
+				}
+			}
 			for(int row=0;row<m_h;row+=2) {
-				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/, row);
-				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2, row/2);
-				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2, row/2);
+				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterVarray, filterVarray+taps, taps, offset, m_w, m_h, row);
+				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
 			}
 		} else {
 			for(int row=0;row<m_h;row++) {
@@ -375,21 +519,64 @@ namespace rtengine {
 			}
 		}
 	}
+#else
+	template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
 
+		T tmpLo[m_w] ALIGNED64;
+		T tmpHi[m_w] ALIGNED64;
+		/* filter along rows and columns */
+		if(subsamp_out) {
+			for(int row=0;row<m_h;row+=2) {
+				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w, m_h, row);
+				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+			}
+		} else {
+			for(int row=0;row<m_h;row++) {
+				AnalysisFilterHaarVertical (src, tmpLo, tmpHi, m_w, m_h, row);
+				AnalysisFilterHaarHorizontal (tmpLo, dst, wavcoeffs[1], m_w, row);
+				AnalysisFilterHaarHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], m_w, row);
+			}
+		}
+	}
+#endif
+
+#ifdef __SSE2__
+
+	template<typename T> template<typename E> SSEFUNCTION void wavelet_level<T>::reconstruct_level(E* tmpLo, E* tmpHi, E * src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
+
+		/* filter along rows and columns */
+		if (subsamp_out) {
+			float filterVarray[2*taps][4] ALIGNED64;
+			for(int i=0;i<2*taps;i++) {
+				for(int j=0;j<4;j++) {
+					filterVarray[i][j] = filterV[i];
+				}
+			}
+			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterVarray, filterVarray+taps, taps, offset, m_w, m_h2, m_h);
+		} else {
+			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w, m_h2);
+			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w, m_h2);
+			SynthesisFilterHaarVertical (tmpLo, tmpHi, dst, m_w, m_h);
+		}
+	}
+#else
 	template<typename T> template<typename E> void wavelet_level<T>::reconstruct_level(E* tmpLo, E* tmpHi, E * src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
 
 		/* filter along rows and columns */
 		if (subsamp_out) {
-			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
-			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
-			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterV, filterV+taps, taps, offset, m_w/*pitch*/, m_h/*dstlen*/);
+			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterV, filterV+taps, taps, offset, m_w, m_h2, m_h);
 		} else {
-			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w);
-			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w);
+			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w, m_h2);
+			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w, m_h2);
 			SynthesisFilterHaarVertical (tmpLo, tmpHi, dst, m_w, m_h);
 		}
 	}
-	
+#endif
 };
 
 #endif
diff --git a/rtengine/dcp.cc b/rtengine/dcp.cc
index b067b2120..685ce6f38 100644
--- a/rtengine/dcp.cc
+++ b/rtengine/dcp.cc
@@ -287,7 +287,7 @@ void DCPProfile::MakeXYZCAM(ColorTemp &wb, double pre_mul[3], double camWbMatrix
     }
 
     // mix if we have two matrices
-    double mix;
+    double mix = 1.0;
     if ((hasCol1 && hasCol2) || (hasFwd1 && hasFwd2)) {
         double wbtemp;
         /* DNG ref way to convert XY to temperature, which affect matrix mixing. A different model here
diff --git a/rtengine/dcraw.h b/rtengine/dcraw.h
index 3bfda4c1f..804cfc49c 100644
--- a/rtengine/dcraw.h
+++ b/rtengine/dcraw.h
@@ -48,19 +48,19 @@ public:
     ,ifname(NULL)
     ,meta_data(NULL)
     ,shot_select(0),multi_out(0)
+	,float_raw_image(NULL)
     ,image(NULL)
     ,bright(1.),threshold(0.)
     ,half_size(0),four_color_rgb(0),document_mode(0),highlight(0)
     ,verbose(0)
+    ,use_auto_wb(0),use_camera_wb(0),use_camera_matrix(1)
+    ,output_color(1),output_bps(8),output_tiff(0),med_passes(0),no_auto_bright(0)
     ,RT_whitelevel_from_constant(0)
     ,RT_blacklevel_from_constant(0)
     ,RT_matrix_from_constant(0)
-    ,use_auto_wb(0),use_camera_wb(0),use_camera_matrix(1)
-    ,output_color(1),output_bps(8),output_tiff(0),med_passes(0),no_auto_bright(0)
 	,getbithuff(this,ifp,zero_after_ff)
 	,ph1_bithuff(this,ifp,order)	
 	,pana_bits(ifp,load_flags)
-	,float_raw_image(NULL)
     {
         memset(&hbd, 0, sizeof(hbd));
         aber[0]=aber[1]=aber[2]=aber[3]=1;
@@ -106,9 +106,9 @@ protected:
     int output_color, output_bps, output_tiff, med_passes;
     int no_auto_bright;
     unsigned greybox[4] ;
-    int RT_matrix_from_constant;
-    int RT_blacklevel_from_constant;
     int RT_whitelevel_from_constant;
+    int RT_blacklevel_from_constant;
+    int RT_matrix_from_constant;
 
     float cam_mul[4], pre_mul[4], cmatrix[3][4], rgb_cam[3][4];
     
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 5391a67f3..121b347c9 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -23,7 +23,6 @@
 #include "refreshmap.h"
 #include "rt_math.h"
 #include "colortemp.h"
-
 // "ceil" rounding
 #define SKIPS(a,b) ((a) / (b) + ((a) % (b) > 0))
 
@@ -158,14 +157,11 @@ void Crop::update (int todo) {
 		//	printf("x=%d y=%d crow=%d croh=%d skip=%d\n",rqcropx, rqcropy, rqcropw, rqcroph, skip);
 		//	printf("trafx=%d trafyy=%d trafwsk=%d trafHs=%d \n",trafx, trafy, trafw*skip, trafh*skip);
 			
-		int skipP=1;//force Skip for noise evaluation
-
 		Imagefloat *calclum = NULL;//for Luminance denoise curve
 		NoiseCurve noiseLCurve;
 		NoiseCurve noiseCCurve;
 		float autoNR = (float) settings->nrauto;//
 		float autoNRmax = (float) settings->nrautomax;//
-		float autohigh = (float) settings->nrhigh;//
 		
 		params.dirpyrDenoise.getCurves(noiseLCurve, noiseCCurve);
 
@@ -186,7 +182,6 @@ void Crop::update (int todo) {
 		parent->ipf.Tile_calc (tilesize, overlap, kall, widIm, heiIm, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
 		kall=0;
 		
-		int nbtl=numtiles_W*numtiles_H;
 		float *ch_M = new float [9];//allocate memory
 		float *max_r = new float [9];
 		float *max_b = new float [9];
@@ -212,71 +207,7 @@ void Crop::update (int todo) {
 				parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
 			}
 		}
-		/*
-		if(params.dirpyrDenoise.Cmethod=="PON") {
-			MyTime t1dce,t2dce;
-			t1dce.set();
 
-		int crW=100;//crop noise width
-		int crH=100;//crop noise height
-		Imagefloat *origCropPart;//init auto noise
-			origCropPart = new Imagefloat (crW, crH);//allocate memory
-		
-		int skipP=1;
-		if (skip==1 && params.dirpyrDenoise.enabled) {//evaluate Noise
-		
-			for(int wcr=0;wcr<numtiles_W;wcr++) {
-				for(int hcr=0;hcr<numtiles_H;hcr++) {
-			
-					int beg_tileW=wcr*tileWskip + tileWskip/2.f - crW/2.f;
-					int beg_tileH=hcr*tileHskip + tileHskip/2.f - crH/2.f;
-					PreviewProps ppP (beg_tileW , beg_tileH, crW, crH, skipP);
-					parent->imgsrc->getImage (parent->currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
-
-					Imagefloat *provi;
-					Imagefloat *provicalc;
-					provi = new Imagefloat (crW, crH);
-						if(origCropPart !=  provi)
-							origCropPart->copyData(provi);
-					provicalc = new Imagefloat (crW, crH);
-						if(origCropPart !=  provicalc)
-							origCropPart->copyData(provicalc);
-					parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
-					float maxr=0.f;
-					float maxb=0.f;
-
-					float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L;
-					chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f;chromina=0.f; sigma=0.f;
-					parent->ipf.RGB_denoise_info(provi, provi, provicalc, parent->imgsrc->isRAW(), params.dirpyrDenoise, params.defringe, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve,lldenoiseutili, noiseCCurve,ccdenoiseutili, chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L);
-					//printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
-					if(maxredaut > maxblueaut) {
-						maxr=(maxredaut-chaut)/(autoNRmax/2.f);
-						if(minblueaut <= minredaut  && minblueaut < chaut) maxb=(-chaut+minblueaut)/autoNRmax;
-					} 
-					else {
-						maxb=(maxblueaut-chaut)/(autoNRmax/2.f);
-						if(minredaut <= minblueaut  && minredaut < chaut) maxr=(-chaut+minredaut)/autoNRmax;
-					}//maxb mxr - empirical evaluation red / blue
-					
-					ch_M[hcr*numtiles_W + wcr]=chaut/autoNR;
-					max_r[hcr*numtiles_W + wcr]=maxr;
-					max_b[hcr*numtiles_W + wcr]=maxb;
-					
-					delete provi;
-				
-				}
-			}
-		
-		}
-		delete origCropPart; 
-		if (settings->verbose) {
-			t2dce.set();
-			printf("Info denoise ponderated performed in %d usec:\n", t2dce.etime(t1dce));
-			}
-		
-		}
-		*/
-		
 		if((settings->leveldnautsimpl==1 && params.dirpyrDenoise.Cmethod=="PRE") || (settings->leveldnautsimpl==0 && params.dirpyrDenoise.C2method=="PREV")) {
 			PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
 			parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
@@ -298,12 +229,12 @@ void Crop::update (int todo) {
 					if(abs(centerTile_Y[cc]-CenterPreview_Y) < minimuY) {minimuY=abs(centerTile_Y[cc]-CenterPreview_Y);poscenterY=cc;}
 				}
 			//	printf("TileCX=%d  TileCY=%d  prevX=%d  prevY=%d \n",centerTile_X[poscenterX],centerTile_Y[poscenterY],CenterPreview_X,CenterPreview_Y);
-				int crW,crH;	
-				if(settings->leveldnv ==0) {crW=100;crH=100;}
-				if(settings->leveldnv ==1) {crW=250;crH=250;}
+				int crW;	
+				if(settings->leveldnv ==0) {crW=100;}
+				if(settings->leveldnv ==1) {crW=250;}
 			//	if(settings->leveldnv ==2) {crW=int(tileWskip/2);crH=int((tileWskip/2));}//adapted to scale of preview
-				if(settings->leveldnv ==2) {crW=int(tileWskip/2);crH=int(tileHskip/2);}
-				if(settings->leveldnv ==3) {crW=tileWskip-10;crH=tileHskip-10;}
+				if(settings->leveldnv ==2) {crW=int(tileWskip/2);}
+				if(settings->leveldnv ==3) {crW=tileWskip-10;}
 
 				float adjustr=1.f;	
 				if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}
@@ -323,9 +254,17 @@ void Crop::update (int todo) {
 			 //   cropImageListener->setPosition (centerTile_X[poscenterX],centerTile_Y[poscenterY] , true);
 				//setCropSizes (centerTile_X[poscenterX], centerTile_Y[poscenterY], trafw*skip,trafh*skip , skip, true);
 
-				Imagefloat *provicalc = new Imagefloat (cropw, croph);//for Luminance denoise curve
-				origCrop->copyData(provicalc);
+				// we only need image reduced to 1/4 here
+				Imagefloat *provicalc = new Imagefloat ((cropw+1)/2, (croph+1)/2);//for denoise curves
+				for(int ii=0;ii<croph;ii+=2){
+					for(int jj=0;jj<cropw;jj+=2){
+						provicalc->r(ii>>1,jj>>1) = origCrop->r(ii,jj);
+						provicalc->g(ii>>1,jj>>1) = origCrop->g(ii,jj);
+						provicalc->b(ii>>1,jj>>1) = origCrop->b(ii,jj);
+					}
+				}
 				parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
+
 				float maxr=0.f;
 				float maxb=0.f;
 				float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc;
@@ -380,7 +319,6 @@ void Crop::update (int todo) {
 		//	if(settings->leveldnv ==2) {crW=int(tileWskip/2);crH=int((tileWskip/2));}//adapted to scale of preview
 			if(settings->leveldnv ==2) {crW=int(tileWskip/2);crH=int(tileHskip/2);}
 			if(settings->leveldnv ==3) {crW=tileWskip-10;crH=tileHskip-10;}
-			
 			float lowdenoise=1.f;
 			int levaut=settings->leveldnaut;
 			if(levaut==1)  //Standard
@@ -394,10 +332,8 @@ void Crop::update (int todo) {
 #pragma omp parallel			
 #endif
 {
-			Imagefloat *origCropPart;//init auto noise		
-			origCropPart = new Imagefloat (crW, crH);//allocate memory
-			Imagefloat *provicalc;
-			provicalc = new Imagefloat (crW, crH);			
+			Imagefloat *origCropPart = new Imagefloat (crW, crH);//allocate memory
+			Imagefloat *provicalc = new Imagefloat ((crW+1)/2, (crH+1)/2);//for denoise curves
 
 			int  coordW[3];//coordonate of part of image to mesure noise
 			int  coordH[3];
@@ -410,18 +346,22 @@ void Crop::update (int todo) {
 #endif
 			for(int wcr=0;wcr<=2;wcr++) {
 				for(int hcr=0;hcr<=2;hcr++) {
-					PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, skipP);
+					PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, 1);
 					parent->imgsrc->getImage (parent->currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
 
-					if(origCropPart !=  provicalc)
-						origCropPart->copyData(provicalc);
+					// we only need image reduced to 1/4 here
+					for(int ii=0;ii<crH;ii+=2){
+						for(int jj=0;jj<crW;jj+=2){
+							provicalc->r(ii>>1,jj>>1) = origCropPart->r(ii,jj);
+							provicalc->g(ii>>1,jj>>1) = origCropPart->g(ii,jj);
+							provicalc->b(ii>>1,jj>>1) = origCropPart->b(ii,jj);
+						}
+					}
 					parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
-					//float maxr=0.f;
-					//float maxb=0.f;
+
 					float pondcorrec=1.0f;
 					float chaut=0.f, redaut=0.f, blueaut=0.f, maxredaut=0.f, maxblueaut=0.f, minredaut=0.f, minblueaut=0.f, nresi=0.f, highresi=0.f, chromina=0.f, sigma=0.f, lumema=0.f, sigma_L=0.f, redyel=0.f, skinc=0.f, nsknc=0.f;
 					int nb=0;
-//						chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f; chromina=0.f; sigma=0.f;
 					parent->ipf.RGB_denoise_info(origCropPart, provicalc, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, params.dirpyrDenoise, parent->imgsrc->getDirPyrDenoiseExpComp(), chaut, nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
 
 				//printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
@@ -469,27 +409,22 @@ void Crop::update (int todo) {
 			else if (params.icm.working=="BestRGB")    {adjustr =1.f/1.2f;}
 			else if (params.icm.working=="BruceRGB")   {adjustr =1.f/1.2f;}
 
-			float maxmax;
-			float minmin;				
 			float delta[9];
 			int mode=1;
-			float redyel, skinc, nsknc;
 			int lissage=settings->leveldnliss;
 			for (int k=0;k<9;k++) {
-				maxmax=max(max_r[k],max_b[k]);				
+				float maxmax=max(max_r[k],max_b[k]);				
 				parent->ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax,lumL[k],chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]);
 			//	printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]);
 			}
 			for (int k=0;k<9;k++) {
 				if(max_r[k] > max_b[k]) {
-					minmin=min(min_r[k],min_b[k]);
 					Max_R[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
 					Min_B[k]= -(ch_M[k]- min_b[k])/(autoNRmax*multip*adjustr*lowdenoise);
 					Max_B[k]=0.f;
 					Min_R[k]=0.f;
 				} 
 				else {
-					minmin=min(min_r[k],min_b[k]);	
 					Max_B[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
 					Min_R[k]=- (ch_M[k]-min_r[k])	/ (autoNRmax*multip*adjustr*lowdenoise);
 					Min_B[k]=0.f;
@@ -555,14 +490,20 @@ void Crop::update (int todo) {
 		} else if(denoiseParams.Lmethod == "SLI")
 			noiseLCurve.Reset();
 
-
-
 		if((noiseLCurve || noiseCCurve ) && skip==1 && denoiseParams.enabled)	{//only allocate memory if enabled and skip
-			calclum = new Imagefloat (cropw, croph);//for Luminance denoise curve
-			origCrop->copyData(calclum);
+			// we only need image reduced to 1/4 here
+			calclum = new Imagefloat ((cropw+1)/2, (croph+1)/2);//for denoise curves
+			for(int ii=0;ii<croph;ii+=2){
+				for(int jj=0;jj<cropw;jj+=2){
+					calclum->r(ii>>1,jj>>1) = origCrop->r(ii,jj);
+					calclum->g(ii>>1,jj>>1) = origCrop->g(ii,jj);
+					calclum->b(ii>>1,jj>>1) = origCrop->b(ii,jj);
+				}
+			}
 		
 			parent->imgsrc->convertColorSpace(calclum, params.icm, parent->currWB, params.raw);//for denoise luminance curve
 		}
+
 		if(skip!=1) if(parent->adnListener) parent->adnListener->noiseChanged(0.f, 0.f);	
 
         if (todo & M_LINDENOISE) {
@@ -643,7 +584,6 @@ void Crop::update (int todo) {
                    baseCrop->b[(int)(xref/skip)][(int)(yref/skip)]/256,
                    parent->imgsrc->getGamma());
         }*/
-    double rrm, ggm, bbm;
     float satLimit = float(params.colorToning.satProtectionThreshold)/100.f*0.7f+0.3f;
     float satLimitOpacity = 1.f-(float(params.colorToning.saturatedOpacity)/100.f);
 
@@ -659,12 +599,13 @@ void Crop::update (int todo) {
         satLimitOpacity= 100.f*(moyS-0.85f*eqty);//-0.85 sigma==>20% pixels with low saturation
     }
 
-    if (todo & M_RGBCURVE)
+    if (todo & M_RGBCURVE) {
+	    double rrm, ggm, bbm;
         parent->ipf.rgbProc (baseCrop, laboCrop, this, parent->hltonecurve, parent->shtonecurve, parent->tonecurve, cshmap,
                              params.toneCurve.saturation, parent->rCurve, parent->gCurve, parent->bCurve, satLimit ,satLimitOpacity, parent->ctColorCurve, parent->ctOpacityCurve, parent->opautili, parent->clToningcurve,parent->cl2Toningcurve,
                              parent->customToneCurve1, parent->customToneCurve2, parent->beforeToneCurveBW, parent->afterToneCurveBW,rrm, ggm, bbm,
                              parent->bwAutoR, parent->bwAutoG, parent->bwAutoB);
-
+    }
     /*xref=000;yref=000;
     if (colortest && cropw>115 && croph>115)
     for(int j=1;j<5;j++){
@@ -744,7 +685,6 @@ void Crop::update (int todo) {
 
             if(settings->ciecamfloat) {
                 float d; // not used after this block
-				skip2=skip;
                 parent->ipf.ciecam_02float (cieCrop, float(adap), begh, endh, 1, 2, labnCrop, &params, parent->customColCurve1, parent->customColCurve2, parent->customColCurve3,
                                             dummy, dummy, parent->CAMBrightCurveJ, parent->CAMBrightCurveQ, parent->CAMMean, 5, 1, execsharp, d, skip, 1);			
             }
diff --git a/rtengine/dcrop.h b/rtengine/dcrop.h
index 23c9ebf8f..6f858779f 100644
--- a/rtengine/dcrop.h
+++ b/rtengine/dcrop.h
@@ -74,7 +74,6 @@ class Crop : public DetailedCrop, public EditBuffer {
     public:
         Crop             (ImProcCoordinator* parent, EditDataProvider *editDataProvider, bool isDetailWindow);
         virtual ~Crop    ();
-        int skip2;
 
         void mLock       () { cropMutex.lock(); }
         void mUnlock     () { cropMutex.lock(); }
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 0f0a3b876..cbfcc6691 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -29,13 +29,16 @@ typedef __m128i vint2;
 	#if __GNUC__ == 4 && __GNUC_MINOR__ >= 8
 		#define LVF(x) _mm_load_ps(&x)
 		#define LVFU(x) _mm_loadu_ps(&x)
+		#define STVF(x,y) _mm_store_ps(&x,y)
 	#else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3
 		#define LVF(x) _mm_loadu_ps(&x)
 		#define LVFU(x) _mm_loadu_ps(&x)
+		#define STVF(x,y) _mm_storeu_ps(&x,y)
 	#endif
 #else 
-#define LVF(x) _mm_load_ps(&x)
-#define LVFU(x) _mm_loadu_ps(&x)
+	#define LVF(x) _mm_load_ps(&x)
+	#define LVFU(x) _mm_loadu_ps(&x)
+	#define STVF(x,y) _mm_store_ps(&x,y)
 #endif
 #define LC2VFU(a) _mm_shuffle_ps( LVFU(a),  _mm_loadu_ps(  (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) )
 #define ZEROV _mm_setzero_ps()
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index 82a2dcef0..c805d240e 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -33,13 +33,13 @@ ImProcCoordinator::ImProcCoordinator ()
     : orig_prev(NULL), oprevi(NULL), oprevl(NULL), nprevl(NULL), previmg(NULL), workimg(NULL),
       ncie(NULL), imgsrc(NULL), shmap(NULL), lastAwbEqual(0.), ipf(&params, true), scale(10),
       highDetailPreprocessComputed(false), highDetailRawComputed(false), allocated(false),
-      bwAutoR(-9000.f), bwAutoG(-9000.f), bwAutoB(-9000.f), CAMMean(0.), chaut(0.f), redaut(0.f), blueaut(0.f), maxredaut(0.f), maxblueaut(0.f),nresi(0.f),
-		chromina(0.f), sigma(0.f), lumema(0.f),minredaut(0.f), minblueaut(0.f),
+      bwAutoR(-9000.f), bwAutoG(-9000.f), bwAutoB(-9000.f), CAMMean(0.),
 
-      hltonecurve(65536,LUT_CLIP_BELOW),  // used LUT_CLIP_BELOW, because we want to have a baseline of 2^expcomp in this curve. If we don't clip the lut we get wrong values, see Issue 2621 #14 for details
-      shtonecurve(65536,2),//clip above
+      hltonecurve(65536),
+      shtonecurve(65536),
       tonecurve(65536,0),//,1);
-
+	  chaut(0.f), redaut(0.f), blueaut(0.f), maxredaut(0.f), maxblueaut(0.f),minredaut(0.f), minblueaut(0.f),nresi(0.f),
+	  chromina(0.f), sigma(0.f), lumema(0.f),
       lumacurve(65536,0),
       chroma_acurve(65536,0),
       chroma_bcurve(65536,0),
@@ -82,11 +82,11 @@ ImProcCoordinator::ImProcCoordinator ()
       rcurvehist(256), rcurvehistCropped(256), rbeforehist(256),
       gcurvehist(256), gcurvehistCropped(256), gbeforehist(256),
       bcurvehist(256), bcurvehistCropped(256), bbeforehist(256),
-
+	  fullw(1),fullh(1),
       pW(-1), pH(-1),
-      plistener(NULL), imageListener(NULL), aeListener(NULL), hListener(NULL),acListener(NULL), abwListener(NULL),actListener(NULL),adnListener(NULL),
-      resultValid(false), changeSinceLast(0), updaterRunning(false), destroying(false),utili(false),autili(false),opautili(false),
-	  butili(false),ccutili(false),cclutili(false),clcutili(false),fullw(1),fullh(1)
+      plistener(NULL), imageListener(NULL), aeListener(NULL), acListener(NULL),abwListener(NULL),actListener(NULL),adnListener(NULL), hListener(NULL), 
+      resultValid(false), changeSinceLast(0), updaterRunning(false), destroying(false),utili(false),autili(false),
+	  butili(false),ccutili(false),cclutili(false),clcutili(false),opautili(false)
 
     {}
 
@@ -291,10 +291,15 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall) {
 		
 
 		if((noiseLCurve || noiseCCurve) &&  denoiseParams.enabled && (scale==1)){//only allocate memory if enabled and scale=1	
-			calclum = new Imagefloat (pW, pH);//for Luminance denoise curve
-				if(orig_prev !=  calclum)
-					orig_prev->copyData(calclum);
-		
+			// we only need image reduced to 1/4 here
+			calclum = new Imagefloat ((pW+1)/2, (pH+1)/2);//for luminance denoise curve
+			for(int ii=0;ii<pH;ii+=2){
+				for(int jj=0;jj<pW;jj+=2){
+					calclum->r(ii>>1,jj>>1) = orig_prev->r(ii,jj);
+					calclum->g(ii>>1,jj>>1) = orig_prev->g(ii,jj);
+					calclum->b(ii>>1,jj>>1) = orig_prev->b(ii,jj);
+				}
+			}
 			imgsrc->convertColorSpace(calclum, params.icm, currWB, params.raw);//claculate values after colorspace conversion
 		}
 		//always enabled to calculated auto Chroma
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 958df0277..9a67323dc 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -114,13 +114,6 @@ class ImProcCoordinator : public StagedImageProcessor {
         LUTf rCurve;
         LUTf gCurve;
         LUTf bCurve;
-        bool utili;
-        bool autili;
-        bool butili;
-        bool ccutili;
-        bool cclutili;
-        bool clcutili;
-		bool opautili;
         ToneCurve customToneCurve1;
         ToneCurve customToneCurve2;
         ColorGradientCurve ctColorCurve;
@@ -176,6 +169,13 @@ class ImProcCoordinator : public StagedImageProcessor {
         bool updaterRunning;
         ProcParams nextParams;
         bool destroying;
+        bool utili;
+        bool autili;
+        bool butili;
+        bool ccutili;
+        bool cclutili;
+        bool clcutili;
+		bool opautili;
 
         void startProcessing ();
         void process ();
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 5fda98920..c51f860d6 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -35,7 +35,6 @@
 #include "iccmatrices.h"
 #include "color.h"
 #include "calc_distort.h"
-#include "boxblur.h"
 #include "rt_math.h"
 #include "EdgePreservingDecomposition.h"
 #include "improccoordinator.h"
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 454838215..745c41dae 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -174,7 +174,6 @@ class ImProcFunctions {
 		bool iGamma; // true if inverse gamma has to be applied in rgbProc
 		double g;
 		static LUTf cachef;
-		float Qhigh;
 		double lumimul[3];
 //		float chau;
 //		float chred;
@@ -284,13 +283,13 @@ class ImProcFunctions {
 	//	void WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
 	//						   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab, wavelet_decomposition &wch, NoiImage * noi );
 		void WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-							   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf);
+							   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, const bool useNoiseLCurve, const bool useNoiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, bool autoch, bool perf);
 		void WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a, 
-							   wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut,float &minredaut, float & minblueaut, int schoice, bool autoch, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
+							   wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut,float &minredaut, float & minblueaut, int schoice, bool autoch, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
 							   float &maxchred, float &maxchblue,float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread);
 							   
 		void WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-										wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb,  float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf);
+										wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb, LabImage * noi, const bool useNoiseLCurve, const bool useNoiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, bool autoch, bool perf);
 		//void BiShrink(float * ReCoeffs, float * ImCoeffs, float * ReParents, float * ImParents, 
 		//			  int W, int H, int level, int padding, float noisevar);
 		//void Shrink(float ** WavCoeffs, int W, int H, int level, float noisevar);
@@ -298,11 +297,11 @@ class ImProcFunctions {
 	//				   int W_L, int H_L, int W_ab, int H_ab, int W_h, int H_h, int skip_L, int skip_ab, int skip_h, float noisevar_L, float noisevar_ab, float **WavCoeffs_h, LabImage * noi);
 
 		void ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, float **buffer, int level, 
-					   int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL,  float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, const NoiseCurve & dnNoisCurve,  const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int callby, bool autoch, bool perf, float * madaa = NULL, float * madab = NULL, float * madaL = NULL, bool madCalculated= false);
+					   int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float *noisevarlum, float *noisevarchrom, int width, int height, float noisevar_abr, float noisevar_abb,LabImage * noi, const bool useNoiseLCurve,  const bool useNoiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, bool autoch, bool perf, float * madaa = NULL, float * madab = NULL, float * madaL = NULL, bool madCalculated= false);
 		void ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
-					   int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc, 
+					   int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float noisevar_abr, float noisevar_abb,LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc, 
 					   float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread);
-		void Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b, int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid);
+		void Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b, int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid, bool denoiseMethodRgb);
 		void calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc);
 		float MadMax(float * HH_Coeffs, int &max, int datalen); 
 		float Mad(float * DataList, const int datalen);
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index 53276bd3e..75f4ea147 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -31,7 +31,6 @@
 #include "../rtgui/ppversion.h"
 #include "../rtgui/multilangmgr.h"
 #include "mytime.h"
-#include "StopWatch.h"
 #undef THREAD_PRIORITY_NORMAL
 #ifdef _OPENMP
 #include <omp.h>
@@ -132,7 +131,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
     params.dirpyrDenoise.getCurves(noiseLCurve, noiseCCurve);
 	float autoNR = (float) settings->nrauto;//
 	float autoNRmax = (float) settings->nrautomax;//
-	float autohigh = (float) settings->nrhigh;//
 	int tilesize;
 	int overlap;
 	if(settings->leveldnti ==0) {
@@ -162,9 +160,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 		float *pcsk = new float [nbtl];
 		float *Max_R_ =new float [nbtl];
 		float *Max_B_ = new float [nbtl];
-		float *Min_R_ =new float [nbtl];
-		float *Min_B_ = new float [nbtl];
-		float *delta_ = new float [nbtl];
 	//	printf("expert=%d\n",settings->leveldnautsimpl);
 		if(settings->leveldnautsimpl==1 && params.dirpyrDenoise.Cmethod=="PON") {
 			MyTime t1pone,t2pone;
@@ -185,7 +180,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 	//	int crH=tileHskip-10;//crop noise height
 //		Imagefloat *origCropPart;//init auto noise
 //			origCropPart = new Imagefloat (crW, crH);//allocate memory
-		StopWatch Stop1("denoise info tiled");
 		if (params.dirpyrDenoise.enabled) {//evaluate Noise
 		LUTf gamcurve(65536,0);
 		float gam, gamthresh, gamslope;
@@ -194,18 +188,24 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 		{
 			Imagefloat *origCropPart;//init auto noise
 			origCropPart = new Imagefloat (crW, crH);//allocate memory
-			Imagefloat *provicalc;
-			provicalc = new Imagefloat (crW, crH);	
+			Imagefloat *provicalc = new Imagefloat ((crW+1)/2, (crH+1)/2);//for denoise curves
 			int skipP=1;
-			#pragma omp for schedule(dynamic) collapse(2) nowait		
+			#pragma omp for schedule(dynamic) collapse(2) nowait
 				for(int wcr=0;wcr<numtiles_W;wcr++) {
 					for(int hcr=0;hcr<numtiles_H;hcr++) {
 						int beg_tileW=wcr*tileWskip + tileWskip/2.f - crW/2.f;
 						int beg_tileH=hcr*tileHskip + tileHskip/2.f - crH/2.f;
 						PreviewProps ppP (beg_tileW , beg_tileH, crW, crH, skipP);
 						imgsrc->getImage (currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
-						if(origCropPart !=  provicalc)
-						origCropPart->copyData(provicalc);
+
+						// we only need image reduced to 1/4 here
+						for(int ii=0;ii<crH;ii+=2){
+							for(int jj=0;jj<crW;jj+=2){
+								provicalc->r(ii>>1,jj>>1) = origCropPart->r(ii,jj);
+								provicalc->g(ii>>1,jj>>1) = origCropPart->g(ii,jj);
+								provicalc->b(ii>>1,jj>>1) = origCropPart->b(ii,jj);
+							}
+						}
 						imgsrc->convertColorSpace(provicalc, params.icm, currWB, params.raw);//for denoise luminance curve
 						float maxr=0.f;
 						float maxb=0.f;
@@ -279,8 +279,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 				if(liss==3){//same as auto but with much cells
 					float MaxR=0.f;
 					float MaxB=0.f;
-					float MinR=100000000000.f;
-					float MinB=100000000000.f;
 					float MaxRMoy=0.f;
 					float MaxBMoy=0.f;
 					
@@ -316,7 +314,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 			}
 			
 		}
-		Stop1.stop();
 	}
 
 		
@@ -350,15 +347,21 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 			{
 				Imagefloat *origCropPart;//init auto noise		
 				origCropPart = new Imagefloat (crW, crH);//allocate memory
-				Imagefloat *provicalc;
-				provicalc = new Imagefloat (crW, crH);			
+				Imagefloat *provicalc = new Imagefloat ((crW+1)/2, (crH+1)/2);//for denoise curves
 
 				#pragma omp for schedule(dynamic) collapse(2) nowait				
 				for(int wcr=0;wcr<=2;wcr++) {
 					for(int hcr=0;hcr<=2;hcr++) {
 						PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, 1);
 						imgsrc->getImage (currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw);
-						origCropPart->copyData(provicalc);
+						// we only need image reduced to 1/4 here
+						for(int ii=0;ii<crH;ii+=2){
+							for(int jj=0;jj<crW;jj+=2){
+								provicalc->r(ii>>1,jj>>1) = origCropPart->r(ii,jj);
+								provicalc->g(ii>>1,jj>>1) = origCropPart->g(ii,jj);
+								provicalc->b(ii>>1,jj>>1) = origCropPart->b(ii,jj);
+							}
+						}
 						imgsrc->convertColorSpace(provicalc, params.icm, currWB, params.raw);//for denoise luminance curve
 						int nb = 0;
 						float chaut=0.f, redaut=0.f, blueaut=0.f, maxredaut=0.f, maxblueaut=0.f, minredaut=0.f, minblueaut=0.f, nresi=0.f, highresi=0.f, chromina=0.f, sigma=0.f, lumema=0.f, sigma_L=0.f, redyel=0.f, skinc=0.f, nsknc=0.f;
@@ -407,12 +410,11 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 			
 				if(!imgsrc->isRAW()) multip=2.f;//take into account gamma for TIF / JPG approximate value...not good fot gamma=1
 
-				float maxmax;
 				float delta[9];
 				int mode=1;
 				int lissage=settings->leveldnliss;
 				for(int k=0;k<9;k++) {
-					maxmax = max(max_r[k],max_b[k]);
+					float maxmax = max(max_r[k],max_b[k]);
 					ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax, lumL[k], chromC[k], mode, lissage, ry[k], sk[k], pcsk[k] );
 				//	printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]);
 				}
@@ -527,19 +529,24 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 				denoiseParams.luma = 0.0f;
 		} else if(denoiseParams.Lmethod=="SLI")
 			noiseLCurve.Reset();
-
 	if (denoiseParams.enabled  && (noiseLCurve || noiseCCurve )) {
-
-		calclum = new Imagefloat (fw, fh);//for luminance denoise curve
-			if(baseImg !=  calclum)
-				baseImg->copyData(calclum);
-			imgsrc->convertColorSpace(calclum, params.icm, currWB, params.raw);
+		// we only need image reduced to 1/4 here
+		calclum = new Imagefloat ((fw+1)/2, (fh+1)/2);//for luminance denoise curve
+#pragma omp parallel for
+		for(int ii=0;ii<fh;ii+=2){
+			for(int jj=0;jj<fw;jj+=2){
+				calclum->r(ii>>1,jj>>1) = baseImg->r(ii,jj);
+				calclum->g(ii>>1,jj>>1) = baseImg->g(ii,jj);
+				calclum->b(ii>>1,jj>>1) = baseImg->b(ii,jj);
+			}
+		}
+		imgsrc->convertColorSpace(calclum, params.icm, currWB, params.raw);
 	}
     if (denoiseParams.enabled) {
        // CurveFactory::denoiseLL(lldenoiseutili, denoiseParams.lcurve, Noisecurve,1);
 		//denoiseParams.getCurves(noiseLCurve);	
 //		ipf.RGB_denoise(baseImg, baseImg, calclum, imgsrc->isRAW(), denoiseParams, params.defringe, imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, lldenoiseutili);
-		float chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc;
+		float chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi;
 		int kall=2;
 		ipf.RGB_denoise(kall, baseImg, baseImg, calclum, ch_M, max_r, max_b, imgsrc->isRAW(), denoiseParams, imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi);
  
@@ -557,9 +564,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 	delete [] pcsk;
     delete [] Max_R_;
     delete [] Max_B_;
-    delete [] Min_R_;
-    delete [] Min_B_;
-    delete [] delta_;
 	
     imgsrc->convertColorSpace(baseImg, params.icm, currWB, params.raw);
 
@@ -589,8 +593,8 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
     }
     // RGB processing
 
-    LUTf curve1 (65536,LUT_CLIP_BELOW); // used LUT_CLIP_BELOW, because we want to have a baseline of 2^expcomp in this curve. If we don't clip the lut we get wrong values, see Issue 2621 #14 for details
-    LUTf curve2 (65536,0);
+    LUTf curve1 (65536);
+    LUTf curve2 (65536);
 	LUTf curve (65536,0);
 	LUTf satcurve (65536,0);
 	LUTf lhskcurve (65536,0);
@@ -792,8 +796,8 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 					customColCurve2,
 					customColCurve3,
 					1);
-	double adap;
 	if(params.colorappearance.enabled){
+		double adap;
 		float fnum = imgsrc->getMetaData()->getFNumber  ();// F number
 		float fiso = imgsrc->getMetaData()->getISOSpeed () ;// ISO
 		float fspeed = imgsrc->getMetaData()->getShutterSpeed () ;//speed
@@ -819,7 +823,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 			else ipf.ciecam_02 (cieView, adap, begh, endh,1,2, labView, &params,customColCurve1,customColCurve2,customColCurve3, dummy, dummy, CAMBrightCurveJ, CAMBrightCurveQ, CAMMean, 5, 1, true, dd, 1, 1);
 		}
 		else {
-			int f_h=2,f_w=2;
 			float d;
 
 			double dd;
@@ -852,7 +855,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
     Image16* readyImg = NULL;
     cmsHPROFILE jprof = NULL;
     bool customGamma = false;
-    bool useLCMS;
+    bool useLCMS = false;
 
     if(params.icm.gamma != "default" || params.icm.freegamma) { // if select gamma output between BT709, sRGB, linear, low, high, 2.2 , 1.8
         cmsMLU *DescriptionMLU, *CopyrightMLU, *DmndMLU, *DmddMLU;// for modification TAG
@@ -860,15 +863,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
         cmsToneCurve* GammaTRC[3] = { NULL, NULL, NULL };
         cmsFloat64Number Parameters[7];
         double ga0,ga1,ga2,ga3,ga4,ga5,ga6;
-       // wchar_t string[80] ;
-        int ns;//numero of stri[]
-        if      (params.icm.working=="ProPhoto")   ns=0;
-        else if (params.icm.working=="Adobe RGB")  ns=1;
-        else if (params.icm.working=="sRGB")  	   ns=2;
-        else if (params.icm.working=="WideGamut")  ns=3;
-        else if (params.icm.working=="Beta RGB")   ns=4;
-        else if (params.icm.working=="BestRGB")    ns=5;
-        else if (params.icm.working=="BruceRGB")   ns=6;
 	//	if(params.blackwhite.enabled) params.toneCurve.hrenabled=false;
         readyImg = ipf.lab2rgb16b (labView, cx, cy, cw, ch, params.icm.output, params.icm.working, params.icm.gamma, params.icm.freegamma, params.icm.gampos, params.icm.slpos, ga0,ga1,ga2,ga3,ga4,ga5,ga6, params.blackwhite.enabled );
         customGamma = true;
diff --git a/rtengine/sleef.c b/rtengine/sleef.c
index fa3eb22ea..c7b3fb486 100755
--- a/rtengine/sleef.c
+++ b/rtengine/sleef.c
@@ -1212,26 +1212,40 @@ __inline float xexpf(float d) {
 
 }
 
-
 __inline float xmul2f(float d) {
-	if (*(int*)&d & 0x7FFFFFFF) { // if f==0 do nothing
-		*(int*)&d += 1 << 23; // add 1 to the exponent
-		}
-	return d;
+	union {
+		float floatval;
+		int intval;
+	} uflint;
+	uflint.floatval = d;
+	if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
+		uflint.intval += 1 << 23; // add 1 to the exponent
+	}
+	return uflint.floatval;
 }
 
 __inline float xdiv2f(float d) {
-	if (*(int*)&d & 0x7FFFFFFF) { // if f==0 do nothing
-		*(int*)&d -= 1 << 23; // sub 1 from the exponent
+	union {
+		float floatval;
+		int intval;
+	} uflint;
+	uflint.floatval = d;
+	if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
+		uflint.intval -= 1 << 23; // sub 1 from the exponent
 		}
-	return d;
+	return uflint.floatval;
 }
 
 __inline float xdivf( float d, int n){
-	if (*(int*)&d & 0x7FFFFFFF) { // if f==0 do nothing
-		*(int*)&d -= n << 23; // add n to the exponent
+	union {
+		float floatval;
+		int intval;
+	} uflint;
+	uflint.floatval = d;
+	if (uflint.intval & 0x7FFFFFFF) { // if f==0 do nothing
+		uflint.intval -= n << 23; // add n to the exponent
 		}
-	return d;
+	return uflint.floatval;
 }	
 
 
diff --git a/rtgui/options.h b/rtgui/options.h
index 52422eec2..c32f67730 100644
--- a/rtgui/options.h
+++ b/rtgui/options.h
@@ -49,7 +49,7 @@ class SaveFormat {
         int tiffBits;
         bool tiffUncompressed;
         bool saveParams;
-        SaveFormat () : format("jpg"), jpegQuality(90), jpegSubSamp(2), pngCompression(6), pngBits(8), tiffBits(8), tiffUncompressed(true), saveParams(true) {};
+        SaveFormat () : format("jpg"), pngBits(8), pngCompression(6), jpegQuality(90), jpegSubSamp(2), tiffBits(8), tiffUncompressed(true), saveParams(true) {};
 };
 
 enum ThFileType {FT_Invalid=-1, FT_None=0, FT_Raw=1, FT_Jpeg=2, FT_Tiff=3, FT_Png=4, FT_Custom=5, FT_Tiff16=6, FT_Png16=7, FT_Custom16=8};