From 8cb59b06c4f74abc9adfe8ee8606fcf6a5818ab2 Mon Sep 17 00:00:00 2001
From: Ingo <heckflosse@i-weyrich.de>
Date: Sun, 14 Dec 2014 19:07:36 +0100
Subject: [PATCH] Speedup and some bugfixes for Noise Reduction, Issue 2557 #60

---
 rtengine/CMakeLists.txt               |    2 +-
 rtengine/FTblockDN.cc                 | 4455 ++++++++++++-------------
 rtengine/LUT.h                        |    7 +-
 rtengine/boxblur.h                    |   88 +-
 rtengine/color.cc                     |   18 +-
 rtengine/color.h                      |   15 +-
 rtengine/cplx_wavelet_dec.cc          |   13 -
 rtengine/cplx_wavelet_dec.h           |  373 +--
 rtengine/cplx_wavelet_filter_coeffs.h |   99 -
 rtengine/cplx_wavelet_level.h         |  676 ++--
 rtengine/dcrop.cc                     |  433 ++-
 rtengine/improcfun.cc                 |    1 -
 rtengine/improcfun.h                  |   30 +-
 rtengine/opthelper.h                  |    9 +-
 rtengine/procparams.cc                |   22 +-
 rtengine/procparams.h                 |    2 +-
 rtengine/simpleprocess.cc             |   13 +-
 rtengine/sleefsseavx.c                |   11 +-
 18 files changed, 2751 insertions(+), 3516 deletions(-)

diff --git a/rtengine/CMakeLists.txt b/rtengine/CMakeLists.txt
index 796d5033d..b2699a9ef 100644
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@@ -15,7 +15,7 @@ set (RTENGINESOURCEFILES safegtk.cc colortemp.cc curves.cc flatcurves.cc diagona
     processingjob.cc rtthumbnail.cc utils.cc labimage.cc slicer.cc cieimage.cc
     iplab2rgb.cc ipsharpen.cc iptransform.cc ipresize.cc ipvibrance.cc
     imagedimensions.cc jpeg_memsrc.cc jdatasrc.cc iimage.cc
-    EdgePreserveLab.cc EdgePreservingDecomposition.cc cplx_wavelet_dec.cc FTblockDN.cc
+    EdgePreservingDecomposition.cc cplx_wavelet_dec.cc FTblockDN.cc
     PF_correct_RT.cc 
     dirpyr_equalizer.cc
     calc_distort.cc lcp.cc dcp.cc
diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 778265db0..f9d809dfb 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -22,14 +22,10 @@
 //
 ////////////////////////////////////////////////////////////////
 
-
-
-
 #include <math.h>
 #include <fftw3.h>
 #include "../rtgui/threadutils.h"
 
-#include "gauss.h"
 #include "rtengine.h"
 #include "improcfun.h"
 #include "LUT.h"
@@ -40,7 +36,6 @@
 #include "mytime.h"
 #include "sleef.c"
 #include "opthelper.h"
-#include "StopWatch.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -137,7 +132,8 @@ namespace rtengine {
 
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
+
+
 extern const Settings* settings;
 
 // Median calculation using quicksort
@@ -244,15 +240,17 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt
 		
 	//	printf("Nw=%d NH=%d tileW=%d tileH=%d\n",numtiles_W,numtiles_H,tileWskip,tileHskip);
 }
+
+int denoiseNestedLevels = 1;
+
 
-	void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi)
-	{
+void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst,Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi)
+{
 //#ifdef _DEBUG
 	MyTime t1e,t2e;
 	t1e.set();
 //#endif
 
-	
 	if (dnparams.luma==0 && dnparams.chroma==0  && !dnparams.median && !noiseLCurve && !noiseCCurve) {
 		//nothing to do; copy src to dst or do nothing in case src == dst
 		if(src != dst)
@@ -264,94 +262,91 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt
 			
 		return;
 	}
-		static MyMutex FftwMutex;
-		MyMutex::MyLock lock(FftwMutex);
-		int hei,wid;
-	//	float LLum,AAum,BBum;
-		float** lumcalc;
-		float** acalc;
-		float** bcalc;
-		if(noiseLCurve || noiseCCurve)	{	
-			hei=calclum->height;
-			wid=calclum->width;
-			TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
+	static MyMutex FftwMutex;
+	MyMutex::MyLock lock(FftwMutex);
+	int hei,wid;
+	float** lumcalc;
+	float** acalc;
+	float** bcalc;
+	if(noiseLCurve || noiseCCurve) {
+		hei=calclum->height;
+		wid=calclum->width;
+		TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
 
-			double wpi[3][3] = {
-				{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
-				{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
-				{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
-				};
-			lumcalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					lumcalc[i] = new float[wid];
-			acalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					acalc[i] = new float[wid];
-			bcalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					bcalc[i] = new float[wid];
-												
+		double wpi[3][3] = {
+			{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
+			{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
+			{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
+			};
+		lumcalc = new float*[hei];
+			for (int i=0; i<hei; i++)
+				lumcalc[i] = new float[wid];
+		acalc = new float*[hei];
+			for (int i=0; i<hei; i++)
+				acalc[i] = new float[wid];
+		bcalc = new float*[hei];
+			for (int i=0; i<hei; i++)
+				bcalc[i] = new float[wid];
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif		
-			for(int ii=0;ii<hei;ii++){
-				for(int jj=0;jj<wid;jj++){
-					float LLum,AAum,BBum;
-				
-					float RL = calclum->r(ii,jj);
-					float GL = calclum->g(ii,jj);
-					float BL = calclum->b(ii,jj);
-								// determine luminance for noisecurve
-					float XL,YL,ZL;
-					Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi);
-					Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);							
-					lumcalc[ii][jj]=LLum;
-					acalc[ii][jj]=AAum;
-					bcalc[ii][jj]=BBum;
-				}
+		for(int ii=0;ii<hei;ii++){
+			for(int jj=0;jj<wid;jj++){
+				float LLum,AAum,BBum;
+			
+				float RL = calclum->r(ii,jj);
+				float GL = calclum->g(ii,jj);
+				float BL = calclum->b(ii,jj);
+							// determine luminance for noisecurve
+				float XL,YL,ZL;
+				Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi);
+				Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);							
+				lumcalc[ii][jj]=LLum;
+				acalc[ii][jj]=AAum;
+				bcalc[ii][jj]=BBum;
 			}
-		
-		delete calclum;
+		}
+	delete calclum;
+	calclum = NULL;
 }		
 
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-		/*if (plistener) {
-		 plistener->setProgressStr ("Denoise...");
-		 plistener->setProgress (0.0);
-		 }*/
+	/*if (plistener) {
+	 plistener->setProgressStr ("Denoise...");
+	 plistener->setProgress (0.0);
+	 }*/
 
 //		volatile double progress = 0.0;
 
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-		const short int imheight=src->height, imwidth=src->width;
-	//	printf("imW=%d imH=%d\n",imwidth,imheight);
-	//	printf("Chroma=%f\n", dnparams.chroma);
-	Qhigh=1.0f;	
-	if(dnparams.smethod=="shalbi") Qhigh=1.f/(float) settings->nrhigh;
+	const short int imheight=src->height, imwidth=src->width;
+	Qhigh=1.0f;
+	if(dnparams.smethod=="shalbi")
+		Qhigh=1.f/(float) settings->nrhigh;
 	if (dnparams.luma!=0 || dnparams.chroma!=0 || dnparams.methodmed=="Lab" || dnparams.methodmed=="Lonly" ) {
-		const bool perf = (dnparams.dmethod=="RGB");
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+		const bool perf = (dnparams.dmethod=="RGB");
 		// gamma transform for input data
 		float gam = dnparams.gamma;
 		float gamthresh = 0.001f;
 		if(!isRAW) {//reduce gamma under 1 for Lab mode ==> TIF and JPG
-		 if(gam <1.9f) gam=1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
-		 else if (gam >= 1.9f && gam <= 3.f) gam=(1.4f/1.1f)*gam - 1.41818f;
-		 }
+			if(gam <1.9f)
+				gam=1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
+			else if (gam >= 1.9f && gam <= 3.f)
+				gam=(1.4f/1.1f)*gam - 1.41818f;
+		}
 		float gamslope = exp(log((double)gamthresh)/gam)/gamthresh;
 
-		LUTf gamcurve(65536,0);
+		LUTf gamcurve(65536,LUT_CLIP_BELOW);
 		if(perf) {
-		for (int i=0; i<65536; i++) {
-			gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
-		}
-		}
-		else  {
-		for (int i=0; i<65536; i++) {
-			gamcurve[i] = (Color::gamman((double)i/65535.0,gam)) * 32768.0f;
-		}
+			for (int i=0; i<65536; i++) {
+				gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
+			}
+		} else {
+			for (int i=0; i<65536; i++) {
+				gamcurve[i] = (Color::gamman((double)i/65535.0,gam)) * 32768.0f;
+			}
 		}
 
 		// inverse gamma transform for output data
@@ -359,29 +354,26 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt
 		float igamthresh = gamthresh*gamslope;
 		float igamslope = 1.f/gamslope;
 
-		LUTf igamcurve(65536,0);
+		LUTf igamcurve(65536,LUT_CLIP_BELOW);
 		if(perf) {
-		for (int i=0; i<65536; i++) {
-			igamcurve[i] = (Color::gamma((float)i/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
+			for (int i=0; i<65536; i++) {
+				igamcurve[i] = (Color::gamma((float)i/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
+			}
+		} else {
+			for (int i=0; i<65536; i++) {
+				igamcurve[i] = (Color::gamman((float)i/32768.0f,igam) * 65535.0f);
+			}
 		}
-		}
-		else {
-		for (int i=0; i<65536; i++) {
-			igamcurve[i] = (Color::gamman((float)i/32768.0f,igam) * 65535.0f);
-		}
-
-		}
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		//srand((unsigned)time(0));//test with random data
 
 		const float gain = pow (2.0f, float(expcomp));
 		float incr=1.f;
 		float noisevar_Ldetail = SQR((float)(SQR(100.-dnparams.Ldetail) + 50.*(100.-dnparams.Ldetail)) * TS * 0.5f * incr);
 		bool enhance_denoise = dnparams.enhance;
-//		bool median_denoise = dnparams.median;
 		int gamlab = settings->denoiselabgamma;//gamma lab essentialy for Luminance detail
-		if(gamlab > 2) gamlab=2;
-		if(settings->verbose) printf("Denoise Lab=%i\n",gamlab);
+		if(gamlab > 2)
+			gamlab=2;
+		if(settings->verbose)
+			printf("Denoise Lab=%i\n",gamlab);
 
 		array2D<float> tilemask_in(TS,TS);
 		array2D<float> tilemask_out(TS,TS);
@@ -392,1306 +384,1366 @@ void ImProcFunctions::Tile_calc (int tilesize, int overlap, int kall, int imwidt
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int i=0; i<TS; i++) {
-			float i1 = abs((i>TS/2 ? i-TS+1 : i));
-			float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-			float vmask2 = (i1<2*border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-			for (int j=0; j<TS; j++) {
-				float j1 = abs((j>TS/2 ? j-TS+1 : j));
-				tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
+	for (int i=0; i<TS; i++) {
+		float i1 = abs((i>TS/2 ? i-TS+1 : i));
+		float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
+		float vmask2 = (i1<2*border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
+		for (int j=0; j<TS; j++) {
+			float j1 = abs((j>TS/2 ? j-TS+1 : j));
+			tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
+			tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
 
-				tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-
-			}
 		}
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		// begin tile processing of image
-
-		//output buffer
-		Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
-		for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
-
-	int tilesize;
-	int overlap;
-	if(settings->leveldnti ==0) {
-		tilesize = 1024;
-		overlap = 128;
-	}
-	if(settings->leveldnti ==1) {
-		tilesize = 768;
-		overlap = 96;
 	}
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	// begin tile processing of image
 
-		int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip;
-		
-		Tile_calc (tilesize, overlap, 2, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
-		//now we have tile dimensions, overlaps
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	//output buffer
+	Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
+	for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
 
-        // According to FFTW-Doc 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
-        // outside the parallel region and use them inside the parallel region.
+int tilesize;
+int overlap;
+if(settings->leveldnti ==0) {
+	tilesize = 1024;
+	overlap = 128;
+}
+if(settings->leveldnti ==1) {
+	tilesize = 768;
+	overlap = 96;
+}
 
-        // calculate max size of numblox_W.
-        int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
-        // calculate min size of numblox_W.
-		int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
+	int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip;
+	
+	Tile_calc (tilesize, overlap, 2, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
+	//now we have tile dimensions, overlaps
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-        // these are needed only for creation of the plans and will be freed before entering the parallel loop
-		float * Lbloxtmp;
-		float * fLbloxtmp;
-		Lbloxtmp  = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof (float));
-		fLbloxtmp = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof (float));
+	// According to FFTW-Doc 'it is safe to execute the same plan in parallel by multiple threads', so we now create 4 plans
+	// outside the parallel region and use them inside the parallel region.
 
-		int nfwd[2]={TS,TS};
+	// calculate max size of numblox_W.
+	int max_numblox_W = ceil(((float)(MIN(imwidth,tilewidth)))/(offset))+2*blkrad;
+	// calculate min size of numblox_W.
+	int min_numblox_W = ceil(((float)((MIN(imwidth,((numtiles_W - 1) * tileWskip) + tilewidth) ) - ((numtiles_W - 1) * tileWskip)))/(offset))+2*blkrad;
 
-        //for DCT:
-		const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
-		const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
+	// these are needed only for creation of the plans and will be freed before entering the parallel loop
+	float * Lbloxtmp;
+	float * fLbloxtmp;
+	Lbloxtmp  = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof (float));
+	fLbloxtmp = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof (float));
 
-        fftwf_plan plan_forward_blox[2];
-        fftwf_plan plan_backward_blox[2];
+	int nfwd[2]={TS,TS};
 
-        // Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit
-		plan_forward_blox[0]  = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
-		plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
-		plan_forward_blox[1]  = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
-		plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
-		fftwf_free ( Lbloxtmp );
-		fftwf_free ( fLbloxtmp );
+	//for DCT:
+	const fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10};
+	const fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01};
+
+	fftwf_plan plan_forward_blox[2];
+	fftwf_plan plan_backward_blox[2];
+
+	// Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit
+	plan_forward_blox[0]  = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+	plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+	plan_forward_blox[1]  = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, NULL, 1, TS*TS, fLbloxtmp, NULL, 1, TS*TS, fwdkind, FFTW_MEASURE );
+	plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, NULL, 1, TS*TS, Lbloxtmp, NULL, 1, TS*TS, bwdkind, FFTW_MEASURE );
+	fftwf_free ( Lbloxtmp );
+	fftwf_free ( fLbloxtmp );
+
+	int numthreads = 1;
+
 #ifdef _OPENMP
-        // Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
-        int numtiles = numtiles_W * numtiles_H;
-        int numthreads = MIN(numtiles,omp_get_max_threads());
-        if(options.rgbDenoiseThreadLimit > 0) numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit);
-        // Issue 1887, overide setting of 1, if more than one thread is available. This way the inner omp-directives should become inactive
-        if(numthreads == 1 && omp_get_max_threads() > 1)
-			numthreads = 2;
-#pragma omp parallel num_threads(numthreads)
-#endif
-        {
-		float resid=0.f;
-		float nbresid=0;
-		float maxredresid=0.f;
-		float maxblueresid=0.f;
-		float residred=0.f;
-		float residblue=0.f;
-		
-		//DCT block data storage
-		float * Lblox;
-		float * fLblox;
-		int pos;
-    TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
-
-    double wp[3][3] = {
-        {wprof[0][0],wprof[0][1],wprof[0][2]},
-		{wprof[1][0],wprof[1][1],wprof[1][2]},
-		{wprof[2][0],wprof[2][1],wprof[2][2]}
-    };
-
+	// Calculate number of tiles. If less than omp_get_max_threads(), then limit num_threads to number of tiles
+	int numtiles = numtiles_W * numtiles_H;
+	numthreads = MIN(numtiles,omp_get_max_threads());
+	if(options.rgbDenoiseThreadLimit > 0)
+		numthreads = MIN(numthreads,options.rgbDenoiseThreadLimit);
+	denoiseNestedLevels = omp_get_max_threads() / numthreads;
+	bool oldNested = omp_get_nested();
+	if(denoiseNestedLevels < 2)
+		denoiseNestedLevels = 1;
+	else
+		omp_set_nested(true);
+	if(options.rgbDenoiseThreadLimit > 0)
+		while(denoiseNestedLevels*numthreads > options.rgbDenoiseThreadLimit)
+			denoiseNestedLevels--;
+	if(settings->verbose)
+		printf("RGB_denoise uses %d main thread(s) and up to %d nested thread(s) for each main thread\n",numthreads,denoiseNestedLevels);
+#endif
+
+	float *nbrwtArray[denoiseNestedLevels*numthreads];
+	float *LbloxArray[denoiseNestedLevels*numthreads];
+	float *fLbloxArray[denoiseNestedLevels*numthreads];
+
+	for(int i=0;i<denoiseNestedLevels*numthreads;i++) {
+		LbloxArray[i]  = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof(float));
+		fLbloxArray[i] = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof(float));
+		nbrwtArray[i] = new float[TS*TS];
+	}
+
 	TMatrix wiprof = iccStore->workingSpaceInverseMatrix (params->icm.working);
 	//inverse matrix user select
-	double wip[3][3] = {
+	const float wip[3][3] = {
 		{wiprof[0][0],wiprof[0][1],wiprof[0][2]},
 		{wiprof[1][0],wiprof[1][1],wiprof[1][2]},
 		{wiprof[2][0],wiprof[2][1],wiprof[2][2]}
 	};
-	//	int wcr,hcr;
-	
+
+	TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
+
+	const float wp[3][3] = {
+		{wprof[0][0],wprof[0][1],wprof[0][2]},
+		{wprof[1][0],wprof[1][1],wprof[1][2]},
+		{wprof[2][0],wprof[2][1],wprof[2][2]}
+	};
+
+
 #ifdef _OPENMP
-#pragma omp critical
+#pragma omp parallel num_threads(numthreads)
 #endif
-        {
-		Lblox  = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof(float));
-		fLblox = (float*) fftwf_malloc(max_numblox_W*TS*TS*sizeof(float));
-        }
-        
-        float * nbrwt = new float[TS*TS];
-        float * blurbuffer = new float[TS*TS];
+	{
+	float resid=0.f;
+	float nbresid=0;
+	float maxredresid=0.f;
+	float maxblueresid=0.f;
+	float residred=0.f;
+	float residblue=0.f;
+	
+	int pos;
+
+
 #ifdef _OPENMP
 #pragma omp for schedule(dynamic) collapse(2)
 #endif
 
-		for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
-			for (int tileleft=0; tileleft<imwidth ; tileleft+=tileWskip) {
-				//printf("titop=%d tileft=%d\n",tiletop/tileHskip, tileleft/tileWskip);
-				pos= (tiletop/tileHskip)*numtiles_W + tileleft/tileWskip ;
-		//	    printf("posw=%d\n", pos);
-		/*		if(kall==0) {//with Dcrop
-					//int trafx, int trafy, int trafw, int trafh, // trafx trafy = begin window trafw trafh = width and heigh window
-					//int widIm, int heiIm : width and height full image
-					int centerwCrop=trafx+trafw/2;//center crop window
-					int centerhCrop=trafy+trafh/2;
-					int poscalculate; 
-			//	for(int wcr=0;wcr< ;wcr++) {
-			//	for(int hcr=0;hcr<numtiles_H;hcr++) {
-			//	    int posca= hcr*numtiles_W + wcr;
+	for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
+		for (int tileleft=0; tileleft<imwidth ; tileleft+=tileWskip) {
+			//printf("titop=%d tileft=%d\n",tiletop/tileHskip, tileleft/tileWskip);
+			pos= (tiletop/tileHskip)*numtiles_W + tileleft/tileWskip ;
+	//	    printf("posw=%d\n", pos);
+	/*		if(kall==0) {//with Dcrop
+				//int trafx, int trafy, int trafw, int trafh, // trafx trafy = begin window trafw trafh = width and heigh window
+				//int widIm, int heiIm : width and height full image
+				int centerwCrop=trafx+trafw/2;//center crop window
+				int centerhCrop=trafy+trafh/2;
+				int poscalculate; 
+		//	for(int wcr=0;wcr< ;wcr++) {
+		//	for(int hcr=0;hcr<numtiles_H;hcr++) {
+		//	    int posca= hcr*numtiles_W + wcr;
+		
+			}
+			*/
+			int tileright = MIN(imwidth,tileleft+tilewidth);
+			int tilebottom = MIN(imheight,tiletop+tileheight);
+			int width  = tileright-tileleft;
+			int height = tilebottom-tiletop;
+			bool ponder=false;
+			float ponderCC=1.f;
+			if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PON") {ponder=true;ponderCC=0.5f;}
+			if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PRE") {ponderCC=0.5f;}
+			if(settings->leveldnautsimpl==0 && params->dirpyrDenoise.Cmethod=="PREV") {ponderCC=0.5f;}
 			
-				}
-				*/
-				int tileright = MIN(imwidth,tileleft+tilewidth);
-				int tilebottom = MIN(imheight,tiletop+tileheight);
-				int width  = tileright-tileleft;
-				int height = tilebottom-tiletop;
-				float noisevarab_b, noisevarab_r;
-				bool ponder=false;
-				float ponderCC=1.f;
-				if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PON") {ponder=true;ponderCC=0.5f;}
-				if(settings->leveldnautsimpl==1 && params->dirpyrDenoise.Cmethod=="PRE") {ponderCC=0.5f;}
-				if(settings->leveldnautsimpl==0 && params->dirpyrDenoise.Cmethod=="PREV") {ponderCC=0.5f;}
-				
-				float realred2, realred, realblue, realblue2;
-				float interm_med =(float) dnparams.chroma/10.0;
-				float intermred, intermblue;
-				if(dnparams.redchro > 0.) intermred=(dnparams.redchro/10.); else intermred= (float) dnparams.redchro/7.0;//increase slower than linear for more sensit
-				if(dnparams.bluechro > 0.) intermblue=(dnparams.bluechro/10.); else intermblue= (float) dnparams.bluechro/7.0;//increase slower than linear for more sensit
-				if(ponder && kall==2){interm_med=ch_M[pos]/10.f; intermred=max_r[pos]/10.f;intermblue=max_b[pos]/10.f;}							
-				if(ponder && kall==0){interm_med=0.01f; intermred=0.f;intermblue=0.f;}							
-				realred = interm_med + intermred; if (realred < 0.f) realred=0.001f;
-				realblue = interm_med + intermblue; if (realblue < 0.f) realblue=0.001f;
-			//	printf("Ch=%f red=%f blu=%f\n",interm_med, intermred,intermblue );
-				
-				//input L channel
-				array2D<float> Lin(width,height);
-				//wavelet denoised image
-				LabImage * labdn = new LabImage(width,height);
-				float* mad_LL = new float [height*width];
-				float** noisevarlum;
-						noisevarlum = new float*[height];
-							for (int i=0; i<height; i++)
-							noisevarlum[i] = new float[width];
-							
-				float* mad_aa = new float [height*width];
-				float* mad_bb = new float [height*width];
-				float** noisevarchrom;
-						noisevarchrom = new float*[height];
-							for (int i=0; i<height; i++)
-							noisevarchrom[i] = new float[width];
-				
-				//residual between input and denoised L channel
-				array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
-				//pixel weight
-				array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
-				// init luma noisevarL
-				float noisevarL	 = (float) (SQR((dnparams.luma/125.0)*(1.+ dnparams.luma/25.0)));
-			//	printf("nova=%f\n",noisevarL);
-				//TODO: implement using AlignedBufferMP
-				//fill tile from image; convert RGB to "luma/chroma"
-				if (isRAW) {//image is raw; use channel differences for chroma channels
-					if(!perf){//lab mode
-							//modification Jacques feb 2013 and july 2014					
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
-							int j1 = j - tileleft;
-							float R_ = gain*src->r(i,j);
-							float G_ = gain*src->g(i,j);
-							float B_ = gain*src->b(i,j);
-							float Llum,alum,blum;
-							
-							if(noiseLCurve) {
-								Llum=lumcalc[i][j];
-							}
-							if(noiseCCurve) {
-								alum=acalc[i][j];
-								blum=bcalc[i][j];
-							}
-							//modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10
-							//we can put other as gamma g=2.6 slope=11, etc.
-							// but noting to do with real gamma !!!: it's only for data Lab # data RGB
-							//finally I opted fot gamma55 and with options we can change
-							if (gamlab == 0) {// options 12/2013
+			float realred2, realred, realblue, realblue2;
+			float interm_med =(float) dnparams.chroma/10.0;
+			float intermred, intermblue;
+			if(dnparams.redchro > 0.) intermred=(dnparams.redchro/10.); else intermred= (float) dnparams.redchro/7.0;//increase slower than linear for more sensit
+			if(dnparams.bluechro > 0.) intermblue=(dnparams.bluechro/10.); else intermblue= (float) dnparams.bluechro/7.0;//increase slower than linear for more sensit
+			if(ponder && kall==2){interm_med=ch_M[pos]/10.f; intermred=max_r[pos]/10.f;intermblue=max_b[pos]/10.f;}							
+			if(ponder && kall==0){interm_med=0.01f; intermred=0.f;intermblue=0.f;}							
+			realred = interm_med + intermred; if (realred < 0.f) realred=0.001f;
+			realblue = interm_med + intermblue; if (realblue < 0.f) realblue=0.001f;
+			const float noisevarab_r = SQR(realred);
+			const float noisevarab_b = SQR(realblue);
+		//	printf("Ch=%f red=%f blu=%f\n",interm_med, intermred,intermblue );
+
+			//input L channel
+			array2D<float> Lin(width,height);
+			//wavelet denoised image
+			LabImage * labdn = new LabImage(width,height);
+			float* mad_LL = new float [height*width];
+			float** noisevarlum;
+					noisevarlum = new float*[height];
+						for (int i=0; i<height; i++)
+						noisevarlum[i] = new float[width];
+
+			float* mad_aa = new float [height*width];
+			float* mad_bb = new float [height*width];
+			float** noisevarchrom;
+					noisevarchrom = new float*[height];
+						for (int i=0; i<height; i++)
+						noisevarchrom[i] = new float[width];
+
+			//residual between input and denoised L channel
+			array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
+			//pixel weight
+			array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
+			// init luma noisevarL
+			float noiseluma=(float) dnparams.luma;
+			if(perf && noiseLCurve)
+				noiseluma += 1.f;
+			float noisevarL	 = (float) (SQR((noiseluma/125.0)*(1.+ noiseluma/25.0)));
+		//	printf("nova=%f\n",noisevarL);
+			//TODO: implement using AlignedBufferMP
+			//fill tile from image; convert RGB to "luma/chroma"
+			if (isRAW) {//image is raw; use channel differences for chroma channels
+
+				if(!perf){//lab mode
+						//modification Jacques feb 2013 and july 2014
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+					int i1 = i - tiletop;
+					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+						int j1 = j - tileleft;
+						float R_ = gain*src->r(i,j);
+						float G_ = gain*src->g(i,j);
+						float B_ = gain*src->b(i,j);
+
+						//modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10
+						//we can put other as gamma g=2.6 slope=11, etc.
+						// but noting to do with real gamma !!!: it's only for data Lab # data RGB
+						//finally I opted fot gamma55 and with options we can change
+						if (gamlab == 0) {// options 12/2013
 							R_ = Color::igammatab_26_11[R_];
 							G_ = Color::igammatab_26_11[G_];
 							B_ = Color::igammatab_26_11[B_];
-							}
-							else if (gamlab == 1) {
-							//other new gamma 4 5 
+						}
+						else if (gamlab == 1) {
+						//other new gamma 4 5 
 							R_ = Color::igammatab_4[R_];
 							G_ = Color::igammatab_4[G_];
 							B_ = Color::igammatab_4[B_];
-							}
-							else if (gamlab == 2) {							
-							//new gamma 5.5 10 better for detail luminance..it is a compromise...which depends on the image (distribution BL, ML, HL ...)
+						}
+						else if (gamlab == 2) {							
+						//new gamma 5.5 10 better for detail luminance..it is a compromise...which depends on the image (distribution BL, ML, HL ...)
 							R_ = Color::igammatab_55[R_];
 							G_ = Color::igammatab_55[G_];
 							B_ = Color::igammatab_55[B_];
-							}
-							//apply gamma noise	standard (slider)
-							R_ = R_<65535.0f ? gamcurve[R_] : (Color::gamman((double)R_/65535.0, gam)*32768.0f);
-							G_ = G_<65535.0f ? gamcurve[G_] : (Color::gamman((double)G_/65535.0, gam)*32768.0f);
-							B_ = B_<65535.0f ? gamcurve[B_] : (Color::gamman((double)B_/65535.0, gam)*32768.0f);
-							//true conversion xyz=>Lab
-							float L,a,b;
-							float X,Y,Z;
-							Color::rgbxyz(R_,G_,B_,X,Y,Z,wp);
+						}
+						//apply gamma noise	standard (slider)
+						R_ = R_<65535.0f ? gamcurve[R_] : (Color::gammanf(R_/65535.f, gam)*32768.0f);
+						G_ = G_<65535.0f ? gamcurve[G_] : (Color::gammanf(G_/65535.f, gam)*32768.0f);
+						B_ = B_<65535.0f ? gamcurve[B_] : (Color::gammanf(B_/65535.f, gam)*32768.0f);
+						//true conversion xyz=>Lab
+						float L,a,b;
+						float X,Y,Z;
+						Color::rgbxyz(R_,G_,B_,X,Y,Z,wp);
 
-							//convert to Lab
-							Color::XYZ2Lab(X, Y, Z, L, a, b);
-							float noiseluma=(float) dnparams.luma;	
-							
-							if(noiseLCurve) {
-								float kN=Llum;//with no gamma and take into account working profile
-								float epsi=0.01f;
-								if(kN<2.f) kN=2.f;//avoid divided by zero
-								if(kN>32768.f) kN=32768.f;	// not strictly necessary							
-								float kinterm=epsi+ noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-								noisevarlum[i1][j1]= SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								float kN=Llum;//with no gamma and take into account working profile
-								if(kN<2.f) kN=2.f;//avoid divided by zero
-								if(kN>32768.f) kN=32768.f;	// not strictly necessary							
-								
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + ponderCC*4.f*noiseCCurve[(cN/30000.f)*500.f];//C=f(C)
-								noisevarchrom[i1][j1]= max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							//	printf("NC=%f ",noisevarchrom[i1][j1]);
-							}
-
-							//end chroma
-							
-							labdn->L[i1][j1] = L;
-							labdn->a[i1][j1] = a;
-							labdn->b[i1][j1] = b;
+						//convert to Lab
+						Color::XYZ2Lab(X, Y, Z, L, a, b);
 						
-							Lin[i1][j1] = L;
-//							totwt[i1][j1] = 0;
+						if(noiseLCurve) {
+							float kN = lumcalc[i][j]; //with no gamma and take into account working profile
+							float epsi = 0.01f;
+							if(kN<2.f)
+								kN = 2.f;//avoid divided by zero
+							if(kN>32768.f)
+								kN = 32768.f;	// not strictly necessary							
+							float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f];
+							float ki = kinterm*100.f;
+							ki += noiseluma;
+							noisevarlum[i1][j1] = SQR((ki/125.f)*(1.f+ki/25.f));
 						}
-					}
-					}
-					else {//RGB mode
+						if(noiseCCurve) {
+							float aN = acalc[i][j];
+							float bN = bcalc[i][j];
+							float cN = sqrtf(SQR(aN)+SQR(bN));
+							if(cN < 100.f)
+								cN = 100.f;//avoid divided by zero
+							float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];//C=f(C)
+							noisevarchrom[i1][j1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
+						}
+						//end chroma
+
+						labdn->L[i1][j1] = L;
+						labdn->a[i1][j1] = a;
+						labdn->b[i1][j1] = b;
 					
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
-							int j1 = j - tileleft;
-
-							float X = gain*src->r(i,j);
-							float Y = gain*src->g(i,j);
-							float Z = gain*src->b(i,j);
-							//conversion colorspace to determine luminance with no gamma
-							float Llum,alum,blum;
-							if(noiseLCurve) {
-								Llum=lumcalc[i][j];
-							}
-							if(noiseCCurve) {
-								alum=acalc[i][j];
-								blum=bcalc[i][j];
-							}
-							
-							X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							float noiseluma=(float) dnparams.luma;	
-							if(noiseLCurve) {
-							//	float noiseluma=(float) dnparams.luma;	
-								float kN=Llum;
-								float epsi=0.01f;
-								if(kN<2.f) kN=2.f;
-								if(kN>32768.f) kN=32768.f;
-								float kinterm=epsi + noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-								noisevarlum[i1][j1]= SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + ponderCC*4.f*noiseCCurve[(cN/30000.f)*500.f];
-								noisevarchrom[i1][j1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							}
-							//end chroma					
-							
-							labdn->L[i1][j1] = Y;
-							labdn->a[i1][j1] = (X-Y);
-							labdn->b[i1][j1] = (Y-Z);
-
-//							Ldetail[i1][j1] = 0;
-							Lin[i1][j1] = Y;
-//							totwt[i1][j1] = 0;
-						}
-					}
-
-					}
-					
-				} else {//image is not raw; use Lab parametrization
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
-							int j1 = j - tileleft;
-							float L,a,b;
-							float rLum=src->r(i,j) ;//for luminance denoise curve
-							float gLum=src->g(i,j) ;
-							float bLum=src->b(i,j) ;
-							
-							//use gamma sRGB, not good if TIF (JPG) Output profil not with gamma sRGB  (eg : gamma =1.0, or 1.8...)
-							//very difficult to solve !
-							// solution ==> save TIF with gamma sRGB and re open
-							float rtmp = Color::igammatab_srgb[ src->r(i,j) ];
-							float gtmp = Color::igammatab_srgb[ src->g(i,j) ];
-							float btmp = Color::igammatab_srgb[ src->b(i,j) ];
-							//modification Jacques feb 2013
-							// gamma slider different from raw
-							rtmp = rtmp<65535.0f ? gamcurve[rtmp] : (Color::gamman((double)rtmp/65535.0, gam)*32768.0f);
-							gtmp = gtmp<65535.0f ? gamcurve[gtmp] : (Color::gamman((double)gtmp/65535.0, gam)*32768.0f);
-							btmp = btmp<65535.0f ? gamcurve[btmp] : (Color::gamman((double)btmp/65535.0, gam)*32768.0f);
-							
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-
-
-							float X,Y,Z;
-							Color::rgbxyz(rtmp,gtmp,btmp,X,Y,Z,wp);
-
-							//convert Lab
-							Color::XYZ2Lab(X, Y, Z, L, a, b);
-							float Llum,alum,blum;
-							if(noiseLCurve || noiseCCurve) {
-								float XL,YL,ZL;
-								Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
-								Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
-							}
-							float noiseluma=(float) dnparams.luma;	
-							
-							if(noiseLCurve) {
-							//	float noiseluma=(float) dnparams.luma;	
-								float kN=Llum;
-								float epsi=0.01f;
-
-								if(kN<2.f) kN=2.f;
-								if(kN>32768.f) kN=32768.f;								
-								float kinterm=epsi + noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-							//	noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));								
-								noisevarlum[i1][j1]=SQR((ki/125.f)*(1.f+ki/25.f));
-							}	
-							noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));								
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + ponderCC*4.f*noiseCCurve[(cN/30000.f)*500.f];
-								noisevarchrom[i1][j1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
-							}
-							
-							labdn->L[i1][j1] = L;
-							labdn->a[i1][j1] = a;
-							labdn->b[i1][j1] = b;
-
-//							Ldetail[i1][j1] = 0;
-							Lin[i1][j1] = L;
-
-//							totwt[i1][j1] = 0;
-						}
+						Lin[i1][j1] = L;
 					}
 				}
+				}
+				else {//RGB mode
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+					int i1 = i - tiletop;
+					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+						int j1 = j - tileleft;
+
+						float X = gain*src->r(i,j);
+						float Y = gain*src->g(i,j);
+						float Z = gain*src->b(i,j);
+						//conversion colorspace to determine luminance with no gamma
+						X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+						Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+						Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+						if(noiseLCurve) {
+						//	float noiseluma=(float) dnparams.luma;	
+							float kN = lumcalc[i][j];
+							float epsi = 0.01f;
+							if (kN<2.f) 
+								kN = 2.f;
+							if (kN>32768.f)
+								kN = 32768.f;
+							float kinterm = epsi + noiseLCurve[xdivf(kN,15)*500.f];
+							float ki = kinterm*100.f;
+							ki += noiseluma;
+							noisevarlum[i1][j1] = SQR((ki/125.f)*(1.f+ki/25.f));
+						}
+						if(noiseCCurve) {
+							float aN = acalc[i][j];
+							float bN = bcalc[i][j];
+							
+							float cN = sqrtf(SQR(aN)+SQR(bN));
+							if(cN < 100.f)
+								cN = 100.f;//avoid divided by zero
+							float Cinterm = 1.f + ponderCC*4.f*noiseCCurve[cN/60.f];
+							noisevarchrom[i1][j1] = max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
+						}
+						//end chroma					
+						labdn->L[i1][j1] = Y;
+						labdn->a[i1][j1] = (X-Y);
+						labdn->b[i1][j1] = (Y-Z);
+
+						Lin[i1][j1] = Y;
+					}
+				}
+
+				}
+				
+			} else {//image is not raw; use Lab parametrization
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+					int i1 = i - tiletop;
+					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+						int j1 = j - tileleft;
+						float L,a,b;
+						float rLum=src->r(i,j) ;//for luminance denoise curve
+						float gLum=src->g(i,j) ;
+						float bLum=src->b(i,j) ;
+						
+						//use gamma sRGB, not good if TIF (JPG) Output profil not with gamma sRGB  (eg : gamma =1.0, or 1.8...)
+						//very difficult to solve !
+						// solution ==> save TIF with gamma sRGB and re open
+						float rtmp = Color::igammatab_srgb[ src->r(i,j) ];
+						float gtmp = Color::igammatab_srgb[ src->g(i,j) ];
+						float btmp = Color::igammatab_srgb[ src->b(i,j) ];
+						//modification Jacques feb 2013
+						// gamma slider different from raw
+						rtmp = rtmp<65535.0f ? gamcurve[rtmp] : (Color::gamman((double)rtmp/65535.0, gam)*32768.0f);
+						gtmp = gtmp<65535.0f ? gamcurve[gtmp] : (Color::gamman((double)gtmp/65535.0, gam)*32768.0f);
+						btmp = btmp<65535.0f ? gamcurve[btmp] : (Color::gamman((double)btmp/65535.0, gam)*32768.0f);
+
+						float X,Y,Z;
+						Color::rgbxyz(rtmp,gtmp,btmp,X,Y,Z,wp);
+
+						//convert Lab
+						Color::XYZ2Lab(X, Y, Z, L, a, b);
+						float Llum,alum,blum;
+						if(noiseLCurve || noiseCCurve) {
+							float XL,YL,ZL;
+							Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
+							Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
+						}
+				
+						if(noiseLCurve) {
+							float kN = Llum;
+							float epsi=0.01f;
+
+							if(kN<2.f) kN=2.f;
+							if(kN>32768.f) kN=32768.f;								
+							float kinterm=epsi + noiseLCurve[xdivf(kN,15)*500.f];
+							float ki=kinterm*100.f;
+							ki+=noiseluma;
+							noiseluma += 1.f;
+							noisevarlum[i1][j1]=SQR((ki/125.f)*(1.f+ki/25.f));
+						}	
+						if(noiseCCurve) {
+							float aN=alum;
+							float bN=blum;
+							float cN=sqrtf(SQR(aN)+SQR(bN));
+							if(cN < 100.f)
+								cN=100.f;//avoid divided by zero ???
+							float Cinterm=1.f + ponderCC*4.f*noiseCCurve[cN/60.f];
+							noisevarchrom[i1][j1]=max(noisevarab_b,noisevarab_r)*SQR(Cinterm);
+						}
+						
+						labdn->L[i1][j1] = L;
+						labdn->a[i1][j1] = a;
+						labdn->b[i1][j1] = b;
+
+						Lin[i1][j1] = L;
+					}
+				}
+			}
 
-				//initial impulse denoise, removed in Issue 2557
+			//initial impulse denoise, removed in Issue 2557
 //				if (dnparams.luma>0.01) {
 //					impulse_nr (labdn, float(MIN(50.0,dnparams.luma))/20.0f);
 //				}
 
-				int datalen = labdn->W * labdn->H;
+			int datalen = labdn->W * labdn->H;
 
-				//now perform basic wavelet denoise
-				//last two arguments of wavelet decomposition are max number of wavelet decomposition levels;
-				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
-				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
-				//the first level only, 7 means subsample the first three levels, etc.
-			//	float noisevarL	 = (float) (SQR((dnparams.luma/125.0)*(1.+ dnparams.luma/25.0)));
-				float interm_medT= (float) dnparams.chroma/10.0;
-				bool execwavelet=true;
-				bool autoch=false;
-				if(noisevarL < 0.000007f && interm_medT < 0.05f && dnparams.median && (dnparams.methodmed=="Lab" || dnparams.methodmed=="Lonly")) execwavelet=false;//do not exec wavelet if sliders luminance and chroma are very small and median need
-				//we considered user don't want wavelet
-				if(settings->leveldnautsimpl==1 && dnparams.Cmethod!="MAN") execwavelet=true;
-				if(settings->leveldnautsimpl==0 && dnparams.C2method!="MANU") execwavelet=true;
-				if(settings->leveldnautsimpl==1 && (dnparams.Cmethod=="AUT" || dnparams.Cmethod=="PRE")) autoch=true;
-				if(settings->leveldnautsimpl==0 && dnparams.C2method=="AUTO" || dnparams.C2method=="PREV") autoch=true;
-				
-				if(execwavelet) {//gain time if user choose only median  sliders L <=1  slider chrom master < 1
-                { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
-                wavelet_decomposition* Ldecomp;
-                wavelet_decomposition* adecomp;
-                wavelet_decomposition* bdecomp;
-				int schoice=0;//shrink method
-				if(dnparams.smethod=="shal") schoice=0;
-				else if(dnparams.smethod=="shalbi") schoice=2;
+			//now perform basic wavelet denoise
+			//last two arguments of wavelet decomposition are max number of wavelet decomposition levels;
+			//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
+			//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
+			//the first level only, 7 means subsample the first three levels, etc.
+			float interm_medT = (float) dnparams.chroma/10.0;
+			bool execwavelet = true;
+			bool autoch = false;
+			if(noisevarL < 0.000007f && interm_medT < 0.05f && dnparams.median && (dnparams.methodmed=="Lab" || dnparams.methodmed=="Lonly"))
+				execwavelet=false;//do not exec wavelet if sliders luminance and chroma are very small and median need
+			//we considered user don't want wavelet
+			if(settings->leveldnautsimpl==1 && dnparams.Cmethod!="MAN")
+				execwavelet=true;
+			if(settings->leveldnautsimpl==0 && dnparams.C2method!="MANU")
+				execwavelet=true;
+			if(settings->leveldnautsimpl==1 && (dnparams.Cmethod=="AUT" || dnparams.Cmethod=="PRE"))
+				autoch=true;
+			if(settings->leveldnautsimpl==0 && dnparams.C2method=="AUTO" || dnparams.C2method=="PREV")
+				autoch=true;
+			
+			if(execwavelet) {//gain time if user choose only median  sliders L <=1  slider chrom master < 1
+			{ // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
+			wavelet_decomposition* Ldecomp;
+			wavelet_decomposition* adecomp;
+			wavelet_decomposition* bdecomp;
+			int schoice=0;//shrink method
+			if(dnparams.smethod=="shal") schoice=0;
+			else if(dnparams.smethod=="shalbi") schoice=2;
 
-				int levwav=5;
-				float maxreal = max(realred, realblue);
-				//increase the level of wavelet if user increase much or very much sliders
-				if( maxreal < 8.f) levwav=5;
-				else if( maxreal < 10.f)levwav=6;
-				else if( maxreal < 15.f)levwav=7;
-				else levwav=8;//maximum ==> I have increase Maxlevel in cplx_wavelet_dec.h from 8 to 9
-				if(schoice==2) levwav+=settings->nrwavlevel;//increase level for enhanced mode
-				if(levwav>8) levwav=8;
-				
-				
-				
-				//	if (settings->verbose) printf("levwavelet=%i  noisevarA=%f noisevarB=%f \n",levwav, noisevarab_r, noisevarab_b );
-                Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 0/*subsampling*/ );
-                adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 );
-                bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
-				if(schoice==0) WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch);//enhance mode
-				if(schoice==2) {
-					WaveletDenoiseAll_BiShrink(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch);//enhance mode
-					WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut ,redaut, blueaut, maxredaut, maxblueaut, schoice, autoch);
-					}
-				float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
-				//kall=0 call by Dcrop
-				resid=0.f;residred=0.f;residblue=0.f;
-				
-				if(kall==0) Noise_residual(*Ldecomp, *adecomp, *bdecomp,  width, height, chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue, resid, residblue, residred, maxredresid, maxblueresid, nbresid);
-				//printf("NoiRESID=%3.1f maxR=%3.1f maxB=%3.1f red=%3.1f  blue=%3.1f\n",chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue);	
-				nresi=chresid;
-				highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
-				Ldecomp->reconstruct(labdn->data);
-				delete Ldecomp;
-				adecomp->reconstruct(labdn->data+datalen);
-				delete adecomp;
-				bdecomp->reconstruct(labdn->data+2*datalen);
-				delete bdecomp;
-                }
+			int levwav=5;
+			float maxreal = max(realred, realblue);
+			//increase the level of wavelet if user increase much or very much sliders
+			if( maxreal < 8.f) levwav=5;
+			else if( maxreal < 10.f)levwav=6;
+			else if( maxreal < 15.f)levwav=7;
+			else levwav=8;//maximum ==> I have increase Maxlevel in cplx_wavelet_dec.h from 8 to 9
+			if(schoice==2) levwav+=settings->nrwavlevel;//increase level for enhanced mode
+			if(levwav>8) levwav=8;
+			
+			//	if (settings->verbose) printf("levwavelet=%i  noisevarA=%f noisevarB=%f \n",levwav, noisevarab_r, noisevarab_b );
+#ifdef _OPENMP
+#pragma omp parallel sections num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+{
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 0/*subsampling*/ );
+}
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{			// only one section for both, because they are much faster then Ldecomp
+			adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 );
+			bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
+}
+}
+			if(schoice==0) WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );//enhance mode
+			if(schoice==2) {
+				WaveletDenoiseAll_BiShrink(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );//enhance mode
+				WaveletDenoiseAll(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom, width, height, mad_LL,  mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut ,redaut, blueaut, maxredaut, maxblueaut, schoice, autoch, perf );
 				}
-				//TODO: at this point wavelet coefficients storage can be freed
-				//Issue 1680: Done now
+			float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
+			//kall=0 call by Dcrop
+			resid=0.f;residred=0.f;residblue=0.f;
+			if(kall==0) Noise_residual(*Ldecomp, *adecomp, *bdecomp,  width, height, chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue, resid, residblue, residred, maxredresid, maxblueresid, nbresid);
+			//printf("NoiRESID=%3.1f maxR=%3.1f maxB=%3.1f red=%3.1f  blue=%3.1f\n",chresid, chmaxredresid,chmaxblueresid, chresidred, chresidblue);	
+			nresi=chresid;
+			highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
+#ifdef _OPENMP
+#pragma omp parallel sections num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+{
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			Ldecomp->reconstruct(labdn->data);
+			delete Ldecomp;
+}
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			adecomp->reconstruct(labdn->data+datalen);
+			delete adecomp;
+}
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			bdecomp->reconstruct(labdn->data+2*datalen);
+			delete bdecomp;
+}
+}
+			}
+			}
+			//TODO: at this point wavelet coefficients storage can be freed
+			//Issue 1680: Done now
 
-				//second impulse denoise, removed in Issue 2557
+			//second impulse denoise, removed in Issue 2557
 //				if (dnparams.luma>0.01) {
 //					impulse_nr (labdn, MIN(50.0f,(float)dnparams.luma)/20.0f);
 //				}
-				//PF_correct_RT(dst, dst, defringe.radius, defringe.threshold);
+			//PF_correct_RT(dst, dst, defringe.radius, defringe.threshold);
 
-				
-				int metchoice=0;
-				if(dnparams.methodmed=="Lonly") metchoice=1;
-				else if(dnparams.methodmed=="Lab") metchoice=2;
-				else if(dnparams.methodmed=="ab") metchoice=3;
-				else if(dnparams.methodmed=="Lpab") metchoice=4;
-				
-				//median on Luminance Lab only
-			if( (metchoice==1 || metchoice==2 || metchoice==3 || metchoice==4)   && dnparams.median) {
-			//printf("Lab et Lonly \n");
-			for(int iteration=1;iteration<=dnparams.passes;iteration++){
-			  //printf("pas=%i\n",iteration);
-					int wid=labdn->W;
-					int hei=labdn->H;
-					float** tmL;
-					tmL = new float*[hei];
-						for (int i=0; i<hei; i++)
-							tmL[i] = new float[wid];
 			
-					int methmedL=0;
-					int methmedAB=0;
-					int borderL = 1;
-					if(dnparams.medmethod=="soft")
-						{if(metchoice!=4) methmedL=methmedAB=0;
-						else {methmedL=0;methmedAB=0;}
-						}
-					else if(dnparams.medmethod=="33")
-						{if(metchoice!=4) methmedL=methmedAB=1;
-						else {methmedL=0;methmedAB=1;borderL = 1;}
-						}
-					
-					//	methmedL=methmedAB=1;
-					else if(dnparams.medmethod=="55") {
-						{if(metchoice!=4) methmedL=methmedAB=3;
-						else {methmedL=1;methmedAB=3;}
-						}
-					
-					
-					//	methmedL =methmedAB= 3;
-						borderL = 2;
-					}
-					else if(dnparams.medmethod=="55soft") {
-						{if(metchoice!=4) methmedL=methmedAB=2;
-						else {methmedL=0;methmedAB=2;}
-						}
-					
-					//	methmedL = methmedAB= 2;
-						borderL = 2;
-					}	
-					else if(dnparams.medmethod=="77") {
-						{if(metchoice!=4) methmedL=methmedAB=4;
-						else {methmedL=1;methmedAB=4;}
-						}
-					
-					//	methmedL = methmedAB=4;
-						borderL = 3;
-					}
-					else if(dnparams.medmethod=="99") {
-						{if(metchoice!=4) methmedL=methmedAB=5;
-						else {methmedL=2;methmedAB=5;}
-						}
-					
-					//	methmedL = methmedAB = 5;
-						borderL = 4;
-					}	
-					
-				if (metchoice==1 || metchoice==2 || metchoice==4)
-				{ /*printf("LONLY methmedL=%d\n", methmedL);*/
-					
-					if(methmedL < 2) {
-						for (int i=1; i<hei-1; i++) {
-							float pp[9],results[5],temp;
-							if(methmedL == 0) 
-								for (int j=1; j<wid-1; j++) {
-									med2(labdn->L[i][j] ,labdn->L[i-1][j], labdn->L[i+1][j] ,labdn->L[i][j+1],labdn->L[i][j-1], tmL[i][j]);//3x3 soft
-								}
-							else
-								for (int j=1; j<wid-1; j++) {//hybrid median filter ==> impluse denoise !I let here to get trace
-									/*	pp[0] = labdn->L[i][j-1] ;
-										pp[1] = labdn->L[i-1][j] ;
-										pp[2] = labdn->L[i][j] ;
-										pp[3] = labdn->L[i+1][j] ;
-										pp[4] = labdn->L[i][j+1] ;
-									//   Get median
-										results[0] = media(pp, 5);
-									//   Pick up x-window elements
-										pp[0] = labdn->L[i-1][j-1] ;
-										pp[1] = labdn->L[i+1][j-1] ;
-										pp[2] = labdn->L[i][j] ;
-										pp[3] = labdn->L[i-1][j+1] ;
-										pp[4] = labdn->L[i+1][j+1] ;
-										  //   Get median
-										results[1] = media(pp, 5);
-										//   Pick up leading element
-										results[2] = labdn->L[i][j] ;;
-										//   Get result
-										tmL[i][j] = media(results, 3);
-								*/
-								med3(labdn->L[i][j] ,labdn->L[i-1][j], labdn->L[i+1][j] ,labdn->L[i][j+1],labdn->L[i][j-1], labdn->L[i-1][j-1],labdn->L[i-1][j+1],labdn->L[i+1][j-1],labdn->L[i+1][j+1],tmL[i][j]);//3x3 soft
-								}
-						}
-					}
-					else {
-						for (int i=borderL; i<hei-borderL; i++) {
-							float pp[81],temp;
-						if(methmedL == 4)						
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-3;ii<=3;ii++) {
-										for (int jj=-3;jj<=3;jj++){kk++;
-										 pp[kk]=labdn->L[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,49);
-								tmL[i][j]=pp[24];//7x7
-							}
-						else if(methmedL == 5)						
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-4;ii<=4;ii++) {
-										for (int jj=-4;jj<=4;jj++){kk++;
-										 pp[kk]=labdn->L[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,81);
-								tmL[i][j]=pp[40];//9x9
-							}
-							
-						else if(methmedL == 3)	
-							for (int j=2; j<wid-2; j++) {
-								med5(labdn->L[i][j],labdn->L[i-1][j],labdn->L[i+1][j],labdn->L[i][j+1],labdn->L[i][j-1],labdn->L[i-1][j-1],labdn->L[i-1][j+1], labdn->L[i+1][j-1],labdn->L[i+1][j+1],
-								labdn->L[i-2][j],labdn->L[i+2][j],labdn->L[i][j+2],labdn->L[i][j-2],labdn->L[i-2][j-2],labdn->L[i-2][j+2],labdn->L[i+2][j-2],labdn->L[i+2][j+2],	
-								labdn->L[i-2][j+1],labdn->L[i+2][j+1],labdn->L[i-1][j+2],labdn->L[i-1][j-2],labdn->L[i-2][j-1],labdn->L[i+2][j-1],labdn->L[i+1][j+2],labdn->L[i+1][j-2],	
-								tmL[i][j]);//5x5
-							}
-						else
-							for (int j=2; j<wid-2; j++) {
-								pp[0]=labdn->L[i][j];pp[1]=labdn->L[i-1][j]; pp[2]=labdn->L[i+1][j];pp[3]=labdn->L[i][j+1];pp[4]=labdn->L[i][j-1];pp[5]=labdn->L[i-1][j-1];pp[6]=labdn->L[i-1][j+1];
-								pp[7]=labdn->L[i+1][j-1];pp[8]=labdn->L[i+1][j+1];pp[9]=labdn->L[i+2][j];pp[10]=labdn->L[i-2][j];pp[11]=labdn->L[i][j+2];pp[12]=labdn->L[i][j-2];
-								fq_sort2(pp,13);
-								tmL[i][j]=pp[6];//5x5 soft
-							}
-						}
-					}
+			int metchoice=0;
+			if(dnparams.methodmed=="Lonly") metchoice=1;
+			else if(dnparams.methodmed=="Lab") metchoice=2;
+			else if(dnparams.methodmed=="ab") metchoice=3;
+			else if(dnparams.methodmed=="Lpab") metchoice=4;
+			
+			//median on Luminance Lab only
+		if( (metchoice==1 || metchoice==2 || metchoice==3 || metchoice==4)   && dnparams.median) {
+		//printf("Lab et Lonly \n");
+			float** tmL;
+			int wid=labdn->W;
+			int hei=labdn->H;
+			tmL = new float*[hei];
+			for (int i=0; i<hei; i++)
+				tmL[i] = new float[wid];
+			int methmedL=0;
+			int methmedAB=0;
+			int borderL = 1;
+			if(dnparams.medmethod=="soft")
+				{if(metchoice!=4) methmedL=methmedAB=0;
+				else {methmedL=0;methmedAB=0;}
+				}
+			else if(dnparams.medmethod=="33")
+				{if(metchoice!=4) methmedL=methmedAB=1;
+				else {methmedL=0;methmedAB=1;borderL = 1;}
+				}
 
-					
-		
-					for(int i = borderL; i < hei-borderL; i++ ) {
-						for(int j = borderL; j < wid-borderL; j++) {
-							labdn->L[i][j] = tmL[i][j];
-						}
-		 			}
-             }    
-				if(metchoice==2   || metchoice==3 || metchoice==4) {/*printf(" AB methmedL=%d\n", methmedL);*/
-					if(methmedAB < 2) {
-						for (int i=1; i<hei-1; i++) {
-							float pp[9],temp;
-							if(methmedAB == 0) 
-								for (int j=1; j<wid-1; j++) {
-									med2(labdn->a[i][j] ,labdn->a[i-1][j], labdn->a[i+1][j] ,labdn->a[i][j+1],labdn->a[i][j-1], tmL[i][j]);//3x3 soft
-								}
-							else
-								for (int j=1; j<wid-1; j++) {
-								med3(labdn->a[i][j] ,labdn->a[i-1][j], labdn->a[i+1][j] ,labdn->a[i][j+1],labdn->a[i][j-1], labdn->a[i-1][j-1],labdn->a[i-1][j+1],labdn->a[i+1][j-1],labdn->a[i+1][j+1],tmL[i][j]);//3x3 soft
-								}
-						}
-					}
-					else {
-						for (int i=borderL; i<hei-borderL; i++) {
-							float pp[81],temp;
-						if(methmedAB == 4)						
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-3;ii<=3;ii++) {
-										for (int jj=-3;jj<=3;jj++){kk++;
-										 pp[kk]=labdn->a[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,49);
-								tmL[i][j]=pp[24];//7x7
+			//	methmedL=methmedAB=1;
+			else if(dnparams.medmethod=="55") {
+				{if(metchoice!=4) methmedL=methmedAB=3;
+				else {methmedL=1;methmedAB=3;}
+				}
+
+			//	methmedL =methmedAB= 3;
+				borderL = 2;
+			}
+			else if(dnparams.medmethod=="55soft") {
+				{if(metchoice!=4) methmedL=methmedAB=2;
+				else {methmedL=0;methmedAB=2;}
+				}
+
+			//	methmedL = methmedAB= 2;
+				borderL = 2;
+			}
+			else if(dnparams.medmethod=="77") {
+				{if(metchoice!=4) methmedL=methmedAB=4;
+				else {methmedL=1;methmedAB=4;}
+				}
+
+			//	methmedL = methmedAB=4;
+				borderL = 3;
+			}
+			else if(dnparams.medmethod=="99") {
+				{if(metchoice!=4) methmedL=methmedAB=5;
+				else {methmedL=2;methmedAB=5;}
+				}
+
+			//	methmedL = methmedAB = 5;
+				borderL = 4;
+			}
+		for(int iteration=1;iteration<=dnparams.passes;iteration++){
+		  //printf("pas=%i\n",iteration);
+
+
+
+			if (metchoice==1 || metchoice==2 || metchoice==4)
+			{ /*printf("LONLY methmedL=%d\n", methmedL);*/
+
+				if(methmedL < 2) {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+					for (int i=1; i<hei-1; i++) {
+						float pp[9],results[5],temp;
+						if(methmedL == 0) 
+							for (int j=1; j<wid-1; j++) {
+								med2(labdn->L[i][j] ,labdn->L[i-1][j], labdn->L[i+1][j] ,labdn->L[i][j+1],labdn->L[i][j-1], tmL[i][j]);//3x3 soft
 							}
-						else if(methmedAB == 5)						
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-4;ii<=4;ii++) {
-										for (int jj=-4;jj<=4;jj++){kk++;
-										 pp[kk]=labdn->a[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,81);
-								tmL[i][j]=pp[40];//9
-							}
-							
-						else if(methmedAB == 3)
-							for (int j=2; j<wid-2; j++) {						
-								med5(labdn->a[i][j],labdn->a[i-1][j],labdn->a[i+1][j],labdn->a[i][j+1],labdn->a[i][j-1],labdn->a[i-1][j-1],labdn->a[i-1][j+1], labdn->a[i+1][j-1],labdn->a[i+1][j+1],
-								labdn->a[i-2][j],labdn->a[i+2][j],labdn->a[i][j+2],labdn->a[i][j-2],labdn->a[i-2][j-2],labdn->a[i-2][j+2],labdn->a[i+2][j-2],labdn->a[i+2][j+2],	
-								labdn->a[i-2][j+1],labdn->a[i+2][j+1],labdn->a[i-1][j+2],labdn->a[i-1][j-2],labdn->a[i-2][j-1],labdn->a[i+2][j-1],labdn->a[i+1][j+2],labdn->a[i+1][j-2],	
-								tmL[i][j]);//5x5
-								}
 						else
-							for (int j=2; j<wid-2; j++) {
-								pp[0]=labdn->a[i][j];pp[1]=labdn->a[i-1][j]; pp[2]=labdn->a[i+1][j];pp[3]=labdn->a[i][j+1];pp[4]=labdn->a[i][j-1];pp[5]=labdn->a[i-1][j-1];pp[6]=labdn->a[i-1][j+1];
-								pp[7]=labdn->a[i+1][j-1];pp[8]=labdn->a[i+1][j+1];pp[9]=labdn->a[i+2][j];pp[10]=labdn->a[i-2][j];pp[11]=labdn->a[i][j+2];pp[12]=labdn->a[i][j-2];
-								fq_sort2(pp,13);
-								tmL[i][j]=pp[6];//5x5 soft
-							}
-						/*	for (int j=3; j<wid-3; j++) {
-									int kk=0;
-									for (int ii=-2;ii<=2;ii++) {
-										for (int jj=-2;jj<=2;jj++){kk++;
-										 pp[kk]=labdn->a[i+ii][j+jj];
-										}
-										pp[kk+1]=labdn->a[i-3][j-3];pp[kk+2]=labdn->a[i-3][j+3];pp[kk+3]=labdn->a[i+3][j-3];pp[kk+4]=labdn->a[i+3][j+3];
-										pp[kk+5]=labdn->a[i-3][j];pp[kk+6]=labdn->a[i+3][j];pp[kk+7]=labdn->a[i][j-3];pp[kk+8]=labdn->a[i][j+3];
-										
-									}	
-								fq_sort2(pp,33);
-								tmL[i][j]=pp[16];//7x7
-							}
+							for (int j=1; j<wid-1; j++) {//hybrid median filter ==> impluse denoise !I let here to get trace
+								/*	pp[0] = labdn->L[i][j-1] ;
+									pp[1] = labdn->L[i-1][j] ;
+									pp[2] = labdn->L[i][j] ;
+									pp[3] = labdn->L[i+1][j] ;
+									pp[4] = labdn->L[i][j+1] ;
+								//   Get median
+									results[0] = media(pp, 5);
+								//   Pick up x-window elements
+									pp[0] = labdn->L[i-1][j-1] ;
+									pp[1] = labdn->L[i+1][j-1] ;
+									pp[2] = labdn->L[i][j] ;
+									pp[3] = labdn->L[i-1][j+1] ;
+									pp[4] = labdn->L[i+1][j+1] ;
+									  //   Get median
+									results[1] = media(pp, 5);
+									//   Pick up leading element
+									results[2] = labdn->L[i][j] ;;
+									//   Get result
+									tmL[i][j] = media(results, 3);
 							*/
+							med3(labdn->L[i][j] ,labdn->L[i-1][j], labdn->L[i+1][j] ,labdn->L[i][j+1],labdn->L[i][j-1], labdn->L[i-1][j-1],labdn->L[i-1][j+1],labdn->L[i+1][j-1],labdn->L[i+1][j+1],tmL[i][j]);//3x3 soft
+							}
+					}
+				}
+				else {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+					for (int i=borderL; i<hei-borderL; i++) {
+						float pp[81],temp;
+					if(methmedL == 4)						
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-3;ii<=3;ii++) {
+									for (int jj=-3;jj<=3;jj++) {
+										kk++;
+										pp[kk]=labdn->L[i+ii][j+jj];
+									}
+								}
+							fq_sort2(pp,49);
+							tmL[i][j]=pp[24];//7x7
+						}
+					else if(methmedL == 5)						
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-4;ii<=4;ii++) {
+									for (int jj=-4;jj<=4;jj++) {
+										kk++;
+										pp[kk]=labdn->L[i+ii][j+jj];
+									}
+								}
+							fq_sort2(pp,81);
+							tmL[i][j]=pp[40];//9x9
+						}
+						
+					else if(methmedL == 3)	
+						for (int j=2; j<wid-2; j++) {
+							med5(labdn->L[i][j],labdn->L[i-1][j],labdn->L[i+1][j],labdn->L[i][j+1],labdn->L[i][j-1],labdn->L[i-1][j-1],labdn->L[i-1][j+1], labdn->L[i+1][j-1],labdn->L[i+1][j+1],
+							labdn->L[i-2][j],labdn->L[i+2][j],labdn->L[i][j+2],labdn->L[i][j-2],labdn->L[i-2][j-2],labdn->L[i-2][j+2],labdn->L[i+2][j-2],labdn->L[i+2][j+2],	
+							labdn->L[i-2][j+1],labdn->L[i+2][j+1],labdn->L[i-1][j+2],labdn->L[i-1][j-2],labdn->L[i-2][j-1],labdn->L[i+2][j-1],labdn->L[i+1][j+2],labdn->L[i+1][j-2],	
+							tmL[i][j]);//5x5
+						}
+					else
+						for (int j=2; j<wid-2; j++) {
+							pp[0]=labdn->L[i][j];pp[1]=labdn->L[i-1][j]; pp[2]=labdn->L[i+1][j];pp[3]=labdn->L[i][j+1];pp[4]=labdn->L[i][j-1];pp[5]=labdn->L[i-1][j-1];pp[6]=labdn->L[i-1][j+1];
+							pp[7]=labdn->L[i+1][j-1];pp[8]=labdn->L[i+1][j+1];pp[9]=labdn->L[i+2][j];pp[10]=labdn->L[i-2][j];pp[11]=labdn->L[i][j+2];pp[12]=labdn->L[i][j-2];
+							fq_sort2(pp,13);
+							tmL[i][j]=pp[6];//5x5 soft
 						}
 					}
+				}
 
-		
-					for(int i = borderL; i < hei-borderL; i++ ) {
-						for(int j = borderL; j < wid-borderL; j++) {
-							labdn->a[i][j] = tmL[i][j];
-						}
+				
+	
+				for(int i = borderL; i < hei-borderL; i++ ) {
+					for(int j = borderL; j < wid-borderL; j++) {
+						labdn->L[i][j] = tmL[i][j];
 					}
-					
-					
+				}
+		 }    
+			if(metchoice==2   || metchoice==3 || metchoice==4) {/*printf(" AB methmedL=%d\n", methmedL);*/
+				if(methmedAB < 2) {
+					for (int i=1; i<hei-1; i++) {
+						float pp[9],temp;
+						if(methmedAB == 0) 
+							for (int j=1; j<wid-1; j++) {
+								med2(labdn->a[i][j] ,labdn->a[i-1][j], labdn->a[i+1][j] ,labdn->a[i][j+1],labdn->a[i][j-1], tmL[i][j]);//3x3 soft
+							}
+						else
+							for (int j=1; j<wid-1; j++) {
+							med3(labdn->a[i][j] ,labdn->a[i-1][j], labdn->a[i+1][j] ,labdn->a[i][j+1],labdn->a[i][j-1], labdn->a[i-1][j-1],labdn->a[i-1][j+1],labdn->a[i+1][j-1],labdn->a[i+1][j+1],tmL[i][j]);//3x3 soft
+							}
+					}
+				}
+				else {
+					for (int i=borderL; i<hei-borderL; i++) {
+						float pp[81],temp;
+					if(methmedAB == 4)						
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-3;ii<=3;ii++) {
+									for (int jj=-3;jj<=3;jj++) {
+										kk++;
+										pp[kk]=labdn->a[i+ii][j+jj];
+									}
+								}
+							fq_sort2(pp,49);
+							tmL[i][j]=pp[24];//7x7
+						}
+					else if(methmedAB == 5)						
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-4;ii<=4;ii++) {
+									for (int jj=-4;jj<=4;jj++) {
+										kk++;
+										pp[kk]=labdn->a[i+ii][j+jj];
+									}
+								}
+							fq_sort2(pp,81);
+							tmL[i][j]=pp[40];//9
+						}
+						
+					else if(methmedAB == 3)
+						for (int j=2; j<wid-2; j++) {						
+							med5(labdn->a[i][j],labdn->a[i-1][j],labdn->a[i+1][j],labdn->a[i][j+1],labdn->a[i][j-1],labdn->a[i-1][j-1],labdn->a[i-1][j+1], labdn->a[i+1][j-1],labdn->a[i+1][j+1],
+							labdn->a[i-2][j],labdn->a[i+2][j],labdn->a[i][j+2],labdn->a[i][j-2],labdn->a[i-2][j-2],labdn->a[i-2][j+2],labdn->a[i+2][j-2],labdn->a[i+2][j+2],	
+							labdn->a[i-2][j+1],labdn->a[i+2][j+1],labdn->a[i-1][j+2],labdn->a[i-1][j-2],labdn->a[i-2][j-1],labdn->a[i+2][j-1],labdn->a[i+1][j+2],labdn->a[i+1][j-2],	
+							tmL[i][j]);//5x5
+							}
+					else
+						for (int j=2; j<wid-2; j++) {
+							pp[0]=labdn->a[i][j];pp[1]=labdn->a[i-1][j]; pp[2]=labdn->a[i+1][j];pp[3]=labdn->a[i][j+1];pp[4]=labdn->a[i][j-1];pp[5]=labdn->a[i-1][j-1];pp[6]=labdn->a[i-1][j+1];
+							pp[7]=labdn->a[i+1][j-1];pp[8]=labdn->a[i+1][j+1];pp[9]=labdn->a[i+2][j];pp[10]=labdn->a[i-2][j];pp[11]=labdn->a[i][j+2];pp[12]=labdn->a[i][j-2];
+							fq_sort2(pp,13);
+							tmL[i][j]=pp[6];//5x5 soft
+						}
+					/*	for (int j=3; j<wid-3; j++) {
+								int kk=0;
+								for (int ii=-2;ii<=2;ii++) {
+									for (int jj=-2;jj<=2;jj++){kk++;
+									 pp[kk]=labdn->a[i+ii][j+jj];
+									}
+									pp[kk+1]=labdn->a[i-3][j-3];pp[kk+2]=labdn->a[i-3][j+3];pp[kk+3]=labdn->a[i+3][j-3];pp[kk+4]=labdn->a[i+3][j+3];
+									pp[kk+5]=labdn->a[i-3][j];pp[kk+6]=labdn->a[i+3][j];pp[kk+7]=labdn->a[i][j-3];pp[kk+8]=labdn->a[i][j+3];
+									
+								}	
+							fq_sort2(pp,33);
+							tmL[i][j]=pp[16];//7x7
+						}
+						*/
+					}
+				}
+
+	
+				for(int i = borderL; i < hei-borderL; i++ ) {
+					for(int j = borderL; j < wid-borderL; j++) {
+						labdn->a[i][j] = tmL[i][j];
+					}
+				}
+				
+				
 //b
-					if(methmedAB < 2) {
-						for (int i=1; i<hei-1; i++) {
-							float pp[9],temp;
-							if(methmedAB == 0) 
-								for (int j=1; j<wid-1; j++) {
-									med2(labdn->b[i][j] ,labdn->b[i-1][j], labdn->b[i+1][j] ,labdn->b[i][j+1],labdn->b[i][j-1], tmL[i][j]);//3x3 soft
-								}
-							else
-								for (int j=1; j<wid-1; j++) {
-								med3(labdn->b[i][j] ,labdn->b[i-1][j], labdn->b[i+1][j] ,labdn->b[i][j+1],labdn->b[i][j-1], labdn->b[i-1][j-1],labdn->b[i-1][j+1],labdn->b[i+1][j-1],labdn->b[i+1][j+1],tmL[i][j]);//3x3 soft
-								}
-						}
-					}
-					else {
-						for (int i=borderL; i<hei-borderL; i++) {
-							float pp[81],temp;
-						if(methmedL == 4)			
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-3;ii<=3;ii++) {
-										for (int jj=-3;jj<=3;jj++){kk++;
-										 pp[kk]=labdn->b[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,49);
-								tmL[i][j]=pp[24];//7x7
-							
-								
-							}
-						else if(methmedAB == 5)			
-							for (int j=borderL; j<wid-borderL; j++) {
-									int kk=0;
-									for (int ii=-4;ii<=4;ii++) {
-										for (int jj=-4;jj<=4;jj++){kk++;
-										 pp[kk]=labdn->b[i+ii][j+jj];
-										}
-									}	
-								fq_sort2(pp,81);
-								tmL[i][j]=pp[40];//9
-							}
-							
-						else if(methmedAB == 3)			
-							for (int j=2; j<wid-2; j++) {
-								med5(labdn->b[i][j],labdn->b[i-1][j],labdn->b[i+1][j],labdn->b[i][j+1],labdn->b[i][j-1],labdn->b[i-1][j-1],labdn->b[i-1][j+1], labdn->b[i+1][j-1],labdn->b[i+1][j+1],
-								labdn->b[i-2][j],labdn->b[i+2][j],labdn->b[i][j+2],labdn->b[i][j-2],labdn->b[i-2][j-2],labdn->b[i-2][j+2],labdn->b[i+2][j-2],labdn->b[i+2][j+2],	
-								labdn->b[i-2][j+1],labdn->b[i+2][j+1],labdn->b[i-1][j+2],labdn->b[i-1][j-2],labdn->b[i-2][j-1],labdn->b[i+2][j-1],labdn->b[i+1][j+2],labdn->b[i+1][j-2],	
-								tmL[i][j]);//5x5
-							
+				if(methmedAB < 2) {
+					for (int i=1; i<hei-1; i++) {
+						float pp[9],temp;
+						if(methmedAB == 0) 
+							for (int j=1; j<wid-1; j++) {
+								med2(labdn->b[i][j] ,labdn->b[i-1][j], labdn->b[i+1][j] ,labdn->b[i][j+1],labdn->b[i][j-1], tmL[i][j]);//3x3 soft
 							}
 						else
-							for (int j=2; j<wid-2; j++) {
-							/*for (int j=3; j<wid-3; j++) {
-									int kk=0;
-									for (int ii=-2;ii<=2;ii++) {
-										for (int jj=-2;jj<=2;jj++){kk++;
-										 pp[kk]=labdn->b[i+ii][j+jj];
-										}
-										pp[kk+1]=labdn->b[i-3][j-3];pp[kk+2]=labdn->b[i-3][j+3];pp[kk+3]=labdn->b[i+3][j-3];pp[kk+4]=labdn->b[i+3][j+3];
-										pp[kk+5]=labdn->b[i-3][j];pp[kk+6]=labdn->b[i+3][j];pp[kk+7]=labdn->b[i][j-3];pp[kk+8]=labdn->b[i][j+3];
-										
-									}	
-								fq_sort2(pp,33);
-								tmL[i][j]=pp[16];//7x7
-							*/
-							
-							
-								pp[0]=labdn->b[i][j];pp[1]=labdn->b[i-1][j]; pp[2]=labdn->b[i+1][j];pp[3]=labdn->b[i][j+1];pp[4]=labdn->b[i][j-1];pp[5]=labdn->b[i-1][j-1];pp[6]=labdn->b[i-1][j+1];
-								pp[7]=labdn->b[i+1][j-1];pp[8]=labdn->b[i+1][j+1];pp[9]=labdn->b[i+2][j];pp[10]=labdn->b[i-2][j];pp[11]=labdn->b[i][j+2];pp[12]=labdn->b[i][j-2];
-								fq_sort2(pp,13);
-								tmL[i][j]=pp[6];//5x5 soft
-							
+							for (int j=1; j<wid-1; j++) {
+							med3(labdn->b[i][j] ,labdn->b[i-1][j], labdn->b[i+1][j] ,labdn->b[i][j+1],labdn->b[i][j-1], labdn->b[i-1][j-1],labdn->b[i-1][j+1],labdn->b[i+1][j-1],labdn->b[i+1][j+1],tmL[i][j]);//3x3 soft
 							}
-						}
 					}
-
-		
-					for(int i = borderL; i < hei-borderL; i++ ) {
-						for(int j = borderL; j < wid-borderL; j++) {
-							labdn->b[i][j] = tmL[i][j];
-						}
-					}
-				}			
-			
-					for (int i=0; i<hei; i++)
-						delete [] tmL[i];
-					delete [] tmL;
-			}
-			}
-			
-				//wavelet denoised L channel
-				array2D<float> Lwavdn(width,height);
-				float * Lwavdnptr = Lwavdn;
-				memcpy (Lwavdnptr, labdn->data, width*height*sizeof(float));
-
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				// now do detail recovery using block DCT to detect
-				// patterns missed by wavelet denoise
-				// blocks are not the same thing as tiles!
-
-
-
-				// calculation for detail recovery blocks
-				const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
-				const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
-
-				//const int nrtiles = numblox_W*numblox_H;
-				// end of tiling calc
-                {
-
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				// Main detail recovery algorithm: Block loop
-                AlignedBuffer<float> pBuf(width + TS + 2*blkrad*offset);
-				for (int vblk=0; vblk<numblox_H; vblk++) {
-					//printf("vblock=%d",vblk);
-
-					int top = (vblk-blkrad)*offset;
-					float * datarow = (float*)pBuf.data +blkrad*offset;
-
-					//TODO: implement using AlignedBufferMP
-					for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
-						int row = top + i;
-						int rr = row;
-						if (row<0) {
-							rr = MIN(-row,height-1);
-						} else if (row>=height) {
-							rr = MAX(0,2*height-2-row);
-						}
-
-						for (int j=0; j<labdn->W; j++) {
-							datarow[j] = (Lin[rr][j]-Lwavdn[rr][j]);
-						}
-
-						for (int j=-blkrad*offset; j<0; j++) {
-							datarow[j] = datarow[MIN(-j,width-1)];
-						}
-						for (int j=width; j<width+TS+blkrad*offset; j++) {
-							datarow[j] = datarow[MAX(0,2*width-2-j)];
-						}//now we have a padded data row
-
-						//now fill this row of the blocks with Lab high pass data
-						//OMP here does not add speed, better handled on the outside loop
-						for (int hblk=0; hblk<numblox_W; hblk++) {
-							int left = (hblk-blkrad)*offset;
-							int indx = (hblk)*TS;//index of block in malloc
-
-							for (int j=0; j<TS; j++) {
-								Lblox[(indx + i)*TS+j]  = tilemask_in[i][j]*datarow[left+j];// luma data
-								if (top+i>=0 && top+i<height && left+j>=0 && left+j<width) {
-									totwt[top+i][left+j] += tilemask_in[i][j]*tilemask_out[i][j];
+				}
+				else {
+					for (int i=borderL; i<hei-borderL; i++) {
+						float pp[81],temp;
+					if(methmedL == 4)			
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-3;ii<=3;ii++) {
+									for (int jj=-3;jj<=3;jj++) {
+										kk++;
+										pp[kk]=labdn->b[i+ii][j+jj];
+									}
 								}
-							}
+							fq_sort2(pp,49);
+							tmL[i][j]=pp[24];//7x7
+						
+							
+						}
+					else if(methmedAB == 5)			
+						for (int j=borderL; j<wid-borderL; j++) {
+								int kk=0;
+								for (int ii=-4;ii<=4;ii++) {
+									for (int jj=-4;jj<=4;jj++) {
+										kk++;
+										pp[kk]=labdn->b[i+ii][j+jj];
+									}
+								}
+							fq_sort2(pp,81);
+							tmL[i][j]=pp[40];//9
+						}
+						
+					else if(methmedAB == 3)			
+						for (int j=2; j<wid-2; j++) {
+							med5(labdn->b[i][j],labdn->b[i-1][j],labdn->b[i+1][j],labdn->b[i][j+1],labdn->b[i][j-1],labdn->b[i-1][j-1],labdn->b[i-1][j+1], labdn->b[i+1][j-1],labdn->b[i+1][j+1],
+							labdn->b[i-2][j],labdn->b[i+2][j],labdn->b[i][j+2],labdn->b[i][j-2],labdn->b[i-2][j-2],labdn->b[i-2][j+2],labdn->b[i+2][j-2],labdn->b[i+2][j+2],	
+							labdn->b[i-2][j+1],labdn->b[i+2][j+1],labdn->b[i-1][j+2],labdn->b[i-1][j-2],labdn->b[i-2][j-1],labdn->b[i+2][j-1],labdn->b[i+1][j+2],labdn->b[i+1][j-2],	
+							tmL[i][j]);//5x5
+						
+						}
+					else
+						for (int j=2; j<wid-2; j++) {
+						/*for (int j=3; j<wid-3; j++) {
+								int kk=0;
+								for (int ii=-2;ii<=2;ii++) {
+									for (int jj=-2;jj<=2;jj++){kk++;
+									 pp[kk]=labdn->b[i+ii][j+jj];
+									}
+									pp[kk+1]=labdn->b[i-3][j-3];pp[kk+2]=labdn->b[i-3][j+3];pp[kk+3]=labdn->b[i+3][j-3];pp[kk+4]=labdn->b[i+3][j+3];
+									pp[kk+5]=labdn->b[i-3][j];pp[kk+6]=labdn->b[i+3][j];pp[kk+7]=labdn->b[i][j-3];pp[kk+8]=labdn->b[i][j+3];
+									
+								}	
+							fq_sort2(pp,33);
+							tmL[i][j]=pp[16];//7x7
+						*/
+						
+						
+							pp[0]=labdn->b[i][j];pp[1]=labdn->b[i-1][j]; pp[2]=labdn->b[i+1][j];pp[3]=labdn->b[i][j+1];pp[4]=labdn->b[i][j-1];pp[5]=labdn->b[i-1][j-1];pp[6]=labdn->b[i-1][j+1];
+							pp[7]=labdn->b[i+1][j-1];pp[8]=labdn->b[i+1][j+1];pp[9]=labdn->b[i+2][j];pp[10]=labdn->b[i-2][j];pp[11]=labdn->b[i][j+2];pp[12]=labdn->b[i][j-2];
+							fq_sort2(pp,13);
+							tmL[i][j]=pp[6];//5x5 soft
+						
 						}
-					}//end of filling block row
-
-					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					//fftwf_print_plan (plan_forward_blox);
-					if(numblox_W == max_numblox_W)
-                        fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox);		// DCT an entire row of tiles
-                    else
-                        fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox);		// DCT an entire row of tiles
-					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-					// now process the vblk row of blocks for noise reduction
-					for (int hblk=0; hblk<numblox_W; hblk++) {
-
-						RGBtile_denoise (fLblox, hblk, noisevar_Ldetail, nbrwt, blurbuffer );
-
-					}//end of horizontal block loop
-
-					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-					//now perform inverse FT of an entire row of blocks
-					if(numblox_W == max_numblox_W)
-                        fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox);	//for DCT
-                    else
-                        fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox);	//for DCT
-
-					int topproc = (vblk-blkrad)*offset;
-
-					//add row of blocks to output image tile
-					RGBoutput_tile_row (Lblox, Ldetail, tilemask_out, height, width, topproc );
-
-					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-				}//end of vertical block loop
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-                }
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				for (int i=0; i<height; i++) {
-					for (int j=0; j<width; j++) {
-						//may want to include masking threshold for large hipass data to preserve edges/detail
-						float hpdn = Ldetail[i][j]/totwt[i][j];//note that labdn initially stores the denoised hipass data
-
-						labdn->L[i][j] = Lwavdn[i][j] + hpdn;
-
 					}
 				}
 
+				for(int i = borderL; i < hei-borderL; i++ ) {
+					for(int j = borderL; j < wid-borderL; j++) {
+						labdn->b[i][j] = tmL[i][j];
+					}
+				}
+			}			
+
+		}
+			for (int i=0; i<hei; i++)
+				delete [] tmL[i];
+			delete [] tmL;
+		}
+
+			//wavelet denoised L channel
+			array2D<float> Lwavdn(width,height);
+			float * Lwavdnptr = Lwavdn;
+			memcpy (Lwavdnptr, labdn->data, width*height*sizeof(float));
+
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			// now do detail recovery using block DCT to detect
+			// patterns missed by wavelet denoise
+			// blocks are not the same thing as tiles!
+
+
+
+			// calculation for detail recovery blocks
+			const int numblox_W = ceil(((float)(width))/(offset))+2*blkrad;
+			const int numblox_H = ceil(((float)(height))/(offset))+2*blkrad;
+
+			//const int nrtiles = numblox_W*numblox_H;
+			// end of tiling calc
+			{
+
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			// Main detail recovery algorithm: Block loop
+				//DCT block data storage
+
+
+#ifdef _OPENMP
+			int masterThread = omp_get_thread_num();
+#endif
+#ifdef _OPENMP
+#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+{
+#ifdef _OPENMP
+			int subThread = masterThread * denoiseNestedLevels + omp_get_thread_num();
+#else
+			int subThread = 0;
+#endif
+			AlignedBuffer<float> pBuf(width + TS + 2*blkrad*offset);
+			float * nbrwt = nbrwtArray[subThread];
+			float * Lblox = LbloxArray[subThread];
+			float * fLblox = fLbloxArray[subThread];;
+			float * blurbuffer = new float[TS*TS];
+#ifdef _OPENMP
+#pragma omp for
+#endif
+			for (int vblk=0; vblk<numblox_H; vblk++) {
+				//printf("vblock=%d",vblk);
+
+				int top = (vblk-blkrad)*offset;
+				float * datarow = (float*)pBuf.data +blkrad*offset;
+
+				//TODO: implement using AlignedBufferMP
+				for (int i=0/*, row=top*/; i<TS; i++/*, row++*/) {
+					int row = top + i;
+					int rr = row;
+					if (row<0) {
+						rr = MIN(-row,height-1);
+					} else if (row>=height) {
+						rr = MAX(0,2*height-2-row);
+					}
+
+					for (int j=0; j<labdn->W; j++) {
+						datarow[j] = (Lin[rr][j]-Lwavdn[rr][j]);
+					}
+
+					for (int j=-blkrad*offset; j<0; j++) {
+						datarow[j] = datarow[MIN(-j,width-1)];
+					}
+					for (int j=width; j<width+TS+blkrad*offset; j++) {
+						datarow[j] = datarow[MAX(0,2*width-2-j)];
+					}//now we have a padded data row
+
+					//now fill this row of the blocks with Lab high pass data
+					//OMP here does not add speed, better handled on the outside loop
+					for (int hblk=0; hblk<numblox_W; hblk++) {
+						int left = (hblk-blkrad)*offset;
+						int indx = (hblk)*TS;//index of block in malloc
+
+						for (int j=0; j<TS; j++) {
+							Lblox[(indx + i)*TS+j]  = tilemask_in[i][j]*datarow[left+j];// luma data
+							if (top+i>=0 && top+i<height && left+j>=0 && left+j<width) {
+								totwt[top+i][left+j] += tilemask_in[i][j]*tilemask_out[i][j];
+							}
+						}
+					}
+				}//end of filling block row
+
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-				// transform denoised "Lab" to output RGB
+				//fftwf_print_plan (plan_forward_blox);
+				if(numblox_W == max_numblox_W)
+					fftwf_execute_r2r(plan_forward_blox[0],Lblox,fLblox);		// DCT an entire row of tiles
+				else
+					fftwf_execute_r2r(plan_forward_blox[1],Lblox,fLblox);		// DCT an entire row of tiles
+				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+				// now process the vblk row of blocks for noise reduction
 
-				//calculate mask for feathering output tile overlaps
-				float * Vmask = new float [height+1];
-				float * Hmask = new float [width+1];
+
+				for (int hblk=0; hblk<numblox_W; hblk++) {
+					RGBtile_denoise (fLblox, hblk, noisevar_Ldetail, nbrwt, blurbuffer );
+				}//end of horizontal block loop
+
+				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-				for (int i=0; i<height; i++) {
-					Vmask[i] = 1;
-				}
+				//now perform inverse FT of an entire row of blocks
+				if(numblox_W == max_numblox_W)
+					fftwf_execute_r2r(plan_backward_blox[0],fLblox,Lblox);	//for DCT
+				else
+					fftwf_execute_r2r(plan_backward_blox[1],fLblox,Lblox);	//for DCT
+
+				int topproc = (vblk-blkrad)*offset;
+
+				//add row of blocks to output image tile
+				RGBoutput_tile_row (Lblox, Ldetail, tilemask_out, height, width, topproc );
+
+				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+			}//end of vertical block loop
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			delete [] blurbuffer;
+
+}
+			}
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			for (int i=0; i<height; i++) {
 				for (int j=0; j<width; j++) {
-					Hmask[j] = 1;
-				}
-				for (int i=0; i<overlap; i++) {
-					float mask = SQR(sin((M_PI*i)/(2*overlap)));
-					if (tiletop>0) Vmask[i] = mask;
-					if (tilebottom<imheight) Vmask[height-i] = mask;
-					if (tileleft>0) Hmask[i] = mask;
-					if (tileright<imwidth) Hmask[width-i] = mask;
-				}
-				//convert back to RGB and write to destination array
-				if (isRAW) {
-				if(!perf) {//Lab mode
-					for (int i=tiletop; i<tilebottom; i++){
-						int i1 = i-tiletop;
-						float X,Y,Z,L,a,b;
+					//may want to include masking threshold for large hipass data to preserve edges/detail
+					float hpdn = Ldetail[i][j]/totwt[i][j];//note that labdn initially stores the denoised hipass data
 
-						for (int j=tileleft; j<tileright; j++) {
-							int j1=j-tileleft;
-							//modification Jacques feb 2013
-							//true conversion Lab==>xyz
-							L = labdn->L[i1][j1];
-							a = labdn->a[i1][j1];
-							b = labdn->b[i1][j1];
-							float c_h=sqrt(SQR(a)+SQR(b));
-							if(c_h>3000.f){
-								a*=1.f + Qhigh*realred/100.f;
-								b*=1.f + Qhigh*realblue/100.f;
-							}						
-							//convert XYZ
-							Color::Lab2XYZ(L, a, b, X, Y, Z);
-							//apply inverse gamma noise
-							float r_,g_,b_;
-							Color::xyz2rgb(X,Y,Z,r_,g_,b_,wip);
-							//inverse gamma standard (slider)
-							r_ = r_<32768.0f ? igamcurve[r_] : (Color::gamman((float)r_/32768.0f, igam) * 65535.0f);
-							g_ = g_<32768.0f ? igamcurve[g_] : (Color::gamman((float)g_/32768.0f, igam) * 65535.0f);
-							b_ = b_<32768.0f ? igamcurve[b_] : (Color::gamman((float)b_/32768.0f, igam) * 65535.0f);
-							//readapt arbitrary gamma (inverse from beginning)
-							if (gamlab == 0) {
+					labdn->L[i][j] = Lwavdn[i][j] + hpdn;
+
+				}
+			}
+
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+			// transform denoised "Lab" to output RGB
+
+			//calculate mask for feathering output tile overlaps
+			float * Vmask = new float [height+1];
+			float * Hmask = new float [width+1];
+
+			for (int i=0; i<height; i++) {
+				Vmask[i] = 1;
+			}
+			float newGain = 1.f;
+			if(isRAW)
+				newGain = gain;
+			for (int j=0; j<width; j++) {
+				Hmask[j] = 1.f/newGain;
+			}
+				
+			for (int i=0; i<overlap; i++) {
+				float mask = SQR(sin((M_PI*i)/(2*overlap)));
+				if (tiletop>0) Vmask[i] = mask;
+				if (tilebottom<imheight) Vmask[height-i] = mask;
+				if (tileleft>0) Hmask[i] = mask/newGain;
+				if (tileright<imwidth) Hmask[width-i] = mask/newGain;
+			}
+			//convert back to RGB and write to destination array
+			if (isRAW) {
+			if(!perf) {//Lab mode
+					realred /= 100.f;
+					realblue /= 100.f;
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+				for (int i=tiletop; i<tilebottom; i++){
+					int i1 = i-tiletop;
+					float X,Y,Z,L,a,b;
+
+					for (int j=tileleft; j<tileright; j++) {
+						int j1=j-tileleft;
+						//modification Jacques feb 2013
+						//true conversion Lab==>xyz
+						L = labdn->L[i1][j1];
+						a = labdn->a[i1][j1];
+						b = labdn->b[i1][j1];
+						float c_h=SQR(a)+SQR(b);
+						if(c_h>9000000.f){
+							a *= 1.f + Qhigh*realred;
+							b *= 1.f + Qhigh*realblue;
+						}
+						//convert XYZ
+						Color::Lab2XYZ(L, a, b, X, Y, Z);
+						//apply inverse gamma noise
+						float r_,g_,b_;
+						Color::xyz2rgb(X,Y,Z,r_,g_,b_,wip);
+						//inverse gamma standard (slider)
+						r_ = r_<32768.f ? igamcurve[r_] : (Color::gammanf(r_/32768.f, igam) * 65535.f);
+						g_ = g_<32768.f ? igamcurve[g_] : (Color::gammanf(g_/32768.f, igam) * 65535.f);
+						b_ = b_<32768.f ? igamcurve[b_] : (Color::gammanf(b_/32768.f, igam) * 65535.f);
+						//readapt arbitrary gamma (inverse from beginning)
+						if (gamlab == 0) {
 							r_ = Color::gammatab_26_11[r_];
 							g_ = Color::gammatab_26_11[g_];
 							b_ = Color::gammatab_26_11[b_];
-							}
-							else if (gamlab == 1) {
+						}
+						else if (gamlab == 1) {
 							r_ = Color::gammatab_4[r_];
 							g_ = Color::gammatab_4[g_];
 							b_ = Color::gammatab_4[b_];
-							}
-							else if (gamlab == 2) {
+						}
+						else if (gamlab == 2) {
 							r_ = Color::gammatab_55[r_];
 							g_ = Color::gammatab_55[g_];
 							b_ = Color::gammatab_55[b_];
-							}
-							float factor = Vmask[i1]*Hmask[j1]/gain;
-
-							dsttmp->r(i,j) += factor*r_;
-							dsttmp->g(i,j) += factor*g_;
-							dsttmp->b(i,j) += factor*b_;
-
 						}
+						float factor = Vmask[i1]*Hmask[j1];
+
+						dsttmp->r(i,j) += factor*r_;
+						dsttmp->g(i,j) += factor*g_;
+						dsttmp->b(i,j) += factor*b_;
+
 					}
-					}
-					else {//RGB mode
-					for (int i=tiletop; i<tilebottom; i++){
-						int i1 = i-tiletop;
-						float X,Y,Z;
-						for (int j=tileleft; j<tileright; j++) {
-							int j1=j-tileleft;
-							float c_h=sqrt(SQR(labdn->a[i1][j1])+SQR(labdn->b[i1][j1]));
-							if(c_h>3000.f){
-								labdn->a[i1][j1]*=1.f + Qhigh*realred/100.f;
-								labdn->b[i1][j1]*=1.f + Qhigh*realblue/100.f;
-							}
-							Y = labdn->L[i1][j1];
-							X = (labdn->a[i1][j1]) + Y;
-							Z = Y - (labdn->b[i1][j1]);
-							
-
-							X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-							Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-							Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-
-							float factor = Vmask[i1]*Hmask[j1]/gain;
-
-							dsttmp->r(i,j) += factor*X;
-							dsttmp->g(i,j) += factor*Y;
-							dsttmp->b(i,j) += factor*Z;
-
+				}
+				}
+				else {//RGB mode
+				for (int i=tiletop; i<tilebottom; i++){
+					int i1 = i-tiletop;
+					float X,Y,Z;
+					for (int j=tileleft; j<tileright; j++) {
+						int j1=j-tileleft;
+						float c_h=sqrt(SQR(labdn->a[i1][j1])+SQR(labdn->b[i1][j1]));
+						if(c_h>3000.f){
+							labdn->a[i1][j1]*=1.f + Qhigh*realred/100.f;
+							labdn->b[i1][j1]*=1.f + Qhigh*realblue/100.f;
 						}
-					}
+						Y = labdn->L[i1][j1];
+						X = (labdn->a[i1][j1]) + Y;
+						Z = Y - (labdn->b[i1][j1]);
+						
 
-					}
-				} else {
-					for (int i=tiletop; i<tilebottom; i++){
-						int i1 = i-tiletop;
-						float X,Y,Z,L,a,b;
-						for (int j=tileleft; j<tileright; j++) {
-							int j1=j-tileleft;
-							//modification Jacques feb 2013
-							L = labdn->L[i1][j1];
-							a = labdn->a[i1][j1];
-							b = labdn->b[i1][j1];
-							float c_h=sqrt(SQR(a)+SQR(b));
-							if(c_h>3000.f){
-								a*=1.f + Qhigh*realred/100.f;
-								b*=1.f + Qhigh*realblue/100.f;
-							}
-							
-							Color::Lab2XYZ(L, a, b, X, Y, Z);
+						X = X<32768.0f ? igamcurve[X] : (Color::gamma((float)X/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
+						Y = Y<32768.0f ? igamcurve[Y] : (Color::gamma((float)Y/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
+						Z = Z<32768.0f ? igamcurve[Z] : (Color::gamma((float)Z/32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
 
-							float factor = Vmask[i1]*Hmask[j1];
-							float r_,g_,b_;
-							Color::xyz2rgb(X,Y,Z,r_,g_,b_,wip);
-							//gamma slider is different from Raw
-							r_ = r_<32768.0f ? igamcurve[r_] : (Color::gamman((float)r_/32768.0f, igam) * 65535.0f);
-							g_ = g_<32768.0f ? igamcurve[g_] : (Color::gamman((float)g_/32768.0f, igam) * 65535.0f);
-							b_ = b_<32768.0f ? igamcurve[b_] : (Color::gamman((float)b_/32768.0f, igam) * 65535.0f);
+						float factor = Vmask[i1]*Hmask[j1];
 
-							dsttmp->r(i,j) += factor*r_;
-							dsttmp->g(i,j) += factor*g_;
-							dsttmp->b(i,j) += factor*b_;
+						dsttmp->r(i,j) += factor*X;
+						dsttmp->g(i,j) += factor*Y;
+						dsttmp->b(i,j) += factor*Z;
 
-						}
 					}
 				}
 
-				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+				}
+			} else {
+				for (int i=tiletop; i<tilebottom; i++){
+					int i1 = i-tiletop;
+					float X,Y,Z,L,a,b;
+					for (int j=tileleft; j<tileright; j++) {
+						int j1=j-tileleft;
+						//modification Jacques feb 2013
+						L = labdn->L[i1][j1];
+						a = labdn->a[i1][j1];
+						b = labdn->b[i1][j1];
+						float c_h=sqrt(SQR(a)+SQR(b));
+						if(c_h>3000.f){
+							a*=1.f + Qhigh*realred/100.f;
+							b*=1.f + Qhigh*realblue/100.f;
+						}
+						
+						Color::Lab2XYZ(L, a, b, X, Y, Z);
 
-				delete labdn;
-			//	delete noiseh;
-				for (int i=0; i<height; i++)
-					delete [] noisevarlum[i];
-				delete [] noisevarlum;
-				delete [] mad_LL;
-				for (int i=0; i<height; i++)
-					delete [] noisevarchrom[i];
-				delete [] noisevarchrom;
-				delete [] mad_aa;
-				delete [] mad_bb;
-				
+						float factor = Vmask[i1]*Hmask[j1];
+						float r_,g_,b_;
+						Color::xyz2rgb(X,Y,Z,r_,g_,b_,wip);
+						//gamma slider is different from Raw
+						r_ = r_<32768.0f ? igamcurve[r_] : (Color::gamman((float)r_/32768.0f, igam) * 65535.0f);
+						g_ = g_<32768.0f ? igamcurve[g_] : (Color::gamman((float)g_/32768.0f, igam) * 65535.0f);
+						b_ = b_<32768.0f ? igamcurve[b_] : (Color::gamman((float)b_/32768.0f, igam) * 65535.0f);
 
-				delete[] Vmask;
-				delete[] Hmask;
+						dsttmp->r(i,j) += factor*r_;
+						dsttmp->g(i,j) += factor*g_;
+						dsttmp->b(i,j) += factor*b_;
+
+					}
+				}
+			}
+
+			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+			delete labdn;
+			for (int i=0; i<height; i++)
+				delete [] noisevarlum[i];
+			delete [] noisevarlum;
+			delete [] mad_LL;
+			for (int i=0; i<height; i++)
+				delete [] noisevarchrom[i];
+			delete [] noisevarchrom;
+			delete [] mad_aa;
+			delete [] mad_bb;
+
+			delete[] Vmask;
+			delete[] Hmask;
 
 
 
-			}//end of tile row
-		}//end of tile loop
+		}//end of tile row
+	}//end of tile loop
+	}
+	for(int i=0;i<denoiseNestedLevels*numthreads;i++) {
+		fftwf_free(LbloxArray[i]);
+		fftwf_free(fLbloxArray[i]);
+		delete [] nbrwtArray[i];
+	}
+
 #ifdef _OPENMP
-#pragma omp critical
-#endif
-{
-		fftwf_free ( Lblox);
-		fftwf_free ( fLblox);
-}
-	delete [] nbrwt;
-	delete [] blurbuffer;
-        }
-		//copy denoised image to output
-		memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
-
-		if (!isRAW) {//restore original image gamma
+omp_set_nested(oldNested);
+#endif
+	//copy denoised image to output
+	memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
+	if (!isRAW) {//restore original image gamma
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-			for (int i=0; i<3*dst->width*dst->height; i++) {
-				dst->data[i] = Color::gammatab_srgb[ dst->data[i] ];
-			}
+		for (int i=0; i<3*dst->width*dst->height; i++) {
+			dst->data[i] = Color::gammatab_srgb[ dst->data[i] ];
 		}
-
-		delete dsttmp;
-    // destroy the plans
-	fftwf_destroy_plan( plan_forward_blox[0] );
-	fftwf_destroy_plan( plan_backward_blox[0] );
-	fftwf_destroy_plan( plan_forward_blox[1] );
-	fftwf_destroy_plan( plan_backward_blox[1] );
-	fftwf_cleanup();
 	}
 
+	delete dsttmp;
+// destroy the plans
+fftwf_destroy_plan( plan_forward_blox[0] );
+fftwf_destroy_plan( plan_backward_blox[0] );
+fftwf_destroy_plan( plan_forward_blox[1] );
+fftwf_destroy_plan( plan_backward_blox[1] );
+fftwf_cleanup();
+}
 
-	//median 3x3 in complement on RGB
-	if(dnparams.methodmed=="RGB" && dnparams.median) {
-	//printf("RGB den\n");
-		int wid=dst->width, hei=dst->height;
-		float** tm;
-		tm = new float*[hei];
-		for (int i=0; i<hei; i++)
-			tm[i] = new float[wid];
-			
-		Imagefloat *source;
-		if (dnparams.luma==0 && dnparams.chroma==0)
-			source = dst;
-		else
-			source = src;
 
-		int methmed=0;
-		int border = 1;
-		if(dnparams.rgbmethod=="soft")
-			methmed=0;
-		else if(dnparams.rgbmethod=="33")
-			methmed=1;
-		else if(dnparams.rgbmethod=="55") {
-			methmed = 3;
-			border = 2;
-		}
-		else if(dnparams.rgbmethod=="55soft") {
-			methmed = 2;
-			border = 2;
-		}
+//median 3x3 in complement on RGB
+if(dnparams.methodmed=="RGB" && dnparams.median) {
+//printf("RGB den\n");
+	int wid=dst->width, hei=dst->height;
+	float** tm;
+	tm = new float*[hei];
+	for (int i=0; i<hei; i++)
+		tm[i] = new float[wid];
+
+	Imagefloat *source;
+	if (dnparams.luma==0 && dnparams.chroma==0)
+		source = dst;
+	else
+		source = src;
+
+	int methmed=0;
+	int border = 1;
+	if(dnparams.rgbmethod=="soft")
+		methmed=0;
+	else if(dnparams.rgbmethod=="33")
+		methmed=1;
+	else if(dnparams.rgbmethod=="55") {
+		methmed = 3;
+		border = 2;
+	}
+	else if(dnparams.rgbmethod=="55soft") {
+		methmed = 2;
+		border = 2;
+	}
 
 for(int iteration=1;iteration<=dnparams.passes;iteration++){
 
 #pragma omp parallel
 {
-		if(methmed < 2) {
+	if(methmed < 2) {
 #pragma omp for	
-			for (int i=1; i<hei-1; i++) {
-				float pp[9],temp;
-				if(methmed == 0)
-					for (int j=1; j<wid-1; j++) {
-						med2(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),tm[i][j]);//3x3 soft
-						}
-				else
-					for (int j=1; j<wid-1; j++) {
-						med3(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),source->r(i-1,j-1),source->r(i-1,j+1),source->r(i+1,j-1),source->r(i+1,j+1),tm[i][j]);//3x3
-					}
-			}
-		} else {
-#pragma omp for	
-			for (int i=2; i<hei-2; i++) {
-				float pp[25],temp;
-				if(methmed == 3)			
-					for (int j=2; j<wid-2; j++) {
-						med5(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),source->r(i-1,j-1),source->r(i-1,j+1),source->r(i+1,j-1),source->r(i+1,j+1),
-						source->r(i-2,j),source->r(i+2,j),source->r(i,j+2),source->r(i,j-2),source->r(i-2,j-2),source->r(i-2,j+2),source->r(i+2,j-2),source->r(i+2,j+2),	
-						source->r(i-2,j+1),source->r(i+2,j+1),source->r(i-1,j+2),source->r(i-1,j-2),source->r(i-2,j-1),source->r(i+2,j-1),source->r(i+1,j+2),source->r(i+1,j-2),	
-						tm[i][j]);//5x5
-					}
-				else
-					for (int j=2; j<wid-2; j++) {
-						pp[0]=source->r(i,j);pp[1]=source->r(i-1,j); pp[2]=source->r(i+1,j);pp[3]=source->r(i,j+1);pp[4]=source->r(i,j-1);pp[5]=source->r(i-1,j-1);pp[6]=source->r(i-1,j+1);
-						pp[7]=source->r(i+1,j-1);pp[8]=source->r(i+1,j+1);pp[9]=source->r(i+2,j);pp[10]=source->r(i-2,j);pp[11]=source->r(i,j+2);pp[12]=source->r(i,j-2);
-						fq_sort2(pp,13);
-						tm[i][j]=pp[6];//5x5 soft
+		for (int i=1; i<hei-1; i++) {
+			float pp[9],temp;
+			if(methmed == 0)
+				for (int j=1; j<wid-1; j++) {
+					med2(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),tm[i][j]);//3x3 soft
 					}
+			else
+				for (int j=1; j<wid-1; j++) {
+					med3(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),source->r(i-1,j-1),source->r(i-1,j+1),source->r(i+1,j-1),source->r(i+1,j+1),tm[i][j]);//3x3
 				}
 		}
+	} else {
+#pragma omp for	
+		for (int i=2; i<hei-2; i++) {
+			float pp[25],temp;
+			if(methmed == 3)			
+				for (int j=2; j<wid-2; j++) {
+					med5(source->r(i,j),source->r(i-1,j),source->r(i+1,j),source->r(i,j+1),source->r(i,j-1),source->r(i-1,j-1),source->r(i-1,j+1),source->r(i+1,j-1),source->r(i+1,j+1),
+					source->r(i-2,j),source->r(i+2,j),source->r(i,j+2),source->r(i,j-2),source->r(i-2,j-2),source->r(i-2,j+2),source->r(i+2,j-2),source->r(i+2,j+2),	
+					source->r(i-2,j+1),source->r(i+2,j+1),source->r(i-1,j+2),source->r(i-1,j-2),source->r(i-2,j-1),source->r(i+2,j-1),source->r(i+1,j+2),source->r(i+1,j-2),	
+					tm[i][j]);//5x5
+				}
+			else
+				for (int j=2; j<wid-2; j++) {
+					pp[0]=source->r(i,j);pp[1]=source->r(i-1,j); pp[2]=source->r(i+1,j);pp[3]=source->r(i,j+1);pp[4]=source->r(i,j-1);pp[5]=source->r(i-1,j-1);pp[6]=source->r(i-1,j+1);
+					pp[7]=source->r(i+1,j-1);pp[8]=source->r(i+1,j+1);pp[9]=source->r(i+2,j);pp[10]=source->r(i-2,j);pp[11]=source->r(i,j+2);pp[12]=source->r(i,j-2);
+					fq_sort2(pp,13);
+					tm[i][j]=pp[6];//5x5 soft
+				}
+			}
+	}
 #ifdef _OPENMP
 #pragma omp for nowait
 #endif
-		for(int i = border; i < hei-border; i++ ) {
-			for(int j = border; j < wid-border; j++) {
-				dst->r(i,j) = tm[i][j];
-			}
+	for(int i = border; i < hei-border; i++ ) {
+		for(int j = border; j < wid-border; j++) {
+			dst->r(i,j) = tm[i][j];
 		}
+	}
 
-		if(methmed < 2) {
+	if(methmed < 2) {
 #pragma omp for		
-			for (int i=1; i<hei-1; i++) {
-				float pp[9],temp;
-				if(methmed == 0)
-					for (int j=1; j<wid-1; j++) {
-						med2(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),tm[i][j]);
-					}
-				else 
-					for (int j=1; j<wid-1; j++) {
-						med3(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),source->b(i-1,j-1),source->b(i-1,j+1),source->b(i+1,j-1),source->b(i+1,j+1),tm[i][j]);
-					}
-			}
-		} else {
-#pragma omp for	
-			for (int i=2; i<hei-2; i++) {
-				float pp[25],temp;
-				if(methmed == 3)							
-					for (int j=2; j<wid-2; j++) {
-						med5(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),source->b(i-1,j-1),source->b(i-1,j+1),source->b(i+1,j-1),source->b(i+1,j+1),
-						source->b(i-2,j),source->b(i+2,j),source->b(i,j+2),source->b(i,j-2),source->b(i-2,j-2),source->b(i-2,j+2),source->b(i+2,j-2),source->b(i+2,j+2),	
-						source->b(i-2,j+1),source->b(i+2,j+1),source->b(i-1,j+2),source->b(i-1,j-2),source->b(i-2,j-1),source->b(i+2,j-1),source->b(i+1,j+2),source->b(i+1,j-2),	
-						tm[i][j]);//5x5
-					}
-				else
-					for (int j=2; j<wid-2; j++) {
-						pp[0]=source->b(i,j);pp[1]=source->b(i-1,j); pp[2]=source->b(i+1,j);pp[3]=source->b(i,j+1);pp[4]=source->b(i,j-1);pp[5]=source->b(i-1,j-1);pp[6]=source->b(i-1,j+1);
-						pp[7]=source->b(i+1,j-1);pp[8]=source->b(i+1,j+1);pp[9]=source->b(i+2,j);pp[10]=source->b(i-2,j);pp[11]=source->b(i,j+2);pp[12]=source->b(i,j-2);
-						fq_sort2(pp,13);
-						tm[i][j]=pp[6];//5x5 soft
-					}
-			}
+		for (int i=1; i<hei-1; i++) {
+			float pp[9],temp;
+			if(methmed == 0)
+				for (int j=1; j<wid-1; j++) {
+					med2(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),tm[i][j]);
+				}
+			else 
+				for (int j=1; j<wid-1; j++) {
+					med3(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),source->b(i-1,j-1),source->b(i-1,j+1),source->b(i+1,j-1),source->b(i+1,j+1),tm[i][j]);
+				}
 		}
+	} else {
+#pragma omp for	
+		for (int i=2; i<hei-2; i++) {
+			float pp[25],temp;
+			if(methmed == 3)							
+				for (int j=2; j<wid-2; j++) {
+					med5(source->b(i,j),source->b(i-1,j),source->b(i+1,j),source->b(i,j+1),source->b(i,j-1),source->b(i-1,j-1),source->b(i-1,j+1),source->b(i+1,j-1),source->b(i+1,j+1),
+					source->b(i-2,j),source->b(i+2,j),source->b(i,j+2),source->b(i,j-2),source->b(i-2,j-2),source->b(i-2,j+2),source->b(i+2,j-2),source->b(i+2,j+2),	
+					source->b(i-2,j+1),source->b(i+2,j+1),source->b(i-1,j+2),source->b(i-1,j-2),source->b(i-2,j-1),source->b(i+2,j-1),source->b(i+1,j+2),source->b(i+1,j-2),	
+					tm[i][j]);//5x5
+				}
+			else
+				for (int j=2; j<wid-2; j++) {
+					pp[0]=source->b(i,j);pp[1]=source->b(i-1,j); pp[2]=source->b(i+1,j);pp[3]=source->b(i,j+1);pp[4]=source->b(i,j-1);pp[5]=source->b(i-1,j-1);pp[6]=source->b(i-1,j+1);
+					pp[7]=source->b(i+1,j-1);pp[8]=source->b(i+1,j+1);pp[9]=source->b(i+2,j);pp[10]=source->b(i-2,j);pp[11]=source->b(i,j+2);pp[12]=source->b(i,j-2);
+					fq_sort2(pp,13);
+					tm[i][j]=pp[6];//5x5 soft
+				}
+		}
+	}
 
 #ifdef _OPENMP
 #pragma omp for nowait
 #endif
-		for(int i = border; i < hei-border; i++ ) {
-			for(int j = border; j < wid-border; j++) {
-				dst->b(i,j) = tm[i][j];	
-			}
+	for(int i = border; i < hei-border; i++ ) {
+		for(int j = border; j < wid-border; j++) {
+			dst->b(i,j) = tm[i][j];	
 		}
+	}
 
 
-		if(methmed < 2) {
+	if(methmed < 2) {
 #pragma omp for		
-			for (int i=1; i<hei-1; i++) {
-				float pp[9],temp;
-				if(methmed == 0)
-					for (int j=1; j<wid-1; j++) {
-						med2(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),tm[i][j]);
-					}
-				else
-					for (int j=1; j<wid-1; j++) {
-						med3(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),source->g(i-1,j-1),source->g(i-1,j+1),source->g(i+1,j-1),source->g(i+1,j+1),tm[i][j]);
-					}
-			}
-		} else {
-#pragma omp for	
-			for (int i=2; i<hei-2; i++) {
-				float pp[25],temp;
-				if(methmed == 3)											
-					for (int j=2; j<wid-2; j++) {
-						med5(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),source->g(i-1,j-1),source->g(i-1,j+1),source->g(i+1,j-1),source->g(i+1,j+1),
-						source->g(i-2,j),source->g(i+2,j),source->g(i,j+2),source->g(i,j-2),source->g(i-2,j-2),source->g(i-2,j+2),source->g(i+2,j-2),source->g(i+2,j+2),	
-						source->g(i-2,j+1),source->g(i+2,j+1),source->g(i-1,j+2),source->g(i-1,j-2),source->g(i-2,j-1),source->g(i+2,j-1),source->g(i+1,j+2),source->g(i+1,j-2),	
-						tm[i][j]);//5x5
-					}
-				else
-					for (int j=2; j<wid-2; j++) {
-						pp[0]=source->g(i,j);pp[1]=source->g(i-1,j); pp[2]=source->g(i+1,j);pp[3]=source->g(i,j+1);pp[4]=source->g(i,j-1);pp[5]=source->g(i-1,j-1);pp[6]=source->g(i-1,j+1);
-						pp[7]=source->g(i+1,j-1);pp[8]=source->g(i+1,j+1);pp[9]=source->g(i+2,j);pp[10]=source->g(i-2,j);pp[11]=source->g(i,j+2);pp[12]=source->g(i,j-2);
-						fq_sort2(pp,13);
-						tm[i][j]=pp[6];//5x5 soft
-					}
-			}
+		for (int i=1; i<hei-1; i++) {
+			float pp[9],temp;
+			if(methmed == 0)
+				for (int j=1; j<wid-1; j++) {
+					med2(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),tm[i][j]);
+				}
+			else
+				for (int j=1; j<wid-1; j++) {
+					med3(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),source->g(i-1,j-1),source->g(i-1,j+1),source->g(i+1,j-1),source->g(i+1,j+1),tm[i][j]);
+				}
 		}
+	} else {
+#pragma omp for	
+		for (int i=2; i<hei-2; i++) {
+			float pp[25],temp;
+			if(methmed == 3)											
+				for (int j=2; j<wid-2; j++) {
+					med5(source->g(i,j),source->g(i-1,j),source->g(i+1,j),source->g(i,j+1),source->g(i,j-1),source->g(i-1,j-1),source->g(i-1,j+1),source->g(i+1,j-1),source->g(i+1,j+1),
+					source->g(i-2,j),source->g(i+2,j),source->g(i,j+2),source->g(i,j-2),source->g(i-2,j-2),source->g(i-2,j+2),source->g(i+2,j-2),source->g(i+2,j+2),	
+					source->g(i-2,j+1),source->g(i+2,j+1),source->g(i-1,j+2),source->g(i-1,j-2),source->g(i-2,j-1),source->g(i+2,j-1),source->g(i+1,j+2),source->g(i+1,j-2),	
+					tm[i][j]);//5x5
+				}
+			else
+				for (int j=2; j<wid-2; j++) {
+					pp[0]=source->g(i,j);pp[1]=source->g(i-1,j); pp[2]=source->g(i+1,j);pp[3]=source->g(i,j+1);pp[4]=source->g(i,j-1);pp[5]=source->g(i-1,j-1);pp[6]=source->g(i-1,j+1);
+					pp[7]=source->g(i+1,j-1);pp[8]=source->g(i+1,j+1);pp[9]=source->g(i+2,j);pp[10]=source->g(i-2,j);pp[11]=source->g(i,j+2);pp[12]=source->g(i,j-2);
+					fq_sort2(pp,13);
+					tm[i][j]=pp[6];//5x5 soft
+				}
+		}
+	}
 
 #ifdef _OPENMP
 #pragma omp for
 #endif
-		for(int i = border; i < hei-border; i++ ) {
-			for(int j = border; j < wid-border; j++) {
-				dst->g(i,j) = tm[i][j];
-			}
+	for(int i = border; i < hei-border; i++ ) {
+		for(int j = border; j < wid-border; j++) {
+			dst->g(i,j) = tm[i][j];
 		}
+	}
 }
 }		
-                for (int i=0; i<hei; i++)
-                    delete [] tm[i];
-                delete [] tm;
-			
+			for (int i=0; i<hei; i++)
+				delete [] tm[i];
+			delete [] tm;
+		
 }		
-		//end median
-		if(noiseLCurve || noiseCCurve) {
-				for (int i=0; i<hei; i++)
-					delete [] lumcalc[i];
-				delete [] lumcalc;
-				for (int i=0; i<hei; i++)
-					delete [] acalc[i];
-				delete [] acalc;
-				for (int i=0; i<hei; i++)
-					delete [] bcalc[i];
-				delete [] bcalc;
-				
-				}
+	//end median
+	if(noiseLCurve || noiseCCurve) {
+			for (int i=0; i<hei; i++)
+				delete [] lumcalc[i];
+			delete [] lumcalc;
+			for (int i=0; i<hei; i++)
+				delete [] acalc[i];
+			delete [] acalc;
+			for (int i=0; i<hei; i++)
+				delete [] bcalc[i];
+			delete [] bcalc;
+			
+			}
 
 //#ifdef _DEBUG
 	if (settings->verbose) {
@@ -1700,7 +1752,7 @@ for(int iteration=1;iteration<=dnparams.passes;iteration++){
 	}
 //#endif
 
-	}//end of main RGB_denoise
+}//end of main RGB_denoise
 
 
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -1713,7 +1765,7 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 		int blkstart = hblproc*TS*TS;
 
 		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS, blurbuffer);//blur neighbor weights for more robust estimation	//for DCT
-		
+
 #ifdef __SSE2__
 		__m128	tempv;
 		__m128	noisevar_Ldetailv = _mm_set1_ps( noisevar_Ldetail );
@@ -1722,9 +1774,7 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 			tempv  = onev - xexpf( -SQRV( LVFU(nbrwt[n]))/noisevar_Ldetailv);
 			_mm_storeu_ps( &fLblox[blkstart+n], LVFU(fLblox[blkstart+n]) * tempv );
 		}//output neighbor averaged result
-
 #else
-#pragma omp parallel for
 		for (int n=0; n<TS*TS; n++) {		//for DCT
 			fLblox[blkstart+n] *= (1-xexpf(-SQR(nbrwt[n])/noisevar_Ldetail));
 		}//output neighbor averaged result
@@ -1748,7 +1798,7 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 		int bottom = MIN( top+TS,height);
 		int imax = bottom - top;
 
-		//add row of tiles to output image
+		//add row of tiles to output image
 		for (int hblk=0; hblk < numblox_W; hblk++) {
 			int left = (hblk-blkrad)*offset;
 			int right  = MIN(left+TS, width);
@@ -1809,66 +1859,81 @@ SSEFUNCTION	void ImProcFunctions::RGBtile_denoise (float * fLblox, int hblproc,
 
 	}
 
-	float ImProcFunctions::Mad( float *  RESTRICT DataList, int datalen, int * RESTRICT histo) {
+	float ImProcFunctions::Mad( float * DataList, const int datalen) {
 
 		//computes Median Absolute Deviation
-		//DataList values should mostly have abs val < 65535
-
-		for (int i=0; i<65536; i++)
-			histo[i]=0;
-
-		//calculate histogram of absolute values of HH wavelet coeffs
+		//DataList values should mostly have abs val < 256 because we are in Lab mode
+		int histo[256] ALIGNED64 = {0};
+		
+		//calculate histogram of absolute values of wavelet coeffs
 		for (int i=0; i<datalen; i++) {
-			histo[min(65535,abs((int)DataList[i]))]++;			
+			histo[min(255,abs((int)DataList[i]))]++;			
 		}
 
 		//find median of histogram
-		int median=0, count=0;
+		int median = 0, count = 0;
 		while (count<datalen/2) {
 			count += histo[median];
 			median++;
 		}
 
 		int count_ = count - histo[median-1];
-
-		// interpolate
+
+		// interpolate
 		return (( (median-1) + (datalen/2-count_)/((float)(count-count_)) )/0.6745);
 	}
+
+	float ImProcFunctions::MadRgb( float * DataList, const int datalen) {
+
+		//computes Median Absolute Deviation
+		//DataList values should mostly have abs val < 65536 because we are in RGB mode
+		int * histo = new int[65536];
+		for(int i=0;i<65536;i++)
+			histo[i] = 0;
+		
+		//calculate histogram of absolute values of wavelet coeffs
+		int i;
+
+		for (i=0; i<datalen; i++) {
+			histo[min(65536,abs((int)DataList[i]))]++;			
+		}
+
+		//find median of histogram
+		int median = 0, count = 0;
+		while (count<datalen/2) {
+			count += histo[median];
+			median++;
+		}
+
+		int count_ = count - histo[median-1];
+
+		// interpolate
+		delete [] histo;
+		return (( (median-1) + (datalen/2-count_)/((float)(count-count_)) )/0.6745);
+	}
+
 
 
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b,  int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid)
-
 {
 		int maxlvl = WaveletCoeffs_L.maxlevel();
 		const float eps = 0.01f;
-//		int max;
-//		float parfrac = 0.05;
-	//	float madL[8][3], mada[8][3], madb[8][3];
 
 		float madaC,madbC;
-		int * madHisto = new int[65536];
 		for (int lvl=0; lvl<maxlvl; lvl++) {
 			// compute median absolute deviation (MAD) of detail coefficients as robust noise estimator
 
-			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
-			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-			
-			if(Wlvl_L!=width) Wlvl_L=width;
-			if(Hlvl_L!=height) Hlvl_L=height;
-			
 			int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
 			int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
 
-			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 			for (int dir=1; dir<4; dir++) {
-				madaC = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				madbC = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				
-			
+				madaC = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
+				madbC = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
+
 				resid +=(madaC+madbC);
 				residred+=madaC;
 				residblue+=madbC;
@@ -1876,36 +1941,53 @@ void ImProcFunctions::Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wav
 				if(madaC >maxredresid ) maxredresid=madaC;
 				if(madbC > maxblueresid) maxblueresid=madbC;
 				nbresid++;	
-		
-				chresid=sqrt(resid/(2*nbresid));
-				chmaxredresid=sqrt(maxredresid);
-				chmaxblueresid=sqrt(maxblueresid);
-				chresidred=sqrt(residred/nbresid);
-				chresidblue=sqrt(residblue/nbresid);
-				
-				
-			}	
-			
-			
-			
+			}
+			chresid=sqrt(resid/(2*nbresid));
+			chmaxredresid=sqrt(maxredresid);
+			chmaxblueresid=sqrt(maxblueresid);
+			chresidred=sqrt(residred/nbresid);
+			chresidblue=sqrt(residblue/nbresid);
 		}
-		delete [] madHisto;
-
-
-
 }
 
 SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
-													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch)
+													 wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf)
 	{
 		int maxlvl = WaveletCoeffs_L.maxlevel();
-		const float eps = 0.01f;
-//		int max;
-//		float parfrac = 0.05;
-		float madL[8][3], mada[8][3], madb[8][3];
+		const float eps = 0.01f;
+		if(autoch && noisevar_abr <=0.001f) noisevar_abr=0.02f;
+		if(autoch && noisevar_abb <=0.001f) noisevar_abb=0.02f;
 
-		float madaC,madbC;
-		int * madHisto = new int[65536];
+		float madL[8][3], mada[8][3], madb[8][3];
+		
+		int maxWL = 0, maxHL = 0, maxWab = 0, maxHab = 0;
+		for (int lvl=0; lvl<maxlvl; lvl++) {
+			if(WaveletCoeffs_L.level_W(lvl) > maxWL)
+				maxWL = WaveletCoeffs_L.level_W(lvl);
+			if(WaveletCoeffs_L.level_H(lvl) > maxHL)
+				maxHL = WaveletCoeffs_L.level_H(lvl);
+			if(WaveletCoeffs_a.level_W(lvl) > maxWab)
+				maxWab = WaveletCoeffs_a.level_W(lvl);
+			if(WaveletCoeffs_a.level_H(lvl) > maxHab)
+				maxHab = WaveletCoeffs_a.level_H(lvl);
+		}
+
+#ifdef _OPENMP
+#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+{
+		float *buffer[6];
+		buffer[0] = new float[maxWL*maxHL+32];
+		buffer[1] = new float[maxWL*maxHL+64];
+		buffer[2] = new float[maxWL*maxHL+96];
+		buffer[3] = new float[maxWab*maxHab+128];
+		buffer[4] = new float[maxWL*maxHL+160];
+		buffer[5] = new float[maxWL*maxHL+192];
+		
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
 		for (int lvl=0; lvl<maxlvl; lvl++) {
 			// compute median absolute deviation (MAD) of detail coefficients as robust noise estimator
 
@@ -1922,29 +2004,25 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 
-			for (int dir=1; dir<4; dir++) {
-				madL[lvl][dir-1] = SQR(Mad(WavCoeffs_L[dir], Wlvl_L*Hlvl_L, madHisto));
-				mada[lvl][dir-1] = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				madb[lvl][dir-1] = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				madaC = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				madbC = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab, madHisto));
-				
+			for (int dir=1; dir<4; dir++) {
+				if(!perf) {
+					madL[lvl][dir-1] = SQR(Mad(WavCoeffs_L[dir], Wlvl_L*Hlvl_L));
+					mada[lvl][dir-1] = SQR(Mad(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
+					madb[lvl][dir-1] = SQR(Mad(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
+				} else {
+					madL[lvl][dir-1] = SQR(MadRgb(WavCoeffs_L[dir], Wlvl_L*Hlvl_L));
+					mada[lvl][dir-1] = SQR(MadRgb(WavCoeffs_a[dir], Wlvl_ab*Hlvl_ab));
+					madb[lvl][dir-1] = SQR(MadRgb(WavCoeffs_b[dir], Wlvl_ab*Hlvl_ab));
+				}
 			}	
-			
-			
-			
 		}
-		delete [] madHisto;
-
-		
 		
+#ifdef _OPENMP
+#pragma omp for
+#endif
 		for (int lvl=maxlvl-1; lvl>=0; lvl--) {//for levels less than max, use level diff to make edge mask
-			//for (int lvl=0; lvl<maxlvl; lvl++) {
-
 			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
 			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-	//		printf("w=%d WL=%d\n",width, Wlvl_L);
-	//		printf("h=%d HL=%d\n",height, Hlvl_L);
 			
 			if(Wlvl_L!=width) Wlvl_L=width;
 			if(Hlvl_L!=height) Hlvl_L=height;
@@ -1954,32 +2032,21 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 
 			int skip_L = WaveletCoeffs_L.level_stride(lvl);
 			int skip_ab = WaveletCoeffs_a.level_stride(lvl);
-//			float skip_h;
+
 			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 			int callby=1;
 			if (lvl==maxlvl-1) {
-				ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
-						  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb,noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, mada[lvl], madb[lvl], madL[lvl], true);
+				ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, buffer, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
+						  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb,noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, perf, mada[lvl], madb[lvl], madL[lvl], true);
 
 			} else {
-
-//				float ** WavPars_L = WaveletCoeffs_L.level_coeffs(lvl+1);
-				//float ** WavPars_a = WaveletCoeffs_a.level_coeffs(lvl+1);
-				//float ** WavPars_b = WaveletCoeffs_b.level_coeffs(lvl+1);
-
 				//simple wavelet shrinkage
-				float * sfave = new float[Wlvl_L*Hlvl_L];
-				float * WavCoeffsLtemp = new float[Hlvl_ab*Wlvl_ab];
-			//	float * mad_L = new float[Wlvl_L*Hlvl_L];
-
-//				array2D<float> edge(Wlvl_L,Hlvl_L);
-
-				//printf("\n level=%d  \n",lvl);
-					if(autoch && noisevar_abr <=0.001f) noisevar_abr=0.02f;
-					if(autoch && noisevar_abb <=0.001f) noisevar_abb=0.02f;
-
+				float * sfave = buffer[0]+32;
+				float * sfaved = buffer[4]+160;
+				float * WavCoeffsLtemp = buffer[3]+128;
+				float * blurBuffer = buffer[1]+64;
 				for (int dir=1; dir<4; dir++) {
 					float mad_Lr = madL[lvl][dir-1];
 					float mad_ar = noisevar_abr*mada[lvl][dir-1];
@@ -1993,34 +2060,26 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 							}//noisevarchrom	
 						}
 					}	
+
 					if (noiseCCurve  && noiseCCurve.getSum() > 5.f ){
-				//	printf("OUI\n");
 						for (int i=0; i<Hlvl_ab; i++) {
 							for (int j=0; j<Wlvl_ab; j++) {
 									mad_aa[i*Wlvl_ab+j]=mad_ar*(noisevarchrom[i][j]);
 									mad_bb[i*Wlvl_ab+j]=mad_br*(noisevarchrom[i][j]);
 							}//noisevarchrom
 						}
-					}
-
-					//float mad_Lpar = madL[lvl+1][dir-1];
-					//float mad_apar = mada[lvl+1][dir-1];
-					//float mad_bpar = mada[lvl+1][dir-1];
-
-					//float skip_ab_ratio = WaveletCoeffs_a.level_stride(lvl+1)/skip_ab;
-//					float skip_L_ratio =  WaveletCoeffs_L.level_stride(lvl+1)/skip_L;
-					
+					}
 					if (noisevar_abr>0.001f  || noisevar_abb>0.001f) {
 						for(int i=0;i<Hlvl_ab;i++)
 							for(int j=0;j<Wlvl_ab;j++)
-								WavCoeffsLtemp[i*Wlvl_ab+j] = WavCoeffs_L[dir][((i*skip_L)/skip_ab)*Wlvl_L + ((j*skip_L)/skip_ab)];
+								WavCoeffsLtemp[i*Wlvl_ab+j] = WavCoeffs_L[dir][((i*skip_L)/skip_ab)*Wlvl_L + ((j*skip_L)/skip_ab)];
+					
 #ifdef __SSE2__
 						int j;
 						__m128 onev = _mm_set1_ps(1.f);
-						__m128 mad_Lm9v = _mm_set1_ps(mad_Lr * 9.f);
-						__m128 mad_av ;//= _mm_set1_ps(mad_a);
-						__m128 mad_bv ;//= _mm_set1_ps(mad_b);
-						__m128 epsv = _mm_set1_ps(eps);
+						__m128 rmad_Lm9v = onev / _mm_set1_ps(mad_Lr * 9.f);
+						__m128 mad_av;
+						__m128 mad_bv;
 						__m128 mag_Lv, mag_av, mag_bv;
 						__m128 tempav, tempbv;
 						for (int i=0; i<Hlvl_ab; i++) {
@@ -2032,47 +2091,30 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 								tempav = LVFU(WavCoeffs_a[dir][coeffloc_ab]);
 								tempbv = LVFU(WavCoeffs_b[dir][coeffloc_ab]);
 								mag_Lv = LVFU(WavCoeffsLtemp[coeffloc_ab]);
-								mag_av = SQRV(tempav)+epsv;
-								mag_bv = SQRV(tempbv)+epsv;
-								mag_Lv = SQRV(mag_Lv) + epsv;
-								_mm_storeu_ps(&WavCoeffs_a[dir][coeffloc_ab], tempav * SQRV((onev-xexpf(-(mag_av/mad_av)-(mag_Lv/mad_Lm9v)))));
-								_mm_storeu_ps(&WavCoeffs_b[dir][coeffloc_ab], tempbv * SQRV((onev-xexpf(-(mag_bv/mad_bv)-(mag_Lv/mad_Lm9v)))));
+								mag_av = SQRV(tempav);
+								mag_bv = SQRV(tempbv);
+								mag_Lv = SQRV(mag_Lv) * rmad_Lm9v;
+								_mm_storeu_ps(&WavCoeffs_a[dir][coeffloc_ab], tempav * SQRV((onev-xexpf(-(mag_av/mad_av)-(mag_Lv)))));
+								_mm_storeu_ps(&WavCoeffs_b[dir][coeffloc_ab], tempbv * SQRV((onev-xexpf(-(mag_bv/mad_bv)-(mag_Lv)))));
 							}
 							for (; j<Wlvl_ab; j++,coeffloc_ab++) {
-								float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab ])+eps;
-								float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
-								float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
+								float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab ]);
+								float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
+								float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
 								WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
 								WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
 							}
 						}//now chrominance coefficients are denoised
 #else
-						//printf("  dir=%d  mad_L=%f		mad_a=%f		mad_b=%f	\n",dir,sqrt(mad_L),sqrt(mad_a),sqrt(mad_b));
-
 						for (int i=0; i<Hlvl_ab; i++) {
 							for (int j=0; j<Wlvl_ab; j++) {
 
 								int coeffloc_ab = i*Wlvl_ab+j;
-								//int coeffloc_abpar = (MAX(0,i-skip_ab)*Wlvl_ab+MAX(0,j-skip_ab))/skip_ab_ratio;
-
-//								int coeffloc_L	= ((i*skip_L)/skip_ab)*Wlvl_L + ((j*skip_L)/skip_ab);
 
 								float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab ])+eps;
 								float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
 								float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
 
-								//float edgefactor = 1-exp(-mag_L/(9*mad_L));// * exp(-mag_a/(4*mad_a)) * exp(-mag_b/(4*mad_b));
-
-								//float coeff_a = sqrt(SQR(WavCoeffs_a[dir][coeffloc_ab])/mad_a+SQR(parfrac*WavPars_a[dir][coeffloc_abpar])/mad_apar);
-								//float coeff_b = sqrt(SQR(WavCoeffs_b[dir][coeffloc_ab])/mad_b+SQR(parfrac*WavPars_b[dir][coeffloc_abpar])/mad_bpar);
-
-								// 'firm' threshold of chroma coefficients
-								//WavCoeffs_a[dir][coeffloc_ab] *= edgefactor*(coeff_a>2 ? 1 : (coeff_a<1 ? 0 : (coeff_a - 1)));
-								//WavCoeffs_b[dir][coeffloc_ab] *= edgefactor*(coeff_b>2 ? 1 : (coeff_b<1 ? 0 : (coeff_b - 1)));
-
-								//float satfactor_a = mad_a/(mad_a+0.5*SQR(WavCoeffs_a[0][coeffloc_ab]));
-								//float satfactor_b = mad_b/(mad_b+0.5*SQR(WavCoeffs_b[0][coeffloc_ab]));
-
 								WavCoeffs_a[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_a*/);
 								WavCoeffs_b[dir][coeffloc_ab] *= SQR(1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*mad_Lr)))/*satfactor_b*/);
 
@@ -2080,44 +2122,37 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 						}//now chrominance coefficients are denoised
 #endif
 					}
-
+
+
+					float levelFactor = mad_Lr*5.f/(lvl+1);
 					if (noisevar_L>0.00001f) {
 						if (!noiseLCurve || noiseLCurve.getSum() < 7.f ) {
 							for (int i=0; i<Hlvl_L; i++)
 								for (int j=0; j<Wlvl_L; j++) {
 									int coeffloc_L = i*Wlvl_L+j;
-									mad_LL[coeffloc_L]=mad_Lr*(noisevar_L)*5/(lvl+1);//noisevarlum
+									mad_LL[coeffloc_L] = noisevar_L*levelFactor;//noisevarlum
 								}
 							}
 						else { 	if (noiseLCurve && noiseLCurve.getSum() >= 7.f) {	
 							for (int i=0; i<Hlvl_L; i++)
 								for (int j=0; j<Wlvl_L; j++) {
 									int coeffloc_L = i*Wlvl_L+j;
-									mad_LL[coeffloc_L]=mad_Lr*(noisevarlum[i][j])*5/(lvl+1);//noisevarlum
+									mad_LL[coeffloc_L] = noisevarlum[i][j]*levelFactor;//noisevarlum
 								}
 							}
 						}
-	
-
-	
-								
-					//mad_L *= noisevar_L*5.f/(lvl+1);
-			
 #ifdef __SSE2__
 						int j;
 						__m128 mad_Lv;
-						__m128	ninev = _mm_set1_ps( 9.0f );
-						
-						__m128 mad_Lm9v;// _mm_set1_ps(mad_L * 9.f);
-						
+						__m128 ninev = _mm_set1_ps( 9.0f );
 						__m128 epsv = _mm_set1_ps(eps);
 						__m128 mag_Lv;
 						for (int i=0; i<Hlvl_L; i++) {
 							int coeffloc_L = i*Wlvl_L;
 							for (j=0; j<Wlvl_L-3; j+=4,coeffloc_L+=4) {
-								    mad_Lv =LVFU(mad_LL[coeffloc_L]);
-									mag_Lv =SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L]));
-									_mm_storeu_ps(&sfave[coeffloc_L], mag_Lv / ( mag_Lv + mad_Lv * xexpf(-mag_Lv/(mad_Lv*ninev) )+ epsv));
+								mad_Lv = LVFU(mad_LL[coeffloc_L]);
+								mag_Lv = SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L]));
+								_mm_storeu_ps(&sfave[coeffloc_L], mag_Lv / ( mag_Lv + mad_Lv * xexpf(-mag_Lv/(mad_Lv*ninev) )+ epsv));
 								
 							}	
 							for (; j<Wlvl_L; j++, coeffloc_L++) {
@@ -2126,94 +2161,89 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 							}
 						}
 #else
-
 						for (int i=0; i<Hlvl_L; i++)
 							for (int j=0; j<Wlvl_L; j++) {
 
 								int coeffloc_L = i*Wlvl_L+j;
-//								int coeffloc_Lpar = (MAX(0,i-skip_L)*Wlvl_L+MAX(0,j-skip_L))/skip_L_ratio;
-
 								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
-								//float mag_Lpar = SQR(parfrac*WavPars_L[dir][coeffloc_Lpar]);
-								//float sf_L = SQR(1-expf(-(mag_L/mad_L)-(mag_Lpar/mad_L)));
 								sfave[coeffloc_L] = mag_L/(mag_L+mad_LL[coeffloc_L]*xexpf(-mag_L/(9.f*mad_LL[coeffloc_L]))+eps);
-
-								//edge[i][j] = (WavCoeffs_L[dir][coeffloc_L] - WavPars_L[dir][coeffloc_Lpar]);
 							}
 #endif
-						//blur edge measure
-						//gaussHorizontal<float> (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false);
-						//gaussVertical<float>   (edge, edge, buffer, Wlvl_L, Hlvl_L, 1<<(lvl+1), false);
-	
-					boxblur(sfave, sfave, lvl+2, lvl+2, Wlvl_L, Hlvl_L);//increase smoothness by locally averaging shrinkage
-
+						boxblur(sfave, sfaved, blurBuffer, lvl+2, lvl+2, Wlvl_L, Hlvl_L);//increase smoothness by locally averaging shrinkage
 #ifdef __SSE2__
-						__m128 tempLv;
-						__m128 tempL2v;
+						__m128 sfavev;
 						__m128 sf_Lv;
-
 						for (int i=0; i<Hlvl_L; i++) {
 							int coeffloc_L = i*Wlvl_L;
-							for (j=0; j<Wlvl_L-3; j+=4,coeffloc_L+=4) {
-									tempLv = LVFU(WavCoeffs_L[dir][coeffloc_L]);
-									mag_Lv = SQRV(tempLv);
-									tempL2v = LVFU(sfave[coeffloc_L]);
-									sf_Lv = mag_Lv/(mag_Lv + mad_Lv*xexpf(-mag_Lv/(mad_Lv*ninev))+epsv);
-									_mm_storeu_ps(&WavCoeffs_L[dir][coeffloc_L], tempLv * (SQRV(tempL2v)+SQRV(sf_Lv))/(tempL2v+sf_Lv+epsv));
+							for (j=0; j<Wlvl_L-3; j+=4,coeffloc_L+=4) {
+								sfavev = LVFU(sfaved[coeffloc_L]);
+								sf_Lv = LVFU(sfave[coeffloc_L]);
+								_mm_storeu_ps(&WavCoeffs_L[dir][coeffloc_L], LVFU(WavCoeffs_L[dir][coeffloc_L]) * (SQRV(sfavev)+SQRV(sf_Lv))/(sfavev+sf_Lv+epsv));
 								//use smoothed shrinkage unless local shrinkage is much less
-
 							}//now luminance coeffs are denoised
 							for (; j<Wlvl_L; j++,coeffloc_L++) {
-									float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
-									float sf_L = mag_L/(mag_L + mad_LL[coeffloc_L]*xexpf(-mag_L/(9.f*mad_LL[coeffloc_L]))+eps);
-
+								float sf_L = sfave[coeffloc_L];
 								//use smoothed shrinkage unless local shrinkage is much less
-									WavCoeffs_L[dir][coeffloc_L] *= (SQR(sfave[coeffloc_L])+SQR(sf_L))/(sfave[coeffloc_L]+sf_L+eps);
-
+								WavCoeffs_L[dir][coeffloc_L] *= (SQR(sfaved[coeffloc_L])+SQR(sf_L))/(sfaved[coeffloc_L]+sf_L+eps);
 							}//now luminance coeffs are denoised
 						}
-
 #else
-
 						for (int i=0; i<Hlvl_L; i++)
 							for (int j=0; j<Wlvl_L; j++) {
-
 								int coeffloc_L = i*Wlvl_L+j;
-
-								float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L]);
-								//float sf_L = SQR(1-expf(-(mag_L/mad_L)-(mag_Lpar/mad_L)));
-
-								float edgefactor = 1;//expf(-SQR(edge[i][j])/mad_L);
-
-								float sf_L = mag_L/(mag_L + edgefactor*mad_LL[coeffloc_L]*xexpf(-mag_L/(9.f*mad_LL[coeffloc_L]))+eps);
-
+								float sf_L = sfave[coeffloc_L];
 								//use smoothed shrinkage unless local shrinkage is much less
-								WavCoeffs_L[dir][coeffloc_L] *= (SQR(edgefactor*sfave[coeffloc_L])+SQR(sf_L))/(edgefactor*sfave[coeffloc_L]+sf_L+eps);
+								WavCoeffs_L[dir][coeffloc_L] *= (SQR(sfaved[coeffloc_L])+SQR(sf_L))/(sfaved[coeffloc_L]+sf_L+eps);
 
 							}//now luminance coeffs are denoised
 #endif
-
 					}
-					
 				}
-				delete[] WavCoeffsLtemp;
-				delete[] sfave;
-
 			}
-		}
+		}
+		for(int i=5;i>=0;i--)
+			delete [] buffer[i];
+
+}
 	}
 
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 	void ImProcFunctions::WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
-											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut,float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch)//mod JD
+											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut,float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf)//mod JD
 
 	{
-		int maxlvl = WaveletCoeffs_L.maxlevel();
-//       printf("maxlevel = %d\n",maxlvl);
-//omp_set_nested(true);
+
+		int maxlvl = WaveletCoeffs_L.maxlevel();
+		int maxWL = 0, maxHL = 0, maxWab = 0, maxHab = 0;
+		for (int lvl=0; lvl<maxlvl; lvl++) {
+			if(WaveletCoeffs_L.level_W(lvl) > maxWL)
+				maxWL = WaveletCoeffs_L.level_W(lvl);
+			if(WaveletCoeffs_L.level_H(lvl) > maxHL)
+				maxHL = WaveletCoeffs_L.level_H(lvl);
+			if(WaveletCoeffs_a.level_W(lvl) > maxWab)
+				maxWab = WaveletCoeffs_a.level_W(lvl);
+			if(WaveletCoeffs_a.level_H(lvl) > maxHab)
+				maxHab = WaveletCoeffs_a.level_H(lvl);
+		}
+
 
+#ifdef _OPENMP
+#pragma omp parallel num_threads(denoiseNestedLevels) if(denoiseNestedLevels>1)
+#endif
+{
+		float *buffer[6];
+		buffer[0] = new float[maxWL*maxHL+32];
+		buffer[1] = new float[maxWL*maxHL+64];
+		buffer[2] = new float[maxWL*maxHL+96];
+		buffer[3] = new float[maxWab*maxHab+128];
+		buffer[4] = new float[maxWL*maxHL+160];
+		buffer[5] = new float[maxWL*maxHL+192];
+		
+		
+#ifdef _OPENMP
+#pragma omp for
+#endif
+
 		for (int lvl=0; lvl<maxlvl; lvl++) {
 
 			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
@@ -2230,91 +2260,89 @@ SSEFUNCTION	void ImProcFunctions::WaveletDenoiseAll_BiShrink(wavelet_decompositi
 			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
 			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
 
-      //     printf("HL : %d\n", Hlvl_L);
-       //    printf("WL : %d\n", Wlvl_L);
 			int callby=0;
 			if(schoice==0) callby=0;
 			if(schoice==2) callby=1;
 			
-			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
-					  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch);
+			ShrinkAll(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, buffer, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
+					  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, width, height, mad_LL, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, callby, autoch, perf);
 
-		}
-//omp_set_nested(false);
+		}
+		for(int i=5;i>=0;i--)
+			delete [] buffer[i];
+}
 	}
 
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
+SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, float *buffer[4], int level, 
 									int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float * mad_LL, float * mad_aa, float * mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut,
-									float &maxredaut, float &maxblueaut, int callby, bool autoch, float * madaa, float * madab, float * madaL, bool madCalculated )
+									float &maxredaut, float &maxblueaut, int callby, bool autoch, bool perf, float * madaa, float * madab, float * madaL, bool madCalculated )
 
-									{
+									{
 		//simple wavelet shrinkage
 		const float eps = 0.01f;
-		if(W_L != width) W_L=width;
-		if(H_L != height) H_L=height;
+		W_L = width;
+		H_L = height;
+		if(autoch && noisevar_abr <=0.001f) noisevar_abr=0.02f;
+		if(autoch && noisevar_abb <=0.001f) noisevar_abb=0.02f;
 		
-		float * sfave = new float[W_L*H_L];
-		float * sfavea = new float[W_L*H_L];
-		float * sfaveb = new float[W_L*H_L];
-		float * WavCoeffsLtemp = new float[H_ab*W_ab];
-
+		float * sfavea = buffer[0]+32;
+		float * sfaveb = buffer[1]+64;
+		float * sfavead = buffer[4]+160;
+		float * sfavebd = buffer[5]+192;
+		float * sfave = sfavea; // we can safely reuse sfavea here, because they are not used together
+		float * sfaved = sfavead; // we can safely reuse sfavead here, because they are not used together
+		float * WavCoeffsLtemp = buffer[3]+128;
+		float * blurBuffer = buffer[2]+96;
+
 		for (int dir=1; dir<4; dir++) {
 			float mada, madb, madL;
 			if(madCalculated) {
 				mada = madaa[dir-1];
 				madb = madab[dir-1];
 				madL = madaL[dir-1] ;
-			} else {
-				int * madHisto = new int[65536];
-				mada = SQR(Mad(WavCoeffs_a[dir], W_ab*H_ab, madHisto));
-				madb = SQR(Mad(WavCoeffs_b[dir], W_ab*H_ab, madHisto));
-				madL = SQR(Mad(WavCoeffs_L[dir], W_L*H_L, madHisto));
-				delete [] madHisto;
+			} else {
+				if(!perf) {
+					madL = SQR(Mad(WavCoeffs_L[dir], W_L*H_L));
+					mada = SQR(Mad(WavCoeffs_a[dir], W_ab*H_ab));
+					madb = SQR(Mad(WavCoeffs_b[dir], W_ab*H_ab));
+				} else {
+					madL = SQR(MadRgb(WavCoeffs_L[dir], W_L*H_L));
+					mada = SQR(MadRgb(WavCoeffs_a[dir], W_ab*H_ab));
+					madb = SQR(MadRgb(WavCoeffs_b[dir], W_ab*H_ab));
+				}
 			}
 		
-		//
-		//	float mad_L = madL*noisevar_L*5/(level+1);
-		//	float mad_a = mada*noisevar_abr;  // noisevar_abr between 0..2.25=default 100=middle value  ==> 582=max
-		//	float mad_b = madb*noisevar_abb;
-		//	printf("noisevarabr=%f\n",noisevar_abr);
-			if(autoch && noisevar_abr <=0.001f) noisevar_abr=0.02f;
-			if(autoch && noisevar_abb <=0.001f) noisevar_abb=0.02f;
-		
-			if (!noiseCCurve || noiseCCurve.getSum() < 5.f ){	//	printf("Chroma NON\n");
-
-							for (int i=0; i<H_ab; i++) {
-								for (int j=0; j<W_ab; j++) {
-									mad_aa[i*W_ab+j]=mada*(noisevar_abr);
-									mad_bb[i*W_ab+j]=madb*(noisevar_abb);
-									}//noisevarchrom	
-									}
+			if (!noiseCCurve || noiseCCurve.getSum() < 5.f ){
+				for (int i=0; i<H_ab; i++) {
+					for (int j=0; j<W_ab; j++) {
+						mad_aa[i*W_ab+j]=mada*(noisevar_abr);
+						mad_bb[i*W_ab+j]=madb*(noisevar_abb);
+					}//noisevarchrom	
+				}
 			}	
 			if (noiseCCurve  && noiseCCurve.getSum() > 5.f ){
-//	printf("chroma OUI\n");
-							for (int i=0; i<H_ab; i++) {
-								for (int j=0; j<W_ab; j++) {
-									mad_aa[i*W_ab+j]=mada*(noisevarchrom[i][j]);
-									mad_bb[i*W_ab+j]=madb*(noisevarchrom[i][j]);
-				}//noisevarchrom
-			}
+				for (int i=0; i<H_ab; i++) {
+					for (int j=0; j<W_ab; j++) {
+						mad_aa[i*W_ab+j]=mada*(noisevarchrom[i][j]);
+						mad_bb[i*W_ab+j]=madb*(noisevarchrom[i][j]);
+					}//noisevarchrom
+				}
 			}
 			
-			
-			if (noisevar_abr>0.001f  ||  noisevar_abb>0.001f ) { 
+			if (noisevar_abr>0.001f  ||  noisevar_abb>0.001f ) {
+
 				for(int i=0;i<H_ab;i++)
 					for(int j=0;j<W_ab;j++)
-						WavCoeffsLtemp[i*W_ab+j] = WavCoeffs_L[dir][((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab)];
+						WavCoeffsLtemp[i*W_ab+j] = WavCoeffs_L[dir][((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab)];
 
 #ifdef __SSE2__
 				int j;
 				__m128 onev = _mm_set1_ps(1.f);
-				__m128 madLm9v = _mm_set1_ps(madL * 9.f);
-			//	__m128 mad_av = _mm_set1_ps(mad_a);
+				__m128 rmadLm9v = onev / _mm_set1_ps(madL * 9.f);
 				__m128 mad_av ;
-				__m128 mad_bv;// = _mm_set1_ps(mad_b);
-				__m128 epsv = _mm_set1_ps(eps);
+				__m128 mad_bv;
 				__m128 mag_Lv, mag_av, mag_bv;
 				for (int i=0; i<H_ab; i++) {
 					int coeffloc_ab = i*W_ab;
@@ -2323,178 +2351,144 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 						mad_bv = LVFU(mad_bb[coeffloc_ab]);
 
 						mag_Lv = LVFU(WavCoeffsLtemp[coeffloc_ab]);
-						mag_av = SQRV(LVFU(WavCoeffs_a[dir][coeffloc_ab]))+epsv;
-						mag_bv = SQRV(LVFU(WavCoeffs_b[dir][coeffloc_ab]))+epsv;
-						mag_Lv = SQRV(mag_Lv) + epsv;
-						_mm_storeu_ps(&sfavea[coeffloc_ab], (onev-xexpf(-(mag_av/mad_av)-(mag_Lv/madLm9v))));
-						_mm_storeu_ps(&sfaveb[coeffloc_ab], (onev-xexpf(-(mag_bv/mad_bv)-(mag_Lv/madLm9v))));
+						mag_av = SQRV(LVFU(WavCoeffs_a[dir][coeffloc_ab]));
+						mag_bv = SQRV(LVFU(WavCoeffs_b[dir][coeffloc_ab]));
+						mag_Lv = (SQRV(mag_Lv)) * rmadLm9v;
+						_mm_storeu_ps(&sfavea[coeffloc_ab], (onev-xexpf(-(mag_av/mad_av)-(mag_Lv))));
+						_mm_storeu_ps(&sfaveb[coeffloc_ab], (onev-xexpf(-(mag_bv/mad_bv)-(mag_Lv))));
+
 					}
 					for (; j<W_ab; j++,coeffloc_ab++) {
-						float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab])+eps;
-						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
-						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
+						float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab]);
+						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
+						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
 						sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
 						sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
 					}
-				}//now chrominance coefficients are denoised
-#else
+				}//now chrominance coefficients are denoised
 
+#else
 				for (int i=0; i<H_ab; i++) {
 					for (int j=0; j<W_ab; j++) {
 						int coeffloc_ab = i*W_ab+j;
 						int coeffloc_L	= ((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab);
-						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ])+eps;
-						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
-						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
+						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ]);
+						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab]);
+						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab]);
 						sfavea[coeffloc_ab] = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
 						sfaveb[coeffloc_ab] = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
-						// 'firm' threshold of chroma coefficients
-						//WavCoeffs_a[dir][coeffloc_ab] *= (1-exp(-(mag_a/mad_a)-(mag_L/(9*madL))));//(coeff_a>2*thresh_a ? 1 : (coeff_a<thresh_a ? 0 : (coeff_a/thresh_a - 1)));
-						//WavCoeffs_b[dir][coeffloc_ab] *= (1-exp(-(mag_b/mad_b)-(mag_L/(9*madL))));//(coeff_b>2*thresh_b ? 1 : (coeff_b<thresh_b ? 0 : (coeff_b/thresh_b - 1)));
 					}
 				}//now chrominance coefficients are denoised
 #endif
-
-				boxblur(sfavea, sfavea, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
-				boxblur(sfaveb, sfaveb, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
-
-#ifdef __SSE2__
+
+				boxblur(sfavea, sfavead, blurBuffer, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
+				boxblur(sfaveb, sfavebd, blurBuffer, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
+
+#ifdef __SSE2__
+				__m128 epsv = _mm_set1_ps(eps);
 				__m128 sfav, sfbv;
 				__m128 sfaveav, sfavebv;
 				for (int i=0; i<H_ab; i++) {
 					int coeffloc_ab = i*W_ab;
 					for (j=0; j<W_ab-3; j+=4,coeffloc_ab+=4) {
-						mag_Lv = LVFU(WavCoeffsLtemp[coeffloc_ab]);
-						mag_av = SQRV(LVFU(WavCoeffs_a[dir][coeffloc_ab]))+epsv;
-						mag_bv = SQRV(LVFU(WavCoeffs_b[dir][coeffloc_ab]))+epsv;
-						mag_Lv = SQRV(mag_Lv) + epsv;
-						sfav = (onev - xexpf(-(mag_av/mad_av)-(mag_Lv/madLm9v)));
-						sfbv = (onev - xexpf(-(mag_bv/mad_bv)-(mag_Lv/madLm9v)));
-						sfaveav = LVFU(sfavea[coeffloc_ab]);
-						sfavebv = LVFU(sfaveb[coeffloc_ab]);
-						//use smoothed shrinkage unless local shrinkage is much less
+						sfav = LVFU(sfavea[coeffloc_ab]);
+						sfbv = LVFU(sfaveb[coeffloc_ab]);
+						sfaveav = LVFU(sfavead[coeffloc_ab]);
+						sfavebv = LVFU(sfavebd[coeffloc_ab]);
+
+						//use smoothed shrinkage unless local shrinkage is much less
 						_mm_storeu_ps( &WavCoeffs_a[dir][coeffloc_ab], LVFU(WavCoeffs_a[dir][coeffloc_ab]) * (SQRV(sfaveav)+SQRV(sfav))/(sfaveav+sfav+epsv));
 						_mm_storeu_ps( &WavCoeffs_b[dir][coeffloc_ab], LVFU(WavCoeffs_b[dir][coeffloc_ab]) * (SQRV(sfavebv)+SQRV(sfbv))/(sfavebv+sfbv+epsv));
 
 					}//now chrominance coefficients are denoised
 					for (; j<W_ab; j++,coeffloc_ab++) {
-							//modification Jacques feb 2013
-						float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab])+eps;
-
-						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
-						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
-
-						float sfa = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
-						float sfb = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
+						//modification Jacques feb 2013
+						float sfa = sfavea[coeffloc_ab];
+						float sfb = sfaveb[coeffloc_ab];
 
 						//use smoothed shrinkage unless local shrinkage is much less
-						WavCoeffs_a[dir][coeffloc_ab] *= (SQR(sfavea[coeffloc_ab])+SQR(sfa))/(sfavea[coeffloc_ab]+sfa+eps);
-						WavCoeffs_b[dir][coeffloc_ab] *= (SQR(sfaveb[coeffloc_ab])+SQR(sfb))/(sfaveb[coeffloc_ab]+sfb+eps);
+						WavCoeffs_a[dir][coeffloc_ab] *= (SQR(sfavead[coeffloc_ab])+SQR(sfa))/(sfavead[coeffloc_ab]+sfa+eps);
+						WavCoeffs_b[dir][coeffloc_ab] *= (SQR(sfavebd[coeffloc_ab])+SQR(sfb))/(sfavebd[coeffloc_ab]+sfb+eps);
 
 					}//now chrominance coefficients are denoised
 				}
 #else
-
 				for (int i=0; i<H_ab; i++) {
 					for (int j=0; j<W_ab; j++) {
-
 						int coeffloc_ab = i*W_ab+j;
-//						int coeffloc_L	= ((i*skip_L)/skip_ab)*W_L + ((j*skip_L)/skip_ab);
-						float mag_L = SQR(WavCoeffsLtemp[coeffloc_ab])+eps;
-
-//						float mag_L = SQR(WavCoeffs_L[dir][coeffloc_L ])+eps;
-						float mag_a = SQR(WavCoeffs_a[dir][coeffloc_ab])+eps;
-						float mag_b = SQR(WavCoeffs_b[dir][coeffloc_ab])+eps;
-
-						float sfa = (1.f-xexpf(-(mag_a/mad_aa[coeffloc_ab])-(mag_L/(9.f*madL))));
-						float sfb = (1.f-xexpf(-(mag_b/mad_bb[coeffloc_ab])-(mag_L/(9.f*madL))));
+						float sfa = sfavea[coeffloc_ab];
+						float sfb = sfaveb[coeffloc_ab];
 
 						//use smoothed shrinkage unless local shrinkage is much less
-						WavCoeffs_a[dir][coeffloc_ab] *= (SQR(sfavea[coeffloc_ab])+SQR(sfa))/(sfavea[coeffloc_ab]+sfa+eps);
-						WavCoeffs_b[dir][coeffloc_ab] *= (SQR(sfaveb[coeffloc_ab])+SQR(sfb))/(sfaveb[coeffloc_ab]+sfb+eps);
+						WavCoeffs_a[dir][coeffloc_ab] *= (SQR(sfavead[coeffloc_ab])+SQR(sfa))/(sfavead[coeffloc_ab]+sfa+eps);
+						WavCoeffs_b[dir][coeffloc_ab] *= (SQR(sfavebd[coeffloc_ab])+SQR(sfb))/(sfavebd[coeffloc_ab]+sfb+eps);
 
 					}//now chrominance coefficients are denoised
 				}
 #endif
 			}
-	   // 	if (settings->verbose) printf("noisevar=%f  dnzero=%f \n",noisevar_L, noiseLCurve.sum);
 			
 			if (noisevar_L>0.00001f) {
-					if (!noiseLCurve || noiseLCurve.getSum() < 7.f ){//under 7 quasi no action
-				//	printf("Luma sans\n");
-							for (int i=0; i<H_L; i++) {
-								for (int j=0; j<W_L; j++) {
-									mad_LL[i*W_L+j]=madL*(noisevar_L)*5/(level+1);
-									}//noisevarlum
-							}
-						}
-					else {if (noiseLCurve && noiseLCurve.getSum() >= 7.f) { // printf("Luma avec\n");
-
-							for (int i=0; i<H_L; i++) {
-								for (int j=0; j<W_L; j++) {
-									mad_LL[i*W_L+j]=madL*(noisevarlum[i][j])*5/(level+1);
-									}//noisevarlum
-							}
-						}
-						}
-//I think modifications I done are good for SSE2 ??					
+				if (!noiseLCurve || noiseLCurve.getSum() < 7.f ){//under 7 quasi no action
+					for (int i=0; i<H_L; i++) {
+						for (int j=0; j<W_L; j++) {
+							mad_LL[i*W_L+j]=madL*(noisevar_L)*5/(level+1);
+						}//noisevarlum
+					}
+				}
+				else if (noiseLCurve && noiseLCurve.getSum() >= 7.f) {
+					float precalc = madL * 5.f / (float)(level+1);
+					for (int i=0; i<H_L; i++) {
+						for (int j=0; j<W_L; j++) {
+							mad_LL[i*W_L+j]=precalc*(noisevarlum[i][j]);
+						}//noisevarlum
+					}
+				}
 #ifdef __SSE2__
 				__m128	magv;
-			//	__m128  mad_Lv = _mm_set1_ps( mad_L );
 				__m128  mad_Lv;
 				__m128	ninev = _mm_set1_ps( 9.0f );
-				__m128 	epsv = _mm_set1_ps( eps );
-				for (int i=0; i<W_L*H_L-3; i+=4) {
-				//	mad_Lv =_mm_set1_ps( 100.f );
+				__m128 	epsv = _mm_set1_ps( eps );
+				int i;
+				for (i=0; i<W_L*H_L-3; i+=4) {
 					mad_Lv = LVFU(mad_LL[i]);
 					magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
 					_mm_storeu_ps( &sfave[i], magv / (magv + mad_Lv*xexpf(-magv/(ninev * mad_Lv)) + epsv));
 				}
-				for (int i=(W_L*H_L)-((W_L*H_L)%4); i<W_L*H_L; i++) {
+				for (; i<W_L*H_L; i++) {
 					float mag = SQR(WavCoeffs_L[dir][i]);
 					sfave[i] = mag/(mag+mad_LL[i]*xexpf(-mag/(9*mad_LL[i]))+eps);
 				}
 #else
-
 				for (int i=0; i<W_L*H_L; i++) {
 
 					float mag = SQR(WavCoeffs_L[dir][i]);
 					float shrinkfactor = mag/(mag+mad_LL[i]*xexpf(-mag/(9*mad_LL[i]))+eps);
-
-					//WavCoeffs_L[dir][i] *= shrinkfactor;
 					sfave[i] = shrinkfactor;
 				}
 #endif
-				boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
+				boxblur(sfave, sfaved, blurBuffer, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
 
 #ifdef __SSE2__ 
 				__m128	sfv;
 				for (int i=0; i<W_L*H_L-3; i+=4) {
-					magv = SQRV( LVFU(WavCoeffs_L[dir][i]));
-					
-				 //   mad_Lv =_mm_set1_ps( mad_LL[i] );
-					mad_Lv = LVFU(mad_LL[i]);
-					
-					sfv = magv/(magv + mad_Lv * xexpf( -magv / (ninev * mad_Lv)) + epsv );
+					sfv = LVFU(sfave[i]);
 					//use smoothed shrinkage unless local shrinkage is much less
-					_mm_storeu_ps( &WavCoeffs_L[dir][i], _mm_loadu_ps( &WavCoeffs_L[dir][i]) * (SQRV( LVFU(sfave[i] )) + SQRV(sfv)) / (LVFU(sfave[i])+sfv+epsv));
+					_mm_storeu_ps( &WavCoeffs_L[dir][i], _mm_loadu_ps( &WavCoeffs_L[dir][i]) * (SQRV( LVFU(sfaved[i] )) + SQRV(sfv)) / (LVFU(sfaved[i])+sfv+epsv));
 				}
 				for (int i=(W_L*H_L)-((W_L*H_L)%4); i<W_L*H_L; i++) {
-					float mag = SQR(WavCoeffs_L[dir][i]);
-					float sf = mag/(mag+mad_LL[i]*xexpf(-mag/(9*mad_LL[i]))+eps);
+					float sf = sfave[i];
 
 					//use smoothed shrinkage unless local shrinkage is much less
-					WavCoeffs_L[dir][i] *= (SQR(sfave[i])+SQR(sf))/(sfave[i]+sf+eps);
+					WavCoeffs_L[dir][i] *= (SQR(sfaved[i])+SQR(sf))/(sfaved[i]+sf+eps);
 				}//now luminance coefficients are denoised
 
 #else 
-
 				for (int i=0; i<W_L*H_L; i++) {
-					float mag = SQR(WavCoeffs_L[dir][i]);
-					float sf = mag/(mag+mad_LL[i]*xexpf(-mag/(9.f*mad_LL[i]))+eps);
+					float sf = sfave[i];
 
 					//use smoothed shrinkage unless local shrinkage is much less
-					WavCoeffs_L[dir][i] *= (SQR(sfave[i])+SQR(sf))/(sfave[i]+sf+eps);
+					WavCoeffs_L[dir][i] *= (SQR(sfaved[i])+SQR(sf))/(sfaved[i]+sf+eps);
 
 				}//now luminance coefficients are denoised
 #endif
@@ -2502,82 +2496,46 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll(float ** WavCoeffs_L, float ** WavCo
 
 
 		}
-		delete[] sfave;
-		delete[] sfavea;
-		delete[] sfaveb;
-		delete[] WavCoeffsLtemp;
-	//	delete[] mad_L;
-		
+
 	}
 
-	
-SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
-									int W_L, int H_L, int W_ab, int H_ab,int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float * mad_LL, float * mad_aa, float * mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, int &Nb, float &redaut, float &blueaut,
-									float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int callby, bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
-									float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue,
-									float * madaa, float * madab, float * madaL, bool madCalculated )
 
-									{
-		//simple wavelet shrinkage
-		const float eps = 0.01f;
-		if(W_L != width) W_L=width;
-		if(H_L != height) H_L=height;
-		
-		float * sfave = new float[W_L*H_L];
-		float * sfavea = new float[W_L*H_L];
-		float * sfaveb = new float[W_L*H_L];
-		float * WavCoeffsLtemp = new float[H_ab*W_ab];
-
-//		int max;
-	//	bool calcul=false;
-		//printf("\n level=%d  \n",level);
-	//	if(autoch) {
-	 if(lvl==1){//only one time
-	     float chro=0.f;
-		 float maxchro=0.f;
-		 float dev=0.f;
-		 float devL=0.f;
-		 int nc=0;
-		 int nL=0;
-		 int nry=0;
-		 float lume=0.f;
-		 float red_yel=0.f;
-		 float skin_c=0.f;
-		 int nsk=0;
-		 float nsk_nc=0.f;
-			for (int i=0; i<H_ab; i++) {
-								for (int j=0; j<W_ab; j++) {
-								if(noisevarchrom[i][j] > maxchro) maxchro= noisevarchrom[i][j];
-									chro+=noisevarchrom[i][j];nc++;
-				}
+SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b, int level,
+									int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float * mad_aa, float * mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut,
+									float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,  bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
+									float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread) {
+
+	//simple wavelet shrinkage
+	if(lvl==1){//only one time
+		float chro = 0.f;
+		float dev = 0.f;
+		float devL = 0.f;
+		int nc = 0;
+		int nL = 0;
+		int nry = 0;
+		float lume = 0.f;
+		float red_yel = 0.f;
+		float skin_c = 0.f;
+		int nsk = 0;
+		float nsk_nc = 0.f;
+		for (int i=0; i<H_ab; i++) {
+			for (int j=0; j<W_ab; j++) {
+				chro += noisevarchrom[i][j];
+				nc++;
+				dev += SQR(noisevarchrom[i][j]-(chro/nc));
+				if(noisevarhue[i][j] > -0.8f && noisevarhue[i][j] < 2.0f && noisevarchrom[i][j] > 10000.f) {//saturated red yellow
+					red_yel += noisevarchrom[i][j];
+					nry++;
+				}	
+				if(noisevarhue[i][j] > 0.f && noisevarhue[i][j] < 1.6f && noisevarchrom[i][j] < 10000.f) {//skin
+					skin_c += noisevarchrom[i][j];
+					nsk++;
+				}	
+				lume += noisevarlum[i][j];
+				nL++;
+				devL += SQR(noisevarlum[i][j]-(lume/nL));
 			}
-			for (int i=0; i<H_ab; i++) {
-								for (int j=0; j<W_ab; j++) {
-								dev+=SQR(noisevarchrom[i][j]-(chro/nc));
-				}
-			}
-			for (int i=0; i<H_ab; i++) {
-								for (int j=0; j<W_ab; j++) {
-								if(noisevarhue[i][j] > -0.8f && noisevarhue[i][j] < 2.0f && noisevarchrom[i][j] > 10000.f)//saturated red yellow
-								{red_yel+=noisevarchrom[i][j];nry++;}	
-								if(noisevarhue[i][j] > 0.f && noisevarhue[i][j] < 1.6f && noisevarchrom[i][j] < 10000.f)//skin
-								{skin_c+=noisevarchrom[i][j];nsk++;}	
-								
-				}
-			}
-			for (int i=0; i<H_L; i++) {
-								for (int j=0; j<W_L; j++) {
-									lume+=noisevarlum[i][j];nL++;
-				}
-			}	
-			for (int i=0; i<H_L; i++) {
-								for (int j=0; j<W_L; j++) {
-									devL+=SQR(noisevarlum[i][j]-(lume/nL));
-				}
-				
-			}
-			
-//		printf("chro=%f maxc=%f sigma=%f max2sigma=%f lum=%f\n",chro/(nc),maxchro, sqrt(dev/nc), (chro/(nc))+ 2.f*sqrt(dev/nc), lume/nL);
+		}
 		if(nc>0) {
 			chromina=chro/nc;
 			sigma=sqrt(dev/nc);
@@ -2593,737 +2551,564 @@ SSEFUNCTION	void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_L, float **
 			redyel=red_yel/nry;
 		if(nsk>0)
 			skinc=skin_c/nsk;
-		
-		
-	//	printf("redy=%f ski=%f nsk=%d nc=%d pcsk=%f\n",red_yel/nry,skinc,nsk, nc, (float)nsk/(float)nc );
-	//	chroG=;
-	//	chroB=;
 	}
-	
-		for (int dir=1; dir<4; dir++) {
-			float mada, madb, madL;
-			if(madCalculated) {
-				mada = madaa[dir-1];
-				madb = madab[dir-1];
-				madL = madaL[dir-1] ;
-			} else {
-				int * madHisto = new int[65536];
-				mada = SQR(Mad(WavCoeffs_a[dir], W_ab*H_ab, madHisto));
-				madb = SQR(Mad(WavCoeffs_b[dir], W_ab*H_ab, madHisto));
-				madL = SQR(Mad(WavCoeffs_L[dir], W_L*H_L, madHisto));
-				delete [] madHisto;
-			//	madCalculated=true;
-			}
+
+	const float reduc = (schoice == 2) ? (float) settings->nrhigh : 1.f;
+	for (int dir=1; dir<4; dir++) {
+		float mada, madb;
+		if(!perf)
+			mada = SQR(Mad(WavCoeffs_a[dir], W_ab*H_ab));
+		else
+			mada = SQR(MadRgb(WavCoeffs_a[dir], W_ab*H_ab));
+		chred += mada;
+		if(mada > maxchred)
+			maxchred = mada;
+		if(mada < minchred)
+			minchred = mada;
+		maxredaut = sqrt(reduc*maxchred);
+		minredaut = sqrt(reduc*minchred);
+
+		if(!perf)
+			madb = SQR(Mad(WavCoeffs_b[dir], W_ab*H_ab));
+		else
+			madb = SQR(MadRgb(WavCoeffs_b[dir], W_ab*H_ab));
+		chblue += madb;
+		if(madb > maxchblue)
+			maxchblue = madb;
+		if(madb < minchblue)
+			minchblue = madb;
+		maxblueaut = sqrt(reduc*maxchblue);
+		minblueaut = sqrt(reduc*minchblue);
 				
-			if(callby==0){	
-				chau+=(mada+madb);
-				chred+=mada;
-				chblue+=madb;
-				if(mada > maxchred) maxchred=mada;
-				if(madb > maxchblue) maxchblue=madb;
-				if(mada < minchred) minchred=mada;
-				if(madb < minchblue) minchblue=madb;
-				nb++;	
-				//here evaluation of automatic 
-			//	printf("WAL  dir=%d  mada=%4.0f madb=%4.0f  skip_ab=%i minR=%4.0f minB=%4.0f nb=%d\n",dir, sqrt(mada),sqrt(madb), skip_ab, sqrt(minchred), sqrt(minchblue), nb);
-				mada=madb=0.f;
-				float reduc=1.f;
-			//	if(schoice==2) reduc=0.6f;
-				if(schoice==2) reduc=(float) settings->nrhigh;
-				chaut=sqrt(reduc*chau/(nb + nb));
-				redaut=sqrt(reduc*chred/nb);
-				blueaut=sqrt(reduc*chblue/nb);
-				maxredaut=sqrt(reduc*maxchred);
-				maxblueaut=sqrt(reduc*maxchblue);
-				minredaut=sqrt(reduc*minchred);
-				minblueaut=sqrt(reduc*minchblue);
-				Nb=nb;
-			}
-		}
-	//	}
-		
-		delete[] sfave;
-		delete[] sfavea;
-		delete[] sfaveb;
-		delete[] WavCoeffsLtemp;
-	//	delete[] mad_L;
-		
+		chau += (mada+madb);
+		nb++;
+		//here evaluation of automatic 
+		chaut = sqrt(reduc*chau/(nb + nb));
+		redaut = sqrt(reduc*chred/nb);
+		blueaut = sqrt(reduc*chblue/nb);
+		Nb = nb;
+	}
+
+}
+
+
+void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a,
+											wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,int schoice, bool autoch, 
+											float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread ){
+
+	int maxlvl = levwav;
+	for (int lvl=0; lvl<maxlvl; lvl++) {
+
+		int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
+		int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
+
+		int skip_ab = WaveletCoeffs_a.level_stride(lvl);
+
+		float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
+		float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
+
+		ShrinkAll_info(WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_ab, Hlvl_ab,
+		  skip_ab, noisevarlum,  noisevarchrom, noisevarhue, width, height, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, 
+		 autoch, schoice, lvl, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc, maxchred, maxchblue, minchred, minchblue, nb, chau, chred, chblue, perf, multiThread );
+
 	}
+}
 	
-	
-	
-	void ImProcFunctions::WaveletDenoiseAll_info(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a,
-											wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut,int schoice, bool autoch, 
-											float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue ){
-
-	
-
-		int maxlvl = WaveletCoeffs_L.maxlevel();
-//       printf("maxlevel = %d\n",maxlvl);
-//omp_set_nested(true);
-		for (int lvl=0; lvl<maxlvl; lvl++) {
-
-			int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
-			int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-
-			int Wlvl_ab = WaveletCoeffs_a.level_W(lvl);
-			int Hlvl_ab = WaveletCoeffs_a.level_H(lvl);
-
-
-			int skip_L = WaveletCoeffs_L.level_stride(lvl);
-			int skip_ab = WaveletCoeffs_a.level_stride(lvl);
-
-			float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
-			float ** WavCoeffs_a = WaveletCoeffs_a.level_coeffs(lvl);
-			float ** WavCoeffs_b = WaveletCoeffs_b.level_coeffs(lvl);
-
-      //     printf("HL : %d\n", Hlvl_L);
-       //    printf("WL : %d\n", Wl ||vl_L);
-			int callby=0;
-			if(schoice==0  || schoice==2) callby=0;
-		//	printf("lv=%d\n", lvl);
-			ShrinkAll_info(WavCoeffs_L, WavCoeffs_a, WavCoeffs_b, lvl, Wlvl_L, Hlvl_L, Wlvl_ab, Hlvl_ab,
-			  skip_L, skip_ab, noisevar_L, noisevarlum,  noisevarchrom, noisevarhue, width, height, mad_LL, mad_aa, mad_bb, noisevar_abr, noisevar_abb, noi, noiseLCurve, noiseCCurve, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, 
-			  callby, autoch, schoice, lvl, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc, maxchred, maxchblue, minchred, minchblue, nb, chau, chred, chblue );
-
-		}
-//omp_set_nested(false);
+void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) {
+	gam = dnparams.gamma;
+	gamthresh = 0.001f;
+	if(!isRAW) {//reduce gamma under 1 for Lab mode ==> TIF and JPG
+		if(gam <1.9f)
+			gam=1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
+		else if (gam >= 1.9f && gam <= 3.f)
+			gam=(1.4f/1.1f)*gam - 1.41818f;
 	}
-	
-	void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) {
-		gam = dnparams.gamma;
-		gamthresh = 0.001f;
-		if(!isRAW) {//reduce gamma under 1 for Lab mode ==> TIF and JPG
-		 if(gam <1.9f) gam=1.f - (1.9f-gam)/3.f;//minimum gamma 0.7
-		 else if (gam >= 1.9f && gam <= 3.f) gam=(1.4f/1.1f)*gam - 1.41818f;
-		 }
-		gamslope = exp(log((double)gamthresh)/gam)/gamthresh;
-		bool perf = (dnparams.dmethod=="RGB");
-		if(perf) {
+	gamslope = exp(log((double)gamthresh)/gam)/gamthresh;
+	bool perf = (dnparams.dmethod=="RGB");
+	if(perf) {
 		for (int i=0; i<65536; i++) {
 			gamcurve[i] = (Color::gamma((double)i/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
 		}
-		}
-		else  {
+	}
+	else {
 		for (int i=0; i<65536; i++) {
 			gamcurve[i] = (Color::gamman((double)i/65535.0,gam)) * 32768.0f;
 		}
-		}
-
 	}
+}
 
-void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc)
-	{//lissage = 1 more action
-	    float reducdelta=1.f;
-		if(params->dirpyrDenoise.smethod=="shalbi") reducdelta=(float) settings->nrhigh;
+void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc) {
+
+	float reducdelta=1.f;
+	if (params->dirpyrDenoise.smethod == "shalbi")
+		reducdelta = (float)settings->nrhigh;
+
+	chaut = (chaut*Nb-maxmax)/(Nb-1);//suppress maximum for chaut calcul
+	if (redyel > 5000.f || skinc > 1000.f && nsknc < 0.4f  && chromina > 3000.f)
+		chaut *= 0.45f;//reduct action in red zone, except skin for high / med chroma
+	else if (redyel>12000.f || skinc > 1200.f && nsknc < 0.3f && chromina > 3000.f)
+		chaut *= 0.3f;
 	
-		chaut = (chaut*Nb-maxmax)/(Nb-1);//suppress maximum for chaut calcul
-	    if(redyel>5000.f || skinc > 1000.f && nsknc < 0.4f  && chromina > 3000.f ) chaut *=0.45f;//reduct action in red zone, except skin for high / med chroma
-	    else if(redyel>12000.f || skinc > 1200.f && nsknc < 0.3f && chromina > 3000.f ) chaut *=0.3f;
-		
-		if(mode==0 || mode==2) {//Preview or Auto multizone
-			if(chromina > 10000.f) chaut*=0.7f;//decrease action for high chroma  (visible noise)
-			else if(chromina > 6000.f) chaut*=0.9f;	
-			else if(chromina < 3000.f) chaut*=1.2f;//increase action in low chroma==> 1.2  /==>2.0 ==> curve CC
-			else if(chromina < 2000.f) chaut*=1.5f;//increase action in low chroma==> 1.5 / ==>2.7
-			
-			if(lumema < 2500.f) chaut*=1.3f;//increase action for low light
-			else if(lumema < 5000.f) chaut*=1.2f;
-			else if(lumema > 20000.f) chaut*=0.9f;//decrease for high light		
-		}
-		else if(mode==1){//auto ==> less coefficient because interaction
-			if(chromina > 10000.f) chaut*=0.8f;//decrease action for high chroma  (visible noise)
-			else if(chromina > 6000.f) chaut*=0.9f;	
-			else if(chromina < 3000.f) chaut*=1.5f;//increase action in low chroma
-			else if(chromina < 2000.f) chaut*=2.2f;//increase action in low chroma
-			if(lumema < 2500.f) chaut*=1.2f;//increase action for low light
-			else if(lumema < 5000.f) chaut*=1.1f;
-			else if(lumema > 20000.f) chaut*=0.9f;//decrease for high light		
-		
-		}
-		if(levaut==0)  {//Low denoise
-				if(chaut > 300.f) chaut = 0.714286f*chaut + 85.71428f;
-		}
-		delta = maxmax-chaut;
-		delta*=reducdelta;
-		
-		if(lissage==1 || lissage==2) {
-		if(chaut < 200.f && delta < 200.f ) delta*=0.95f;
-		else if(chaut < 200.f && delta < 400.f ) delta*=0.5f;
-		else if(chaut < 200.f && delta >= 400.f ) delta=200.f;
-		else if(chaut < 400.f && delta < 400.f ) delta*=0.4f;
-		else if(chaut < 400.f && delta >= 400.f ) delta=120.f;
-		else if(chaut < 550.f) delta*=0.15f;
-		else if(chaut < 650.f) delta*=0.1f;
-		else if(chaut >= 650.f) delta*=0.07f;
-		if(mode==0 || mode==2) {//Preview or Auto multizone
-			if(chromina < 6000.f) delta*=1.4f;//increase maxi
-			if(lumema < 5000.f) delta*=1.4f;
-		}
-		else if(mode==1) {//Auto
-			if(chromina < 6000.f) delta*=1.2f;//increase maxi
-			if(lumema < 5000.f) delta*=1.2f;
-		}
-		}
-		if(lissage==0) {
-		if(chaut < 200.f && delta < 200.f ) delta*=0.95f;
-		else if(chaut < 200.f && delta < 400.f ) delta*=0.7f;
-		else if(chaut < 200.f && delta >= 400.f ) delta=280.f;
-		else if(chaut < 400.f && delta < 400.f ) delta*=0.6f;
-		else if(chaut < 400.f && delta >= 400.f ) delta=200.f;
-		else if(chaut < 550.f) delta*=0.3f;
-		else if(chaut < 650.f) delta*=0.2f;
-		else if(chaut >= 650.f) delta*=0.15f;
-		if(mode==0 || mode==2) {//Preview or Auto multizone
-			if(chromina < 6000.f) delta*=1.4f;//increase maxi
-			if(lumema < 5000.f) delta*=1.4f;
-		}
-		else if(mode==1) {//Auto
-			if(chromina < 6000.f) delta*=1.2f;//increase maxi
-			if(lumema < 5000.f) delta*=1.2f;
-		}
-		}
+	if (mode==0 || mode==2) {//Preview or Auto multizone
+		if (chromina > 10000.f)	chaut *= 0.7f;//decrease action for high chroma  (visible noise)
+		else if (chromina > 6000.f)	chaut *= 0.9f;	
+		else if (chromina < 3000.f)	chaut *= 1.2f;//increase action in low chroma==> 1.2  /==>2.0 ==> curve CC
+		else if (chromina < 2000.f)	chaut *= 1.5f;//increase action in low chroma==> 1.5 / ==>2.7
 		
+		if (lumema < 2500.f) chaut *= 1.3f;//increase action for low light
+		else if (lumema < 5000.f) chaut *= 1.2f;
+		else if (lumema > 20000.f) chaut *= 0.9f;//decrease for high light		
+	} else if (mode == 1) {//auto ==> less coefficient because interaction
+		if (chromina > 10000.f)	chaut *= 0.8f;//decrease action for high chroma  (visible noise)
+		else if (chromina > 6000.f)	chaut *= 0.9f;	
+		else if (chromina < 3000.f)	chaut *= 1.5f;//increase action in low chroma
+		else if (chromina < 2000.f)	chaut *= 2.2f;//increase action in low chroma
+		if (lumema < 2500.f) chaut *= 1.2f;//increase action for low light
+		else if (lumema < 5000.f) chaut *= 1.1f;
+		else if (lumema > 20000.f) chaut *= 0.9f;//decrease for high light		
 	}
-	
-	void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * dst,Imagefloat * provicalc, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, int &Nb,  float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &nresi, float &highresi, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc)
-	{
-//		StopWatch Stop1("RGB_denoise_info");
 
-
-	if (dnparams.luma==0 && dnparams.chroma==0 && ((settings->leveldnautsimpl==1 && dnparams.Cmethod=="MAN") || (settings->leveldnautsimpl==0 && dnparams.C2method!="MANU"))  && !dnparams.median && !noiseLCurve && !noiseCCurve) {
-		//nothing to do; copy src to dst or do nothing in case src == dst
+	if(levaut==0) {//Low denoise
+		if(chaut > 300.f)
+			chaut = 0.714286f*chaut + 85.71428f;
+	}
+	
+	delta = maxmax-chaut;
+	delta *= reducdelta;
+	
+	if (lissage==1 || lissage==2) {
+		if (chaut < 200.f && delta < 200.f ) delta *= 0.95f;
+		else if (chaut < 200.f && delta < 400.f ) delta *= 0.5f;
+		else if (chaut < 200.f && delta >= 400.f ) delta = 200.f;
+		else if (chaut < 400.f && delta < 400.f ) delta *= 0.4f;
+		else if (chaut < 400.f && delta >= 400.f ) delta = 120.f;
+		else if (chaut < 550.f) delta *= 0.15f;
+		else if (chaut < 650.f) delta *= 0.1f;
+		else if (chaut >= 650.f) delta *= 0.07f;
+		if (mode==0 || mode==2) {//Preview or Auto multizone
+			if (chromina < 6000.f) delta *= 1.4f;//increase maxi
+			if (lumema < 5000.f) delta *= 1.4f;
+		}
+		else if (mode==1) {//Auto
+			if (chromina < 6000.f) delta *= 1.2f;//increase maxi
+			if (lumema < 5000.f) delta *= 1.2f;
+		}
+	}
+	if (lissage==0) {
+		if (chaut < 200.f && delta < 200.f ) delta *= 0.95f;
+		else if (chaut < 200.f && delta < 400.f ) delta *= 0.7f;
+		else if (chaut < 200.f && delta >= 400.f ) delta = 280.f;
+		else if (chaut < 400.f && delta < 400.f ) delta *= 0.6f;
+		else if (chaut < 400.f && delta >= 400.f ) delta = 200.f;
+		else if (chaut < 550.f) delta *= 0.3f;
+		else if (chaut < 650.f) delta *= 0.2f;
+		else if (chaut >= 650.f) delta *= 0.15f;
+		if (mode==0 || mode==2) {//Preview or Auto multizone
+			if (chromina < 6000.f) delta *= 1.4f;//increase maxi
+			if (lumema < 5000.f) delta *= 1.4f;
+		}
+		else if (mode==1) {//Auto
+			if (chromina < 6000.f) delta *= 1.2f;//increase maxi
+			if (lumema < 5000.f) delta *= 1.2f;
+		}
+	}
+	
+}
+	
+SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb,  float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &nresi, float &highresi, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread)
+{
+	if ((settings->leveldnautsimpl==1 && dnparams.Cmethod == "MAN") || (settings->leveldnautsimpl==0 && dnparams.C2method == "MANU")) {
+		//nothing to do
 		return;
 	}
 
+	int hei,wid;
+	float** lumcalc;
+	float** acalc;
+	float** bcalc;
+	hei=provicalc->height;
+	wid=provicalc->width;
+	TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
 
-		int hei,wid;
-//		float LLum,AAum,BBum;
-		float** lumcalc;
-		float** acalc;
-		float** bcalc;
-		if(noiseLCurve || noiseCCurve)	{	
-			hei=provicalc->height;
-			wid=provicalc->width;
-			TMatrix wprofi = iccStore->workingSpaceMatrix (params->icm.working);
-
-			double wpi[3][3] = {
-				{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
-				{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
-				{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
-				};
-			lumcalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					lumcalc[i] = new float[wid];
-			acalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					acalc[i] = new float[wid];
-			bcalc = new float*[hei];
-				for (int i=0; i<hei; i++)
-					bcalc[i] = new float[wid];
-												
-			for(int ii=0;ii<hei;ii++){
-				for(int jj=0;jj<wid;jj++){
-					float LLum,AAum,BBum;
-					float RL = provicalc->r(ii,jj);
-					float GL = provicalc->g(ii,jj);
-					float BL = provicalc->b(ii,jj);
-								// determine luminance for noisecurve
-					float XL,YL,ZL;
-					Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi);
-					Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);							
-					lumcalc[ii][jj]=LLum;
-					acalc[ii][jj]=AAum;
-					bcalc[ii][jj]=BBum;
-				}
-			}
-		
-}		
-
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-		const short int imheight=src->height, imwidth=src->width;
-
-	if (dnparams.luma!=0 || dnparams.chroma!=0 || dnparams.methodmed=="Lab" || dnparams.methodmed=="Lonly" ) {
-		bool perf = (dnparams.dmethod=="RGB");
-		// gamma transform for input data
-//		LUTf gamcurve(65536,0);
-//		float gam, gamthresh, gamslope;
-//		RGB_denoise_infoGamCurve(dnparams, isRAW, gamcurve, gam, gamthresh, gamslope);
-		
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-		const float gain = pow (2.0f, float(expcomp));
-		float incr=1.f;
-		float noisevar_Ldetail = SQR((float)(SQR(100.-dnparams.Ldetail) + 50.*(100.-dnparams.Ldetail)) * TS * 0.5f * incr);
-		bool enhance_denoise = dnparams.enhance;
-		int gamlab = settings->denoiselabgamma;//gamma lab essentialy for Luminance detail
-		if(gamlab > 2) gamlab=2;
-		
-/*
-		array2D<float> tilemask_in(TS,TS);
-		array2D<float> tilemask_out(TS,TS);
-
-		const int border = MAX(2,TS/16);
-		
-
-		for (int i=0; i<TS; i++) {
-			float i1 = abs((i>TS/2 ? i-TS+1 : i));
-			float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-			float vmask2 = (i1<2*border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
-			for (int j=0; j<TS; j++) {
-				float j1 = abs((j>TS/2 ? j-TS+1 : j));
-				tilemask_in[i][j] = (vmask * (j1<border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-
-				tilemask_out[i][j] = (vmask2 * (j1<2*border ? SQR(sin((M_PI*j1)/(2*border))) : 1.0f)) + epsilon;
-
-			}
+	double wpi[3][3] = {
+		{wprofi[0][0],wprofi[0][1],wprofi[0][2]},
+		{wprofi[1][0],wprofi[1][1],wprofi[1][2]},
+		{wprofi[2][0],wprofi[2][1],wprofi[2][2]}
+		};
+	lumcalc = new float*[hei];
+	for (int i=0; i<hei; i++)
+		lumcalc[i] = new float[wid];
+	acalc = new float*[hei];
+	for (int i=0; i<hei; i++)
+		acalc[i] = new float[wid];
+	bcalc = new float*[hei];
+	for (int i=0; i<hei; i++)
+		bcalc[i] = new float[wid];
+
+#ifdef _OPENMP
+#pragma omp parallel for if(multiThread)
+#endif
+	for(int ii=0;ii<hei;ii++){
+		for(int jj=0;jj<wid;jj++){
+			float LLum,AAum,BBum;
+			float RL = provicalc->r(ii,jj);
+			float GL = provicalc->g(ii,jj);
+			float BL = provicalc->b(ii,jj);
+			// determine luminance for noisecurve
+			float XL,YL,ZL;
+			Color::rgbxyz(RL,GL,BL,XL,YL,ZL,wpi);
+			Color::XYZ2Lab(XL, YL, ZL, LLum, AAum, BBum);							
+			lumcalc[ii][jj]=LLum;
+			acalc[ii][jj]=AAum;
+			bcalc[ii][jj]=BBum;
 		}
-*/
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-		// begin tile processing of image
-
-		//output buffer
-//		Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
-//		for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
-
-		int tilesize;
-		int overlap;
-		if(settings->leveldnti ==0) {
-			tilesize = 1024;
-			overlap = 128;
-		}
-		if(settings->leveldnti ==1) {
-			tilesize = 768;
-			overlap = 96;
-		}
-
-		int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip;
-
-		//always no Tiles
-		int kall=0;
-		Tile_calc (tilesize, overlap, kall, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
+	}
 
 		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
+	const int imheight=src->height, imwidth=src->width;
 
-        {
+	bool perf = (dnparams.dmethod == "RGB");
 
-		//DCT block data storage
-    TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
+	const float gain = pow (2.0f, float(expcomp));
+	int gamlab = settings->denoiselabgamma;//gamma lab essentialy for Luminance detail
+	if(gamlab > 2)
+		gamlab = 2;
+	
+	int tilesize;
+	int overlap;
+	if(settings->leveldnti == 0) {
+		tilesize = 1024;
+		overlap = 128;
+	}
+	if(settings->leveldnti == 1) {
+		tilesize = 768;
+		overlap = 96;
+	}
 
-    double wp[3][3] = {
-        {wprof[0][0],wprof[0][1],wprof[0][2]},
+	int numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip;
+
+	//always no Tiles
+	int kall=0;
+	Tile_calc (tilesize, overlap, kall, imwidth, imheight, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
+
+	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+	//DCT block data storage
+	TMatrix wprof = iccStore->workingSpaceMatrix (params->icm.working);
+
+	double wp[3][3] = {
+		{wprof[0][0],wprof[0][1],wprof[0][2]},
 		{wprof[1][0],wprof[1][1],wprof[1][2]},
 		{wprof[2][0],wprof[2][1],wprof[2][2]}
-    };
-
-	TMatrix wiprof = iccStore->workingSpaceInverseMatrix (params->icm.working);
-	//inverse matrix user select
-	double wip[3][3] = {
-		{wiprof[0][0],wiprof[0][1],wiprof[0][2]},
-		{wiprof[1][0],wiprof[1][1],wiprof[1][2]},
-		{wiprof[2][0],wiprof[2][1],wiprof[2][2]}
 	};
 
-		float chau=0.f;
-		float chred=0.f;
-		float chblue=0.f;
-		float maxchred=0.f;
-		float maxchblue=0.f;
-		float minchred =100000000.f;
-		float minchblue=100000000.f;
-		int nb=0;
-//		resid=0.f;
-//		nbresid=0;
-//		maxredresid=0.f;
-//		maxblueresid=0.f;
-//		residred=0.f;
-//		residblue=0.f;
-		int comptlevel=0;
-//        	        		static MyMutex FftwMutex;
-//		MyMutex::MyLock lock(FftwMutex);
-        
+	float chau=0.f;
+	float chred=0.f;
+	float chblue=0.f;
+	float maxchred=0.f;
+	float maxchblue=0.f;
+	float minchred =100000000.f;
+	float minchblue=100000000.f;
+	int nb=0;
+	int comptlevel=0;
 
-		for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
-			for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
+	for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
+		for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
 
-				int tileright = MIN(imwidth,tileleft+tilewidth);
-				int tilebottom = MIN(imheight,tiletop+tileheight);
-				int width  = tileright-tileleft;
-				int height = tilebottom-tiletop;
-				//input L channel
-//				array2D<float> Lin(width,height);
-				//wavelet denoised image
-				LabImage * labdn = new LabImage(width,height);
-				float* mad_LL = new float [height*width];
-				float** noisevarlum;
-						noisevarlum = new float*[height];
-							for (int i=0; i<height; i++)
-							noisevarlum[i] = new float[width];
-							
-				float* mad_aa = new float [height*width];
-				float* mad_bb = new float [height*width];
-				float** noisevarchrom;
-						noisevarchrom = new float*[height];
-							for (int i=0; i<height; i++)
-							noisevarchrom[i] = new float[width];
-				float** noisevarhue;
-						noisevarhue = new float*[height];
-							for (int i=0; i<height; i++)
-							noisevarhue[i] = new float[width];
-				
-				//residual between input and denoised L channel
-				array2D<float> Ldetail(width,height,ARRAY2D_CLEAR_DATA);
-				//pixel weight
-//				array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
-				// init luma noisevarL
-				float noisevarL	 = (float) (SQR((dnparams.luma/125.0)*(1.+ dnparams.luma/25.0)));
-				float noisevarab_b, noisevarab_r;
-				bool ponder=false;
-				
-				float realred2, realred, realblue, realblue2;
-				float interm_med =(float) dnparams.chroma/10.0;
-				float intermred, intermblue;
-				if(dnparams.redchro > 0.) intermred=(dnparams.redchro/10.); else intermred= (float) dnparams.redchro/7.0;//increase slower than linear for more sensit
-				if(dnparams.bluechro > 0.) intermblue=(dnparams.bluechro/10.); else intermblue= (float) dnparams.bluechro/7.0;//increase slower than linear for more sensit
-				realred = interm_med + intermred; if (realred < 0.f) realred=0.001f;
-				realblue = interm_med + intermblue; if (realblue < 0.f) realblue=0.001f;
-				//TODO: implement using AlignedBufferMP
-				//fill tile from image; convert RGB to "luma/chroma"
-				if (isRAW) {//image is raw; use channel differences for chroma channels
-					if(!perf){//lab mode
-							//modification Jacques feb 2013 and july 2014					
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+			int tileright = MIN(imwidth,tileleft+tilewidth);
+			int tilebottom = MIN(imheight,tiletop+tileheight);
+			int width  = tileright-tileleft;
+			int height = tilebottom-tiletop;
+			LabImage * labdn = new LabImage(width,height);
+			float** noisevarlum = new float*[height];
+			for (int i=0; i<height; i++)
+				noisevarlum[i] = new float[width];
+						
+			float* mad_aa = new float [height*width];
+			float* mad_bb = new float [height*width];
+			float** noisevarchrom = new float*[height];
+			for (int i=0; i<height; i++)
+				noisevarchrom[i] = new float[width];
+			float** noisevarhue = new float*[height];
+			for (int i=0; i<height; i++)
+				noisevarhue[i] = new float[width];
+			
+			// init luma noisevarL
+			float noisevarab_b, noisevarab_r;
+			
+			float realred, realblue;
+			float interm_med =(float) dnparams.chroma/10.0;
+			float intermred, intermblue;
+			if(dnparams.redchro > 0.)
+				intermred = (dnparams.redchro/10.);
+			else
+				intermred = (float) dnparams.redchro/7.0;//increase slower than linear for more sensit
+			if(dnparams.bluechro > 0.)
+				intermblue =(dnparams.bluechro/10.);
+			else
+				intermblue = (float) dnparams.bluechro/7.0;//increase slower than linear for more sensit
+			realred = interm_med + intermred;
+			if (realred < 0.f)
+				realred = 0.001f;
+			realblue = interm_med + intermblue;
+			if (realblue < 0.f)
+				realblue = 0.001f;
+			//TODO: implement using AlignedBufferMP
+			//fill tile from image; convert RGB to "luma/chroma"
+			float noiseluma=(float) dnparams.luma;
+			noiseluma += 1.f;
+			float noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));
+			if (isRAW) {//image is raw; use channel differences for chroma channels
+#ifdef _OPENMP
+#pragma omp parallel for if(multiThread)
+#endif
+					for (int i=tiletop; i<tilebottom; i++) {
+						int i1 = i - tiletop;
+#ifdef __SSE2__
+						__m128 aNv,bNv;
+						__m128 c100v = _mm_set1_ps(100.f);
+						int j;
+						for (j=tileleft; j<tileright-3; j+=4) {
+							int j1 = j - tileleft;
+							aNv = LVFU(acalc[i][j]);
+							bNv = LVFU(bcalc[i][j]);
+							_mm_storeu_ps(&noisevarhue[i1][j1], xatan2f(bNv,aNv));
+							_mm_storeu_ps(&noisevarchrom[i1][j1], _mm_max_ps(c100v,_mm_sqrt_ps(SQRV(aNv)+SQRV(bNv))));
+						}
+						for (; j<tileright; j++) {
 							int j1 = j - tileleft;
-							float R_ = gain*src->r(i,j);
-							float G_ = gain*src->g(i,j);
-							float B_ = gain*src->b(i,j);
-							float Llum,alum,blum;
-							
-							if(noiseLCurve) {
-								Llum=lumcalc[i][j];
-							}
-							if(noiseCCurve) {
-								alum=acalc[i][j];
-								blum=bcalc[i][j];
-							}
-							//finally I opted fot gamma55 and with options we can change
-							if (gamlab == 0) {// options 12/2013
+							float aN = acalc[i][j];
+							float bN = bcalc[i][j];
+							float cN = sqrtf(SQR(aN)+SQR(bN));
+							noisevarhue[i1][j1] = xatan2f(bN,aN);
+							if(cN < 100.f)
+								cN=100.f;//avoid divided by zero
+							noisevarchrom[i1][j1] = cN;
+						}
+#else
+						for (int j=tileleft; j<tileright; j++) {
+							int j1 = j - tileleft;
+							float aN = acalc[i][j];
+							float bN = bcalc[i][j];
+							float cN = sqrtf(SQR(aN)+SQR(bN));
+							float hN = xatan2f(bN,aN);
+							if(cN < 100.f)
+								cN = 100.f;//avoid divided by zero
+							noisevarchrom[i1][j1] = cN;
+							noisevarhue[i1][j1] = hN;
+						}
+#endif
+					}
+#ifdef _OPENMP
+#pragma omp parallel for if(multiThread)
+#endif
+					for (int i=tiletop; i<tilebottom; i++) {
+						int i1 = i - tiletop;
+						for (int j=tileleft; j<tileright; j++) {
+							int j1 = j - tileleft;
+							float Llum = lumcalc[i][j];
+							Llum = Llum < 2.f ? 2.f : Llum; //avoid divided by zero ?
+							Llum = Llum > 32768.f ? 32768.f : Llum; // not strictly necessary							
+							noisevarlum[i1][j1]= Llum;
+						}
+					}
+				if (!perf){//lab mode, modification Jacques feb 2013 and july 2014
+
+#ifdef _OPENMP
+#pragma omp parallel for if(multiThread)
+#endif
+				for (int i=tiletop; i<tilebottom; i++) {
+					int i1 = i - tiletop;
+					for (int j=tileleft; j<tileright; j++) {
+						int j1 = j - tileleft;
+						float R_ = gain*src->r(i,j);
+						float G_ = gain*src->g(i,j);
+						float B_ = gain*src->b(i,j);
+
+						//finally I opted fot gamma55 and with options we can change
+						if (gamlab == 0) {// options 12/2013
 							R_ = Color::igammatab_26_11[R_];
 							G_ = Color::igammatab_26_11[G_];
 							B_ = Color::igammatab_26_11[B_];
-							}
-							else if (gamlab == 1) {
+						}
+						else if (gamlab == 1) {
 							//other new gamma 4 5 
 							R_ = Color::igammatab_4[R_];
 							G_ = Color::igammatab_4[G_];
 							B_ = Color::igammatab_4[B_];
-							}
-							else if (gamlab == 2) {							
+						}
+						else if (gamlab == 2) {							
 							//new gamma 5.5 10 better for detail luminance..it is a compromise...which depends on the image (distribution BL, ML, HL ...)
 							R_ = Color::igammatab_55[R_];
 							G_ = Color::igammatab_55[G_];
 							B_ = Color::igammatab_55[B_];
-							}
-							//apply gamma noise	standard (slider)
-							R_ = R_<65535.0f ? gamcurve[R_] : (Color::gamman((double)R_/65535.0, gam)*32768.0f);
-							G_ = G_<65535.0f ? gamcurve[G_] : (Color::gamman((double)G_/65535.0, gam)*32768.0f);
-							B_ = B_<65535.0f ? gamcurve[B_] : (Color::gamman((double)B_/65535.0, gam)*32768.0f);
-							//true conversion xyz=>Lab
-							float L,a,b;
-							float X,Y,Z;
-							Color::rgbxyz(R_,G_,B_,X,Y,Z,wp);
-
-							//convert to Lab
-							Color::XYZ2Lab(X, Y, Z, L, a, b);
-							float noiseluma=(float) dnparams.luma;	
-							
-							if(noiseLCurve) {
-								float kN=Llum;//with no gamma and take into account working profile
-								float epsi=0.01f;
-								if(kN<2.f) kN=2.f;//avoid divided by zero
-								if(kN>32768.f) kN=32768.f;	// not strictly necessary							
-								float kinterm=epsi+ noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-								noisevarlum[i1][j1]= kN;// SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));
-							// chroma
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-							
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								float kN=Llum;//with no gamma and take into account working profile
-								if(kN<2.f) kN=2.f;//avoid divided by zero
-								if(kN>32768.f) kN=32768.f;	// not strictly necessary							
-								
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								float hN=xatan2f(bN,aN);
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + 10.f*noiseCCurve[(cN/48000.f)*500.f];//C=f(C)
-								noisevarchrom[i1][j1]=cN;
-								noisevarhue[i1][j1]=hN;
-								
-							}
-							//end chroma
-							
-							labdn->L[i1][j1] = L;
-							labdn->a[i1][j1] = a;
-							labdn->b[i1][j1] = b;
-//							Lin[i1][j1] = L;
-//							totwt[i1][j1] = 0;
 						}
-					}
-					}
-					else {//RGB mode
-					
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
-							int j1 = j - tileleft;
+						//apply gamma noise	standard (slider)
+						R_ = R_<65535.0f ? gamcurve[R_] : (Color::gamman((double)R_/65535.0, gam)*32768.0f);
+						G_ = G_<65535.0f ? gamcurve[G_] : (Color::gamman((double)G_/65535.0, gam)*32768.0f);
+						B_ = B_<65535.0f ? gamcurve[B_] : (Color::gamman((double)B_/65535.0, gam)*32768.0f);
+						//true conversion xyz=>Lab
+						float X,Y,Z;
+						Color::rgbxyz(R_,G_,B_,X,Y,Z,wp);
 
-							float X = gain*src->r(i,j);
-							float Y = gain*src->g(i,j);
-							float Z = gain*src->b(i,j);
-							//conversion colorspace to determine luminance with no gamma
-							float Llum,alum,blum;
-							if(noiseLCurve) {
-								Llum=lumcalc[i][j];
-							}
-							if(noiseCCurve) {
-								alum=acalc[i][j];
-								blum=bcalc[i][j];
-							}
-							
-							X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
-							float noiseluma=(float) dnparams.luma;	
-							if(noiseLCurve) {
-							//	float noiseluma=(float) dnparams.luma;	
-								float kN=Llum;
-								float epsi=0.01f;
-								if(kN<2.f) kN=2.f;
-								if(kN>32768.f) kN=32768.f;
-								float kinterm=epsi + noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-								noisevarlum[i1][j1]= kN;//SQR((ki/125.f)*(1.f+ki/25.f));
-							}
-							noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								float hN=xatan2f(bN,aN);
-								
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + 10.f*noiseCCurve[(cN/48000.f)*500.f];
-								//noisevarchrom[i1][j1]=0.5f*(noisevarab_b+noisevarab_r)*(Cinterm*Cinterm);
-								noisevarchrom[i1][j1]=cN;
-								noisevarhue[i1][j1]=hN;
-							}
-							//end chroma					
-							
-							labdn->L[i1][j1] = Y;
-							labdn->a[i1][j1] = (X-Y);
-							labdn->b[i1][j1] = (Y-Z);
-
-//							Ldetail[i1][j1] = 0;
-//							Lin[i1][j1] = Y;
-//							totwt[i1][j1] = 0;
-						}
-					}
-
-					}
-					
-				} else {//image is not raw; use Lab parametrization
-					for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
-						int i1 = i - tiletop;
-						for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
-							int j1 = j - tileleft;
-							float L,a,b;
-							float rLum=src->r(i,j) ;//for luminance denoise curve
-							float gLum=src->g(i,j) ;
-							float bLum=src->b(i,j) ;
-							
-							//use gamma sRGB, not good if TIF (JPG) Output profil not with gamma sRGB  (eg : gamma =1.0, or 1.8...)
-							//very difficult to solve !
-							// solution ==> save TIF with gamma sRGB and re open
-							float rtmp = Color::igammatab_srgb[ src->r(i,j) ];
-							float gtmp = Color::igammatab_srgb[ src->g(i,j) ];
-							float btmp = Color::igammatab_srgb[ src->b(i,j) ];
-							//modification Jacques feb 2013
-							// gamma slider different from raw
-							rtmp = rtmp<65535.0f ? gamcurve[rtmp] : (Color::gamman((double)rtmp/65535.0, gam)*32768.0f);
-							gtmp = gtmp<65535.0f ? gamcurve[gtmp] : (Color::gamman((double)gtmp/65535.0, gam)*32768.0f);
-							btmp = btmp<65535.0f ? gamcurve[btmp] : (Color::gamman((double)btmp/65535.0, gam)*32768.0f);
-							
-							noisevarab_r = SQR(realred);
-							noisevarab_b = SQR(realblue);
-
-
-							float X,Y,Z;
-							Color::rgbxyz(rtmp,gtmp,btmp,X,Y,Z,wp);
-
-							//convert Lab
-							Color::XYZ2Lab(X, Y, Z, L, a, b);
-							float Llum,alum,blum;
-							if(noiseLCurve || noiseCCurve) {
-								float XL,YL,ZL;
-								Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
-								Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
-							}
-							float noiseluma=(float) dnparams.luma;	
-							
-							if(noiseLCurve) {
-							//	float noiseluma=(float) dnparams.luma;	
-								float kN=Llum;
-								float epsi=0.01f;
-
-								if(kN<2.f) kN=2.f;
-								if(kN>32768.f) kN=32768.f;								
-								float kinterm=epsi + noiseLCurve[(kN/32768.f)*500.f];
-								float ki=kinterm*100.f;
-								ki+=noiseluma;
-								noiseluma += 1.f;
-							//	noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));								
-								noisevarlum[i1][j1]=kN;//SQR((ki/125.f)*(1.f+ki/25.f));
-							}	
-							noisevarL = SQR((noiseluma/125.f)*(1.f+noiseluma/25.f));								
-							if(noiseCCurve) {
-								float aN=alum;
-								float bN=blum;
-								float epsic=0.01f;
-								float hN=xatan2f(bN,aN);
-								
-								float cN=sqrt(SQR(aN)+SQR(bN));
-								if(cN < 100.f) cN=100.f;//avoid divided by zero
-								float Cinterm=1.f + 10.f*noiseCCurve[(cN/48000.f)*500.f];
-								//noisevarchrom[i1][j1]=0.5f*(noisevarab_b+noisevarab_r)*(Cinterm*Cinterm);
-								noisevarchrom[i1][j1]=cN;
-								noisevarhue[i1][j1]=hN;
-								
-
-							}
-							
-							labdn->L[i1][j1] = L;
-							labdn->a[i1][j1] = a;
-							labdn->b[i1][j1] = b;
-
-//							Ldetail[i1][j1] = 0;
-//							Lin[i1][j1] = L;
-
-//							totwt[i1][j1] = 0;
-						}
+						//convert to Lab
+						float L,a,b;
+						Color::XYZ2Lab(X, Y, Z, L, a, b);
+						
+						labdn->a[i1][j1] = a;
+						labdn->b[i1][j1] = b;
 					}
 				}
-	/*			if (dnparams.luma>0.01) {
-					impulse_nr (labdn, float(MIN(50.0,dnparams.luma))/20.0f);
 				}
-*/
-				int datalen = labdn->W * labdn->H;
+				else {//RGB mode
+				
+				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+					int i1 = i - tiletop;
+					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+						int j1 = j - tileleft;
 
-				//now perform basic wavelet denoise
-				//last two arguments of wavelet decomposition are max number of wavelet decomposition levels;
-				//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
-				//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
-				//the first level only, 7 means subsample the first three levels, etc.
-				noisevarab_r = SQR(realred) + 0.01f;
-				noisevarab_b = SQR(realblue) + 0.01f;
-				
-				bool execwavelet=true;
-				if(execwavelet) {//gain time if user choose only median  sliders L <=1  slider chrom master < 1
-                { // enclosing this code in a block frees about 120 MB before allocating 20 MB after this block (measured with D700 NEF)
-                wavelet_decomposition* Ldecomp;
-                wavelet_decomposition* adecomp;
-                wavelet_decomposition* bdecomp;
-				int schoice=0;//shrink method
-				
-				if(dnparams.smethod=="shal") schoice=0;
-				else if(dnparams.smethod=="shalbi") schoice=2;
-
-				int levwav=5;
-				float maxreal = max(realred, realblue);
-				//increase the level of wavelet if user increase much or very much sliders
-				
-				
-				//	if (settings->verbose) printf("levwavelet=%i  noisevarA=%f noisevarB=%f \n",levwav, noisevarab_r, noisevarab_b );
-                Ldecomp = new wavelet_decomposition (labdn->data, labdn->W, labdn->H, levwav/*maxlevels*/, 0/*subsampling*/ );
-                adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H,levwav, 1 );
-                bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
-				bool autoch = dnparams.autochroma;
-				if(comptlevel==0) WaveletDenoiseAll_info(*Ldecomp, *adecomp, *bdecomp, noisevarL, noisevarlum, noisevarchrom,  noisevarhue, width, height, mad_LL, mad_aa, mad_bb, noisevarab_r, noisevarab_b,labdn, noiseLCurve, noiseCCurve, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, schoice, autoch, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc,maxchred, maxchblue, minchred, minchblue, nb,chau ,chred, chblue);//enhance mode
-				comptlevel+=1;
-				float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
-				nresi=chresid;
-				highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
-				delete Ldecomp;
-				delete adecomp;
-				delete bdecomp;
-                }
+						float X = gain*src->r(i,j);
+						float Y = gain*src->g(i,j);
+						float Z = gain*src->b(i,j);
+						
+						X = X<65535.0f ? gamcurve[X] : (Color::gamma((double)X/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+						Y = Y<65535.0f ? gamcurve[Y] : (Color::gamma((double)Y/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+						Z = Z<65535.0f ? gamcurve[Z] : (Color::gamma((double)Z/65535.0, gam, gamthresh, gamslope, 1.0, 0.0)*32768.0f);
+
+						labdn->a[i1][j1] = (X-Y);
+						labdn->b[i1][j1] = (Y-Z);
+					}
+				}
 				}
-
 				
+			} else {//image is not raw; use Lab parametrization
+				for (int i=tiletop/*, i1=0*/; i<tilebottom; i++/*, i1++*/) {
+					int i1 = i - tiletop;
+					for (int j=tileleft/*, j1=0*/; j<tileright; j++/*, j1++*/) {
+						int j1 = j - tileleft;
+						float L,a,b;
+						float rLum=src->r(i,j) ;//for luminance denoise curve
+						float gLum=src->g(i,j) ;
+						float bLum=src->b(i,j) ;
+						
+						//use gamma sRGB, not good if TIF (JPG) Output profil not with gamma sRGB  (eg : gamma =1.0, or 1.8...)
+						//very difficult to solve !
+						// solution ==> save TIF with gamma sRGB and re open
+						float rtmp = Color::igammatab_srgb[ src->r(i,j) ];
+						float gtmp = Color::igammatab_srgb[ src->g(i,j) ];
+						float btmp = Color::igammatab_srgb[ src->b(i,j) ];
+						//modification Jacques feb 2013
+						// gamma slider different from raw
+						rtmp = rtmp<65535.0f ? gamcurve[rtmp] : (Color::gamman((double)rtmp/65535.0, gam)*32768.0f);
+						gtmp = gtmp<65535.0f ? gamcurve[gtmp] : (Color::gamman((double)gtmp/65535.0, gam)*32768.0f);
+						btmp = btmp<65535.0f ? gamcurve[btmp] : (Color::gamman((double)btmp/65535.0, gam)*32768.0f);
+						
+						float X,Y,Z;
+						Color::rgbxyz(rtmp,gtmp,btmp,X,Y,Z,wp);
 
-				delete labdn;
-			//	delete noiseh;
-				for (int i=0; i<height; i++)
-					delete [] noisevarlum[i];
-				delete [] noisevarlum;
-				delete [] mad_LL;
-				for (int i=0; i<height; i++)
-					delete [] noisevarchrom[i];
-				delete [] noisevarchrom;
-				for (int i=0; i<height; i++)
-					delete [] noisevarhue[i];
-				delete [] noisevarhue;
-				
-				delete [] mad_aa;
-				delete [] mad_bb;
+						//convert Lab
+						Color::XYZ2Lab(X, Y, Z, L, a, b);
+						float Llum,alum,blum;
+						float XL,YL,ZL;
+						Color::rgbxyz(rLum,gLum,bLum,XL,YL,ZL,wp);
+						Color::XYZ2Lab(XL, YL, ZL, Llum, alum, blum);
+						
+						float kN=Llum;
+						float epsi=0.01f;
 
-			}//end of tile row
-		}//end of tile loop
-        }
-	}
+						if(kN<2.f) kN=2.f;
+						if(kN>32768.f) kN=32768.f;								
+						noisevarlum[i1][j1]=kN;
+						float aN=alum;
+						float bN=blum;
+						float hN=xatan2f(bN,aN);
+						float cN=sqrt(SQR(aN)+SQR(bN));
+						if(cN < 100.f) cN=100.f;//avoid divided by zero
+						noisevarchrom[i1][j1]=cN;
+						noisevarhue[i1][j1]=hN;
+
+						labdn->a[i1][j1] = a;
+						labdn->b[i1][j1] = b;
+					}
+				}
+			}
+			int datalen = labdn->W * labdn->H;
 
-		//end median
-		if(noiseLCurve || noiseCCurve) {
-			for (int i=0; i<hei; i++)
-				delete [] lumcalc[i];
-			delete [] lumcalc;
-			for (int i=0; i<hei; i++)
-				delete [] acalc[i];
-			delete [] acalc;
-			for (int i=0; i<hei; i++)
-				delete [] bcalc[i];
-			delete [] bcalc;
-		}
+			//now perform basic wavelet denoise
+			//last two arguments of wavelet decomposition are max number of wavelet decomposition levels;
+			//and whether to subsample the image after wavelet filtering.  Subsampling is coded as
+			//binary 1 or 0 for each level, eg subsampling = 0 means no subsampling, 1 means subsample
+			//the first level only, 7 means subsample the first three levels, etc.
+			noisevarab_r = SQR(realred) + 0.01f;
+			noisevarab_b = SQR(realblue) + 0.01f;
+			
+			wavelet_decomposition* adecomp;
+			wavelet_decomposition* bdecomp;
+
+			int schoice = 0;//shrink method
+			if (dnparams.smethod == "shalbi")
+				schoice=2;
+
+			const int levwav=5;
+#ifdef _OPENMP
+#pragma omp parallel sections if(multiThread)
+#endif
+{
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			adecomp = new wavelet_decomposition (labdn->data+datalen, labdn->W, labdn->H, levwav, 1 );
+}
+#ifdef _OPENMP
+#pragma omp section
+#endif
+{
+			bdecomp = new wavelet_decomposition (labdn->data+2*datalen, labdn->W, labdn->H, levwav, 1 );
+}
+}
+			bool autoch = dnparams.autochroma;
+			if(comptlevel==0) WaveletDenoiseAll_info(levwav, *adecomp, *bdecomp, noisevarlum, noisevarchrom, noisevarhue, width, height, mad_aa, mad_bb, noisevarab_r, noisevarab_b, labdn, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, schoice, autoch, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc,maxchred, maxchblue, minchred, minchblue, nb,chau ,chred, chblue, perf, multiThread);//enhance mode
+			comptlevel+=1;
+			float chresid,chmaxredresid,chmaxblueresid,chresidred, chresidblue;	
+			nresi=chresid;
+			highresi=chresid + 0.66f*(max(chmaxredresid,chmaxblueresid) - chresid);//evaluate sigma
+			delete adecomp;
+			delete bdecomp;
+			delete labdn;
+			for (int i=0; i<height; i++)
+				delete [] noisevarlum[i];
+			delete [] noisevarlum;
+			for (int i=0; i<height; i++)
+				delete [] noisevarchrom[i];
+			delete [] noisevarchrom;
+			for (int i=0; i<height; i++)
+				delete [] noisevarhue[i];
+			delete [] noisevarhue;
+			
+			delete [] mad_aa;
+			delete [] mad_bb;
+
+		}//end of tile row
+	}//end of tile loop
+
+	for (int i=0; i<hei; i++)
+		delete [] lumcalc[i];
+	delete [] lumcalc;
+	for (int i=0; i<hei; i++)
+		delete [] acalc[i];
+	delete [] acalc;
+	for (int i=0; i<hei; i++)
+		delete [] bcalc[i];
+	delete [] bcalc;
 
 #undef TS
 #undef fTS
 #undef offset
 #undef epsilon
 
-	}//end of main RGB_denoise
-
-
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
+}//end of main RGB_denoise
 
 }
-
diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index 83a85fb65..a52deb047 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -82,6 +82,7 @@ class LUT {
 protected:
 	// list of variables ordered to improve cache speed
 	unsigned int maxs;
+	float maxsf;
 	T * data;
 	unsigned int clip, size;
 private:
@@ -109,6 +110,7 @@ public:
 		owner = 1;
 		size = s;
 		maxs=size-2;
+		maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
 		maxsv =  _mm_set1_ps( maxs );
 		maxsiv = _mm_cvttps_epi32( maxsv );
@@ -130,6 +132,7 @@ public:
 		owner = 1;
 		size = s;
 		maxs=size-2;
+		maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
 		maxsv =  _mm_set1_ps( maxs );
 		maxsiv = _mm_cvttps_epi32( maxsv );
@@ -153,6 +156,7 @@ public:
 		owner = 1;
 		size = s;
 		maxs=size-2;
+		maxsf = (float)maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
 		maxsv =  _mm_set1_ps( size - 2);
 		maxsiv = _mm_cvttps_epi32( maxsv );
@@ -211,6 +215,7 @@ public:
 	      memcpy(this->data,rhs.data,rhs.size*sizeof(T));
 	      this->size=rhs.size;
 	      this->maxs=this->size-2;
+		  this->maxsf = (float)this->maxs;
 #if defined( __SSE2__ ) && defined( __x86_64__ )
 		  this->maxsv =  _mm_set1_ps( this->size - 2);
 		  this->maxsiv = _mm_cvttps_epi32( this->maxsv );
@@ -342,7 +347,7 @@ public:
 				return data[0];
 			idx=0;
 		}
-		else if (index > float(maxs))
+		else if (index > maxsf)
 		{
 			if (clip & LUT_CLIP_ABOVE)
 				return data[size - 1];
diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index 2331362fb..9871d43cc 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -23,7 +23,6 @@
 #include <string.h> 
 #include <math.h>
 #include "alignedbuffer.h"
-
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -31,6 +30,7 @@
 #include "rt_math.h"
 #include "opthelper.h"
 
+
 //using namespace rtengine;
 
 namespace rtengine {
@@ -118,14 +118,13 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, int rady, int W, int H) {
+template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H) {
 
 //printf("boxblur\n");
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
-	float* temp = buffer->data;
+	float* temp = buffer;
 	if (radx==0) {
 		for (int row=0; row<H; row++) 
 			for (int col=0; col<W; col++) {
@@ -133,27 +132,32 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, i
 			}
 	} else {
 		//horizontal blur
-		for (int row = 0; row < H; row++) {
+		for (int row = H-1; row >= 0; row--) {
 			int len = radx + 1;
-			temp[row*W+0] = (float)src[row*W+0];
+			float tempval = (float)src[row*W];
 			for (int j=1; j<=radx; j++) {
-				temp[row*W+0] += (float)src[row*W+j];
+				tempval += (float)src[row*W+j];
 			}
-			temp[row*W+0] = temp[row*W+0]/len;
+			tempval = tempval/len;
+			temp[row*W] = tempval;
 			for (int col=1; col<=radx; col++) {
-				temp[row*W+col] = (temp[row*W+col-1]*len + src[row*W+col+radx])/(len+1);
+				tempval = (tempval*len + src[row*W+col+radx])/(len+1);
+				temp[row*W+col] = tempval;
 				len ++;
 			}
+			float reclen = 1.f/len;
 			for (int col = radx+1; col < W-radx; col++) {
-				temp[row*W+col] = temp[row*W+col-1] + ((float)(src[row*W+col+radx] - src[row*W+col-radx-1]))/len;
+				tempval = tempval + ((float)(src[row*W+col+radx] - src[row*W+col-radx-1]))*reclen;
+				temp[row*W+col] = tempval;
 			}
 			for (int col=W-radx; col<W; col++) {
-				temp[row*W+col] = (temp[row*W+col-1]*len - src[row*W+col-radx-1])/(len-1);
+				tempval = (tempval*len - src[row*W+col-radx-1])/(len-1);
+				temp[row*W+col] = tempval;
 				len --;
 			}
 		}
 	}
-	
+
 	if (rady==0) {
 		for (int row=0; row<H; row++) 
 			for (int col=0; col<W; col++) {
@@ -164,8 +168,45 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, i
 #ifdef __SSE2__
 		__m128	leninitv = _mm_set1_ps( (float)(rady+1));
 		__m128 	onev = _mm_set1_ps( 1.0f );
-		__m128	tempv,lenv,lenp1v,lenm1v;
-		for (int col = 0; col < W-3; col+=4) {
+		__m128	tempv,temp1v,lenv,lenp1v,lenm1v,rlenv;
+		int col;
+		for (col = 0; col < W-7; col+=8) {
+			lenv = leninitv;
+			tempv = LVFU(temp[0*W+col]);
+			temp1v = LVFU(temp[0*W+col+4]);
+			for (int i=1; i<=rady; i++) {
+				tempv = tempv + LVFU(temp[i*W+col]);
+				temp1v = temp1v + LVFU(temp[i*W+col+4]);
+			}
+			tempv = tempv/lenv;
+			temp1v = temp1v/lenv;
+			_mm_storeu_ps( &dst[0*W+col], tempv);
+			_mm_storeu_ps( &dst[0*W+col+4], temp1v);
+			for (int row=1; row<=rady; row++) {
+				lenp1v = lenv + onev;
+				tempv = (tempv*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v;
+				temp1v = (temp1v*lenv + LVFU(temp[(row+rady)*W+col+4]))/lenp1v;
+				_mm_storeu_ps( &dst[row*W+col], tempv);
+				_mm_storeu_ps( &dst[row*W+col+4], temp1v);
+				lenv = lenp1v;
+			}
+			rlenv = onev / lenv;
+			for (int row = rady+1; row < H-rady; row++) {
+				tempv = tempv +(LVFU(temp[(row+rady)*W+col])-LVFU(temp[(row-rady-1)*W+col]))*rlenv ;
+				temp1v = temp1v +(LVFU(temp[(row+rady)*W+col+4])-LVFU(temp[(row-rady-1)*W+col+4]))*rlenv ;
+				_mm_storeu_ps( &dst[row*W+col], tempv);
+				_mm_storeu_ps( &dst[row*W+col+4], temp1v);
+			}
+			for (int row=H-rady; row<H; row++) {
+				lenm1v = lenv - onev;
+				tempv = (tempv*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v;
+				temp1v = (temp1v*lenv - LVFU(temp[(row-rady-1)*W+col+4]))/lenm1v;
+				_mm_storeu_ps( &dst[row*W+col], tempv);
+				_mm_storeu_ps( &dst[row*W+col+4], temp1v);
+				lenv = lenm1v;
+			}
+		}
+		for (; col < W-3; col+=4) {
 			lenv = leninitv;
 			tempv = LVFU(temp[0*W+col]);
 			for (int i=1; i<=rady; i++) {
@@ -179,8 +220,9 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, i
 				_mm_storeu_ps( &dst[row*W+col], tempv);
 				lenv = lenp1v;
 			}
+			rlenv = onev / lenv;
 			for (int row = rady+1; row < H-rady; row++) {
-				tempv = tempv +(LVFU(temp[(row+rady)*W+col])-LVFU(temp[(row-rady-1)*W+col]))/lenv ;
+				tempv = tempv +(LVFU(temp[(row+rady)*W+col])-LVFU(temp[(row-rady-1)*W+col]))*rlenv ;
 				_mm_storeu_ps( &dst[row*W+col], tempv);
 			}
 			for (int row=H-rady; row<H; row++) {
@@ -190,7 +232,7 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, i
 				lenv = lenm1v;
 			}
 		}
-		for (int col = W-(W%4); col < W; col++) {
+		for (; col < W; col++) {
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -230,8 +272,6 @@ template<class T, class A> SSEFUNCTION void boxblur (T* src, A* dst, int radx, i
 #endif
 	}
 	
-	delete buffer;
-	
 }
 
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -704,18 +744,22 @@ template<class T, class A> SSEFUNCTION void boxabsblur (T* src, A* dst, int radx
 			for (int i=1; i<=rady; i++) {
 				tempv = tempv + LVFU(temp[i*W+col]);
 			}
-			_mm_storeu_ps( &dst[0*W+col], tempv / lenv );
+			tempv = tempv / lenv;
+			_mm_storeu_ps( &dst[0*W+col], tempv );
 			for (int row=1; row<=rady; row++) {
 				lenp1v = lenv + onev;
-				_mm_storeu_ps( &dst[row*W+col],(LVFU(dst[(row-1)*W+col])*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v );
+				tempv = (tempv*lenv + LVFU(temp[(row+rady)*W+col]))/lenp1v;
+				_mm_storeu_ps( &dst[row*W+col],tempv);
 				lenv = lenp1v;
 			}
 			for (int row = rady+1; row < H-rady; row++) {
-				_mm_storeu_ps( &dst[row*W+col], LVFU(dst[(row-1)*W+col]) + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))/lenv);
+				tempv = tempv + (LVFU(temp[(row+rady)*W+col])- LVFU(temp[(row-rady-1)*W+col]))/lenv;
+				_mm_storeu_ps( &dst[row*W+col], tempv);
 			}
 			for (int row=H-rady; row<H; row++) {
 				lenm1v = lenv - onev;
-				_mm_storeu_ps( &dst[row*W+col], (LVFU(dst[(row-1)*W+col])*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v);
+				tempv = (tempv*lenv - LVFU(temp[(row-rady-1)*W+col]))/lenm1v;
+				_mm_storeu_ps( &dst[row*W+col], tempv);
 				lenv = lenm1v;
 			}
 	}
diff --git a/rtengine/color.cc b/rtengine/color.cc
index 0a9d688fe..e1ef4250b 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -124,7 +124,7 @@ namespace rtengine {
     void Color::init () {
 
         int maxindex = 65536;
-        cachef(maxindex,0/*LUT_CLIP_BELOW*/);
+        cachef(maxindex,LUT_CLIP_BELOW);
 
         gamma2curve(maxindex,0);
 
@@ -432,6 +432,12 @@ namespace rtengine {
         y = ((xyz_rgb[1][0]*r + xyz_rgb[1][1]*g + xyz_rgb[1][2]*b)) ;
         z = ((xyz_rgb[2][0]*r + xyz_rgb[2][1]*g + xyz_rgb[2][2]*b)) ;
     }
+
+    void Color::rgbxyz (float r, float g, float b, float &x, float &y, float &z, const float xyz_rgb[3][3]) {
+        x = ((xyz_rgb[0][0]*r + xyz_rgb[0][1]*g + xyz_rgb[0][2]*b)) ;
+        y = ((xyz_rgb[1][0]*r + xyz_rgb[1][1]*g + xyz_rgb[1][2]*b)) ;
+        z = ((xyz_rgb[2][0]*r + xyz_rgb[2][1]*g + xyz_rgb[2][2]*b)) ;
+    }
 	
     void Color::xyz2rgb (float x, float y, float z, float &r, float &g, float &b, const double rgb_xyz[3][3]) {
         //Transform to output color.  Standard sRGB is D65, but internal representation is D50
@@ -858,11 +864,11 @@ namespace rtengine {
         float z = Z/D50z;
         float y= Y;
         float fx,fy,fz;
-
-        fx = (x<=65535.0f ? cachef[std::max(x,0.f)] : (327.68f*exp(log(x/MAXVALF)/3.0f )));
-        fy = (y<=65535.0f ? cachef[std::max(y,0.f)] : (327.68f*exp(log(y/MAXVALF)/3.0f )));
-        fz = (z<=65535.0f ? cachef[std::max(z,0.f)] : (327.68f*exp(log(z/MAXVALF)/3.0f )));
-
+
+        fx = (x<=65535.0f ? cachef[x] : (327.68f*xcbrtf(x/MAXVALF)));
+        fy = (y<=65535.0f ? cachef[y] : (327.68f*xcbrtf(x/MAXVALF)));
+        fz = (z<=65535.0f ? cachef[z] : (327.68f*xcbrtf(x/MAXVALF)));
+
         L = (116.0f *  fy - 5242.88f); //5242.88=16.0*327.68;
         a = (500.0f * (fx - fy) );
         b = (200.0f * (fy - fz) );
diff --git a/rtengine/color.h b/rtengine/color.h
index 29462527f..8c12467b5 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -296,6 +296,7 @@ public:
 	* @param xyz_rgb[3][3] transformation matrix to use for the conversion
 	*/
 	static void rgbxyz (float r, float g, float b, float &x, float &y, float &z, const double xyz_rgb[3][3]);
+	static void rgbxyz (float r, float g, float b, float &x, float &y, float &z, const float xyz_rgb[3][3]);
 
 
 	/**
@@ -427,8 +428,8 @@ public:
 
 	}
 	static inline float f2xyz(float f) {
-		const float epsilonExpInv3 = 6.0f/29.0f;
-		const float kappaInv = 27.0f/24389.0f;  // inverse of kappa
+		const float epsilonExpInv3 = 0.20689655f; // 6.0f/29.0f;
+		const float kappaInv = 0.0011070565f; // 27.0f/24389.0f;  // inverse of kappa
 
 		return (f > epsilonExpInv3) ? f*f*f : (116.f * f - 16.f) * kappaInv;
 	}
@@ -891,6 +892,16 @@ public:
 	static inline double gamman      (double x, double gamma) {         //standard gamma without slope...
 										return (x =exp(log(x)/gamma));
 									}
+
+	/**
+	* @brief Very basic gamma
+	* @param x red, green or blue channel's value [0 ; 1]
+	* @param gamma gamma value [1 ; 5]
+	* @return the gamma modified's value [0 ; 1]
+	*/
+	static inline float gammanf      (float x, float gamma) {         //standard gamma without slope...
+										return (x =xexpf(xlogf(x)/gamma));
+									}
 
 
 	/**
diff --git a/rtengine/cplx_wavelet_dec.cc b/rtengine/cplx_wavelet_dec.cc
index 27b885827..abdfbcff8 100644
--- a/rtengine/cplx_wavelet_dec.cc
+++ b/rtengine/cplx_wavelet_dec.cc
@@ -22,19 +22,6 @@
 
 namespace rtengine {
 	
-	cplx_wavelet_decomposition::~cplx_wavelet_decomposition()
-	{
-		for(int i = 0; i <= lvltot; i++) {
-			for (int j=0; j<4; j++) {
-				delete dual_tree[i][j];
-			}
-		}
-		delete[] first_lev_anal;
-		delete[] first_lev_synth;
-		delete[] wavfilt_anal;
-		delete[] wavfilt_synth;
-	}
-	
 	wavelet_decomposition::~wavelet_decomposition()
 	{
 		for(int i = 0; i <= lvltot; i++) {
diff --git a/rtengine/cplx_wavelet_dec.h b/rtengine/cplx_wavelet_dec.h
index de0b0b966..3cadf295a 100644
--- a/rtengine/cplx_wavelet_dec.h
+++ b/rtengine/cplx_wavelet_dec.h
@@ -24,367 +24,89 @@
 #include <cstddef>
 #include <math.h>
 
-
 #include "cplx_wavelet_level.h"
 #include "cplx_wavelet_filter_coeffs.h"
 
 namespace rtengine {
 
-
-// %%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-template <typename A>
-void copy_out(A * a, A * b, size_t datalen)
-{// for standard wavelet decomposition
-	memcpy(b, a, datalen*sizeof(A));
-}
-
-template <typename A, typename B>
-void copy_out(A ** a, B * b, size_t datalen)
-{// for complex wavelet decomposition
-	for (size_t j=0; j<datalen; j++) {
-		b[j] = static_cast<B> (0.25*(a[0][j]+a[1][j]+a[2][j]+a[3][j]));
-	}
-}
-	
-template <typename A, typename B>
-void copy_out(A * a, B * b, size_t datalen)
-{// for standard wavelet decomposition
-	for (size_t j=0; j<datalen; j++) {
-		b[j] = static_cast<B> (a[j]);
-	}
-}
-
-// %%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-class cplx_wavelet_decomposition
-{
-public:
-
-    typedef float internal_type;
-
-private:
-
-  //  static const int maxlevels = 8;//should be greater than any conceivable order of decimation
-    static const int maxlevels = 9;//should be greater than any conceivable order of decimation
-    
-    int lvltot, subsamp;
-    size_t m_w, m_h;//dimensions
-	
-	int first_lev_len, first_lev_offset;
-	float *first_lev_anal;
-	float *first_lev_synth;
-	
-	int wavfilt_len, wavfilt_offset;
-	float *wavfilt_anal;
-	float *wavfilt_synth;
-	
-	int testfilt_len, testfilt_offset;
-	float *testfilt_anal;
-	float *testfilt_synth;
-
-    wavelet_level<internal_type> * dual_tree[maxlevels][4];
-    
-public:
-
-    template<typename E>
-    cplx_wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling);
-    
-    ~cplx_wavelet_decomposition();
-	
-	internal_type ** level_coeffs(int level, int branch) const
-	{
-		return dual_tree[level][branch]->subbands();
-	}
-	
-	int level_W(int level, int branch) const
-	{
-		return dual_tree[level][branch]->width();
-	}
-    
-	int level_H(int level, int branch) const
-	{
-		return dual_tree[level][branch]->height();
-	}
-	
-	int level_pad(int level, int branch) const
-	{
-		return dual_tree[level][branch]->padding();
-	}
-	
-	int maxlevel() const
-	{
-		return lvltot;
-	}
-	
-    template<typename E>
-    void reconstruct(E * dst);
-	
-};
-	
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	template<typename E>
-	cplx_wavelet_decomposition::cplx_wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling)
-	: lvltot(0), subsamp(subsampling), m_w(width), m_h(height)
-	{
-		
-		//initialize wavelet filters
-		
-		first_lev_len = FSFarras_len;
-		first_lev_offset = FSFarras_offset;
-		first_lev_anal = new float[4*first_lev_len];
-		first_lev_synth = new float[4*first_lev_len];
-
-		for (int n=0; n<2; n++) {
-			for (int m=0; m<2; m++) {
-				for (int i=0; i<first_lev_len; i++) {
-					first_lev_anal[first_lev_len*(2*n+m)+i]  = FSFarras_anal[n][m][i]/sqrt(2);
-					first_lev_synth[first_lev_len*(2*n+m)+i] = FSFarras_anal[n][m][first_lev_len-1-i]/sqrt(2);
-				}
-			}
-		}
-		
-		/*first_lev_len = AntonB_len;
-		first_lev_offset = AntonB_offset;
-		first_lev_anal = new float[4*first_lev_len];
-		first_lev_synth = new float[4*first_lev_len];
-		
-		for (int n=0; n<2; n++) {
-			for (int m=0; m<2; m++) {
-				for (int i=0; i<first_lev_len; i++) {
-					first_lev_anal[first_lev_len*(2*n+m)+i]  = AntonB_anal[n][m][i];
-					first_lev_synth[first_lev_len*(2*n+m)+i] = 2*AntonB_synth[n][m][i];
-				}
-			}
-		}*/
-		
-		wavfilt_len = Kingsbury_len;
-		wavfilt_offset = Kingsbury_offset;
-		wavfilt_anal = new float[4*wavfilt_len];
-		wavfilt_synth = new float[4*wavfilt_len];
-		
-		for (int n=0; n<2; n++) {
-			for (int m=0; m<2; m++) {
-				for (int i=0; i<wavfilt_len; i++) {
-					wavfilt_anal[wavfilt_len*(2*n+m)+i]  = Kingsbury_anal[n][m][i]/sqrt(2);
-					wavfilt_synth[wavfilt_len*(2*n+m)+i] = Kingsbury_anal[n][m][first_lev_len-1-i]/sqrt(2);
-				}
-			}
-		}
-		
-		
-		// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-
-		
-		// after coefficient rotation, data structure is:
-		// dual_tree[scale][2*n+m=2*(Re/Im)+dir][channel={lo,hi1,hi2,hi3}][pixel_array]
-		
-		//srand((unsigned)time(0));
-		//for (int i=0; i<m_w*m_h; i++ )
-		//	src[i] = (float)rand()/(float)RAND_MAX;
-
-		for (int n=0; n<2; n++) {
-			for (int m=0; m<2; m++) {
-				lvltot=0;
-				float padding = 0;//1<<(maxlvl-1);
-				dual_tree[0][2*n+m] = new wavelet_level<internal_type>(src, lvltot, subsamp, padding, m_w, m_h, first_lev_anal+first_lev_len*2*n, \
-																				   first_lev_anal+first_lev_len*2*m, first_lev_len, first_lev_offset);
-				while(lvltot < maxlvl) {
-					lvltot++;
-					dual_tree[lvltot][2*n+m] = new wavelet_level<internal_type>(dual_tree[lvltot-1][2*n+m]->lopass()/*lopass*/, lvltot, subsamp, 0/*no padding*/, \
-																							dual_tree[lvltot-1][2*n+m]->width(), \
-																							dual_tree[lvltot-1][2*n+m]->height(), \
-																							wavfilt_anal+wavfilt_len*2*n, wavfilt_anal+wavfilt_len*2*m, \
-																							wavfilt_len, wavfilt_offset);
-				}
-			}
-		}
-		
-		
-		//rotate detail coefficients
-		float coeffave[5][4][3];
-
-		float root2 = sqrt(2);
-		for (int lvl=0; lvl<lvltot; lvl++) {
-			int Wlvl = dual_tree[lvl][0]->width();
-			int Hlvl = dual_tree[lvl][0]->height();
-			for (int n=0; n<4; n++) 
-				for (int m=1; m<4; m++)
-					coeffave[lvl][n][m-1]=0;
-
-			for (int m=1; m<4; m++) {//detail coefficients only
-				for (int i=0; i<Wlvl*Hlvl; i++) {//pixel
-					
-					float wavtmp = (dual_tree[lvl][0]->wavcoeffs[m][i] + dual_tree[lvl][3]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][3]->wavcoeffs[m][i] = (dual_tree[lvl][0]->wavcoeffs[m][i] - dual_tree[lvl][3]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][0]->wavcoeffs[m][i] = wavtmp;
-					
-					wavtmp = (dual_tree[lvl][1]->wavcoeffs[m][i] + dual_tree[lvl][2]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][2]->wavcoeffs[m][i] = (dual_tree[lvl][1]->wavcoeffs[m][i] - dual_tree[lvl][2]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][1]->wavcoeffs[m][i] = wavtmp;
-					
-					for (int n=0; n<4; n++) coeffave[lvl][n][m-1] += fabs(dual_tree[lvl][n]->wavcoeffs[m][i]);
-				}
-			}
-			for (int n=0; n<4; n++) 
-				for (int i=0; i<3; i++) 
-					coeffave[lvl][n][i] /= Wlvl*Hlvl;
-		}
-
-	}
-	
-	/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-	/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-	
-		
-	template<typename E>
-	void cplx_wavelet_decomposition::reconstruct(E * dst) { 
-		
-		// data structure is wavcoeffs[scale][2*n+m=2*(Re/Im)+dir][channel={lo,hi1,hi2,hi3}][pixel_array]
-		
-		//rotate detail coefficients
-		float root2 = sqrt(2);
-		for (int lvl=0; lvl<lvltot; lvl++) {
-			int Wlvl = dual_tree[lvl][0]->width();
-			int Hlvl = dual_tree[lvl][0]->height();
-			for (int i=0; i<Wlvl*Hlvl; i++) {//pixel
-				for (int m=1; m<4; m++) {//detail coefficients only
-					float wavtmp = (dual_tree[lvl][0]->wavcoeffs[m][i] + dual_tree[lvl][3]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][3]->wavcoeffs[m][i] = (dual_tree[lvl][0]->wavcoeffs[m][i] - dual_tree[lvl][3]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][0]->wavcoeffs[m][i] = wavtmp;
-					
-					wavtmp = (dual_tree[lvl][1]->wavcoeffs[m][i] + dual_tree[lvl][2]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][2]->wavcoeffs[m][i] = (dual_tree[lvl][1]->wavcoeffs[m][i] - dual_tree[lvl][2]->wavcoeffs[m][i])/root2;
-					dual_tree[lvl][1]->wavcoeffs[m][i] = wavtmp;
-				}
-			}
-		}
-		
-		internal_type ** tmp = new internal_type *[4];
-		for (int i=0; i<4; i++) {
-			tmp[i] = new internal_type[m_w*m_h];
-		}
-		
-		for (int n=0; n<2; n++) {
-			for (int m=0; m<2; m++) {
-				int skip=1<<(lvltot-1);
-				for (int lvl=lvltot-1; lvl>0; lvl--) {
-					dual_tree[lvl][2*n+m]->reconstruct_level(dual_tree[lvl-1][2*n+m]->wavcoeffs[0], wavfilt_synth+wavfilt_len*2*n, \
-																	wavfilt_synth+wavfilt_len*2*m, wavfilt_len, wavfilt_offset);
-					skip /=2;
-				}
-				dual_tree[0][2*n+m]->reconstruct_level(tmp[2*n+m], first_lev_synth+first_lev_len*2*n, 
-															  first_lev_synth+first_lev_len*2*m, first_lev_len, first_lev_offset);
-			}
-		}
-				
-		copy_out(tmp,dst,m_w*m_h);
-		
-		for (int i=0; i<4; i++) {
-			delete[] tmp[i];
-		}
-		delete[] tmp;
-		
-		
-		
-	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	
 	class wavelet_decomposition
 	{
 	public:
-		
+
 		typedef float internal_type;
-		
+
 	private:
-		
+
 		static const int maxlevels = 9;//should be greater than any conceivable order of decimation
-		
+
 		int lvltot, subsamp;
 		size_t m_w, m_h;//dimensions
-		
+
 		int wavfilt_len, wavfilt_offset;
 		float *wavfilt_anal;
 		float *wavfilt_synth;
-		
+
 		int testfilt_len, testfilt_offset;
 		float *testfilt_anal;
 		float *testfilt_synth;
-		
+
 		wavelet_level<internal_type> * wavelet_decomp[maxlevels];
-		
+
 	public:
-		
+
 		template<typename E>
 		wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling);
-		
+
 		~wavelet_decomposition();
-		
+
 		internal_type ** level_coeffs(int level) const
 		{
 			return wavelet_decomp[level]->subbands();
 		}
-		
+
 		int level_W(int level) const
 		{
 			return wavelet_decomp[level]->width();
 		}
-		
+
 		int level_H(int level) const
 		{
 			return wavelet_decomp[level]->height();
 		}
-		
+
 		int level_pad(int level) const
 		{
 			return wavelet_decomp[level]->padding();
 		}
-		
+
 		int level_stride(int level) const
 		{
 			return wavelet_decomp[level]->stride();
 		}
-		
+
 		int maxlevel() const
 		{
 			return lvltot;
 		}
-		
+
 		int subsample() const
 		{
 			return subsamp;
 		}
-		
 		template<typename E>
 		void reconstruct(E * dst);
-		
 	};
-	
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
+
 	template<typename E>
 	wavelet_decomposition::wavelet_decomposition(E * src, int width, int height, int maxlvl, int subsampling)
 	: lvltot(0), subsamp(subsampling), m_w(width), m_h(height)
 	{
-		
 		//initialize wavelet filters
-		
 		wavfilt_len = Daub4_len;
 		wavfilt_offset = Daub4_offset;
 		wavfilt_anal = new float[2*wavfilt_len];
 		wavfilt_synth = new float[2*wavfilt_len];
-		
+
 		for (int n=0; n<2; n++) {
 			for (int i=0; i<wavfilt_len; i++) {
 				wavfilt_anal[wavfilt_len*(n)+i]  = Daub4_anal[n][i];
@@ -392,18 +114,10 @@ public:
 				//n=0 lopass, n=1 hipass
 			}
 		}
-		
-		
-		// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-		
-		
+
 		// after coefficient rotation, data structure is:
 		// wavelet_decomp[scale][channel={lo,hi1,hi2,hi3}][pixel_array]
-		
-		//srand((unsigned)time(0));
-		//for (int i=0; i<m_w*m_h; i++ )
-		//	src[i] = (float)rand()/(float)RAND_MAX;
-		
+
 		int padding = 0;//pow(2, maxlvl);//must be a multiple of two
 		lvltot=0;
 		wavelet_decomp[lvltot] = new wavelet_level<internal_type>(src, lvltot/*level*/, subsamp, padding/*padding*/, m_w, m_h, \
@@ -414,41 +128,36 @@ public:
 																	  wavelet_decomp[lvltot-1]->width(), wavelet_decomp[lvltot-1]->height(), \
 																	  wavfilt_anal, wavfilt_anal, wavfilt_len, wavfilt_offset);
 		}
-		
-		
 	}
 	
-	/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-	/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-	
-	
 	template<typename E>
 	void wavelet_decomposition::reconstruct(E * dst) { 
-		
-		// data structure is wavcoeffs[scale][channel={lo,hi1,hi2,hi3}][pixel_array]
-		
-		//int skip=1<<(lvltot-1);
+
+		// data structure is wavcoeffs[scale][channel={lo,hi1,hi2,hi3}][pixel_array]
+		int m_w = 0;
+		int m_h2 = 0;
+
+		for(int lvl=0;lvl<lvltot;lvl++) {
+			if(m_w < wavelet_decomp[lvl]->m_w)
+				m_w = wavelet_decomp[lvl]->m_w;
+			if(m_h2 < wavelet_decomp[lvl]->m_h2)
+				m_h2 = wavelet_decomp[lvl]->m_h2;
+		}
+
+		E *tmpLo = new E[m_w*m_h2];
+		E *tmpHi = new E[m_w*m_h2];
+
 		for (int lvl=lvltot-1; lvl>0; lvl--) {
-			wavelet_decomp[lvl]->reconstruct_level(wavelet_decomp[lvl-1]->wavcoeffs[0], wavfilt_synth, wavfilt_synth, wavfilt_len, wavfilt_offset);
+			wavelet_decomp[lvl]->reconstruct_level(tmpLo, tmpHi, wavelet_decomp[lvl-1]->wavcoeffs[0], wavfilt_synth, wavfilt_synth, wavfilt_len, wavfilt_offset);
 			//skip /=2;
 		}
-		
-		internal_type * tmp = new internal_type[m_w*m_h];
 
-		wavelet_decomp[0]->reconstruct_level(tmp, wavfilt_synth, wavfilt_synth, wavfilt_len, wavfilt_offset);
-		
-		copy_out(tmp,dst,m_w*m_h);
-		
-		delete[] tmp;
-		
-		
+		wavelet_decomp[0]->reconstruct_level(tmpLo, tmpHi, dst, wavfilt_synth, wavfilt_synth, wavfilt_len, wavfilt_offset);
+
+		delete[] tmpLo;
+		delete[] tmpHi;
 		
 	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	
 
 };
 
diff --git a/rtengine/cplx_wavelet_filter_coeffs.h b/rtengine/cplx_wavelet_filter_coeffs.h
index ac537d0a1..cfe3619fe 100644
--- a/rtengine/cplx_wavelet_filter_coeffs.h
+++ b/rtengine/cplx_wavelet_filter_coeffs.h
@@ -17,108 +17,9 @@
  *  2012 Emil Martinec <ejmartin@uchicago.edu>
  */
 
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-//#include "improcfun.h"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 
 namespace rtengine {
 
-
-// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	
-	
-/*const int AntonB_len = 12;//length of filter
-const int AntonB_offset = 6;//offset
-	
-const float AntonB_anal[2][2][12] = {//analysis filter
-		{{0, -0.08838834764832, 0.08838834764832, 0.69587998903400, 0.69587998903400, 
-			0.08838834764832, -0.08838834764832, 0.01122679215254, 0.01122679215254, 0},
-		{0, 0, 0, 0.04563588155712, -0.02877176311425, -0.29563588155712 , 
-			0.55754352622850, -0.29563588155713, -0.02877176311425, 0.04563588155712, 0, 0}},
-		{{0 , 0 , 0.02674875741081, -0.01686411844287, -0.07822326652899, 0.26686411844288, 
-			0.60294901823636, 0.26686411844287, -0.07822326652899, -0.01686411844287, 0.02674875741081, 0},
-		{0 , 0 , 0, 0 , 0.04563588155712, -0.02877176311425, 
-			-0.29563588155712 , 0.55754352622850, -0.29563588155713, -0.02877176311425, 0.04563588155712 , 0}} };
-	
-const float AntonB_synth[2][2][12] = {//synthesis filter
-		{{0 , 0 , 0, -0.04563588155712, -0.02877176311425, 0.29563588155712, 
-			0.55754352622850, 0.29563588155713, -0.02877176311425, -0.04563588155712, 0, 0},
-		{0, 0.02674875741081, 0.01686411844287, -0.07822326652899, -0.26686411844288 , 0.60294901823636, 
-			-0.26686411844287, -0.07822326652899, 0.01686411844287, 0.02674875741081, 0, 0}},
-		{{0 , 0, -0.04563588155712, -0.02877176311425, 0.29563588155712 , 0.55754352622850 , 
-			0.29563588155713, -0.02877176311425, -0.04563588155712, 0, 0 , 0},
-		{0.02674875741081 , 0.01686411844287, -0.07822326652899, -0.26686411844288 , 0.60294901823636, -0.26686411844287, 
-			-0.07822326652899, 0.01686411844287 , 0.02674875741081 , 0 , 0, 0}} };*/
-
-	
-/*	for (int i=0; i<4; i++)
-		for (int n=0; n<12; n++) {
-			AntonB.synth[i][n] *= 2;
-		}*/
-
-// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	
-
-
-const int FSFarras_len=10;//length of filter
-const int FSFarras_offset=4;//offset
-
-const float FSFarras_anal[2][2][10] = {//analysis filter
-	{{0, -0.08838834764832, 0.08838834764832, 0.69587998903400, 0.69587998903400, 0.08838834764832, -0.08838834764832, 0.01122679215254 , 0.01122679215254, 0},
-		{ 0, -0.01122679215254, 0.01122679215254, 0.08838834764832, 0.08838834764832, -0.69587998903400, 0.69587998903400, -0.08838834764832, -0.08838834764832, 0}},
-	{{0.01122679215254, 0.01122679215254, -0.08838834764832, 0.08838834764832, 0.69587998903400, 0.69587998903400, 0.08838834764832, -0.08838834764832, 0, 0},
-		{0, 0, -0.08838834764832, -0.08838834764832, 0.69587998903400, -0.69587998903400, 0.08838834764832, 0.08838834764832, 0.01122679215254, -0.01122679215254}} };
-
-//synthesis filter is the reverse (see cplx_wavelet_dec.h)
-
-
-// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	
-
-/*
- % Kingsbury Q-filters for the dual-tree complex DWT
- %
- % af{i},i=1,2-analysis filters for tree i
- % sf{i},i=1,2-synthesis filters for tree i
- % note:af{2} is the reverse of af{1}
- % ordering is {af[1],af[2],sf[1],sf[2]}
- % REFERENCE:% N.G.Kingsbury,"A dual-tree complex wavelet
- %    transform with improved orthogonality and symmetry
- %    properties",Proceedings of the IEEE Int.Conf.on
- % Image Proc.(ICIP),2000  */
-
-const int Kingsbury_len=10;//length of filter
-const int Kingsbury_offset=4;//offset
-
-const float Kingsbury_anal[2][2][10] =	{//analysis filter
-	{{0.03516384000000, 0, -0.08832942000000, 0.23389032000000, 0.76027237000000, 0.58751830000000, 0, -0.11430184000000 , 0, 0},
-		{ 0, 0, -0.11430184000000, 0, 0.58751830000000, -0.76027237000000, 0.23389032000000, 0.08832942000000, 0, -0.03516384000000}},
-	{{0, 0, -0.11430184000000, 0, 0.58751830000000, 0.76027237000000, 0.23389032000000, -0.08832942000000, 0, 0.03516384000000},
-		{-0.03516384000000, 0, 0.08832942000000, 0.23389032000000, -0.76027237000000, 0.58751830000000, 0, -0.11430184000000, 0, 0}} };
-
-//synthesis filter is the reverse (see cplx_wavelet_dec.h)
-
-/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-
-const int Haar_len=2;//length of filter
-const int Haar_offset=1;//offset
-
-const float Haar_anal[2][2] =	{{0.5,0.5}, {0.5,-0.5}};//analysis filter
-
-//synthesis filter is the reverse (see cplx_wavelet_dec.h)
-
-/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-
-const int LeGall_len=6;
-const int LeGall_offset=2;
-const float LeGall_anal[2][6] = {{0, 0.25, 0.5, 0.25, 0, 0}, {0, -0.125, -0.25, 0.75, -0.25, -0.125}};
-const float LeGall_synth[2][6] = {{-0.125, 0.25, 0.75, 0.25, -0.125, 0}, {0, 0, -0.25, 0.5, -0.25, 0}};
-	
-/* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */
-
 const int Daub4_len=6;
 const int Daub4_offset=2;
 const float Daub4_anal[2][6] = {//analysis filter
diff --git a/rtengine/cplx_wavelet_level.h b/rtengine/cplx_wavelet_level.h
index d9b93ce2a..a5454a0ec 100644
--- a/rtengine/cplx_wavelet_level.h
+++ b/rtengine/cplx_wavelet_level.h
@@ -15,75 +15,63 @@
  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  2010 Ilya Popov <ilia_popov@rambler.ru>
- *  2012 Emil Martinec <ejmartin@uchicago.edu>
+ *  2012 Emil Martinec <ejmartin@uchicago.edu>
+ *  2014 Ingo Weyrich <heckflosse@i-weyrich.de>
  */
 
 #ifndef CPLX_WAVELET_LEVEL_H_INCLUDED
 #define CPLX_WAVELET_LEVEL_H_INCLUDED
 
 #include <cstddef>
-#include <algorithm>
-
-#include "gauss.h"
 #include "rt_math.h"
+#include "opthelper.h"
 
 namespace rtengine {
 
-	
-	//////////////////////////////////////////////////////////////////////////////
-	
 	template<typename T>
 	class wavelet_level
 	{
-		// full size
-		size_t m_w, m_h;
-		
-		// size of low frequency part
-		size_t m_w2, m_h2;
-		
+
 		// size of padded border
 		size_t m_pad;
-		
+
 		// level of decomposition
 		int lvl;
-		
+
 		// whether to subsample the output
 		bool subsamp_out;
-		
+
 		// spacing of filter taps
 		int skip;
-		
+
 		// allocation and destruction of data storage
 		T ** create(size_t n);
 		void destroy(T ** subbands);
-		
-		// load a row/column of input data, possibly with padding
-		template<typename E>
-		void loadbuffer(E * src, E * dst, int srclen, int pitch);
-		
-		void AnalysisFilter (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen);
-		void SynthesisFilter (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, 
-							  float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen);
-		
-		void AnalysisFilterHaar (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen);
-		void SynthesisFilterHaar (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, int pitch, int dstlen);
-		
-		void AnalysisFilterSubsamp (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen);
-		void SynthesisFilterSubsamp (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, 
-							  float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen);
-		
-		void AnalysisFilterSubsampHaar (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen);
-		void SynthesisFilterSubsampHaar (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen);
-		
-		void imp_nr (T* src, int width, int height, double thresh);
 
-		
+		// load a row/column of input data, possibly with padding
+
+		void AnalysisFilterHaarVertical (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen);
+		void AnalysisFilterHaarHorizontal (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen);
+		void SynthesisFilterHaarHorizontal (T * srcLo, T * srcHi, T * dst, int dstlen);
+		void SynthesisFilterHaarVertical (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen);
+
+		void AnalysisFilterSubsampHorizontal (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
+							 int taps, int offset, int pitch, int srclen, int m_w2);
+		void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
+							 int taps, int offset, int pitch, int srclen);
+		void SynthesisFilterSubsampHorizontal (T * srcLo, T * srcHi, T * dst,
+							  float *filterLo, float *filterHi, int taps, int offset, int dstlen);
+		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen);
+
 	public:
-		
+
 		T ** wavcoeffs;
-		
+				// full size
+		size_t m_w, m_h;
+
+		// size of low frequency part
+		size_t m_w2, m_h2;
+
 		template<typename E>
 		wavelet_level(E * src, int level, int subsamp, int padding, size_t w, size_t h, float *filterV, float *filterH, int len, int offset)
 		: m_w(w), m_h(h), m_w2(w), m_h2(h), m_pad(padding), wavcoeffs(NULL), lvl(level), skip(1<<level), subsamp_out((subsamp>>level)&1)
@@ -102,501 +90,317 @@ namespace rtengine {
 			decompose_level(src, filterV, filterH, len, offset);
 			
 		}
-		
+
 		~wavelet_level()
 		{
 			destroy(wavcoeffs);
 		}
-		
+
 		T ** subbands() const
 		{
 			return wavcoeffs;
 		}
-		
+
 		T * lopass() const
 		{
 			return wavcoeffs[0];
 		}
-		
+
 		size_t width() const
 		{
 			return m_w2;
 		}
-		
+
 		size_t height() const
 		{
 			return m_h2;
 		}
-		
+
 		size_t padding() const
 		{
 			return m_pad/skip;
 		}
-		
+
 		size_t stride() const
 		{
 			return skip;
 		}
-		
+
 		template<typename E>
 		void decompose_level(E *src, float *filterV, float *filterH, int len, int offset);
-		
+
 		template<typename E>
-		void reconstruct_level(E *dst, float *filterV, float *filterH, int len, int offset);
-		
+		void reconstruct_level(E* tmpLo, E* tmpHi, E *dst, float *filterV, float *filterH, int taps, int offset);
 	};
-	
-	//////////////////////////////////////////////////////////////////////////////
-	
-	
-	
+
 	template<typename T>
-	T ** wavelet_level<T>::create(size_t n)
-	{
+	T ** wavelet_level<T>::create(size_t n)	{
 		T * data = new T[4*n];
 		T ** subbands = new T*[4];
-		for(size_t j = 0; j < 4; j++)
-		{
+		for(size_t j = 0; j < 4; j++) {
 			subbands[j] = data + n * j;
 		}
 		return subbands;
 	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	
+
 	template<typename T>
-	void wavelet_level<T>::destroy(T ** subbands)
-	{
-		if(subbands)
-		{
+	void wavelet_level<T>::destroy(T ** subbands) {
+		if(subbands) {
 			delete[] subbands[0];
 			delete[] subbands;
 		}
 	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-
-	template<typename T> template<typename E>
-	void wavelet_level<T>::loadbuffer(E * src, E * dst, int pitch, int srclen)
-	{
-		E * tmp = dst + m_pad;
-		memset(dst, 0, (MAX(m_w2,m_h2))*sizeof(E));
-		
-		//create padded buffer from src data
-		for (size_t i = 0, j = 0; i<srclen; i++, j += pitch)
-		{
-			tmp[i] = src[j];
-		}	
-		
-		// extend each coset mirror-like by padding amount 'm_pad' and to a multiple of 'skip'
-		for (size_t i=1; i<=MIN(srclen-1,m_pad); i++) {
-			tmp[-i] = tmp[i];
-			tmp[srclen+i-1] = tmp[srclen-i-1];
-		}
-		for (size_t i=0; i<srclen%skip; i++) {
-			tmp[srclen+m_pad+i] = tmp[srclen+m_pad-i-2];
-		}
-		
-	}
-	
-	
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	template<typename T>
-	void wavelet_level<T>::AnalysisFilter (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-												int taps, int offset, int pitch, int srclen) {
-		
-		/* Basic convolution code
-		 * Applies an FIR filter 'filter' with filter length 'taps', 
-		 * aligning the 'offset' element of the filter with
-		 * the input pixel, and skipping 'skip' pixels between taps 
-		 *
-		 */
-		
-				
-		for (size_t i = 0; i < (srclen); i++) {
-			float lo=0,hi=0;
-			if (i>skip*taps && i<srclen-skip*taps) {//bulk
-				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-					lo += filterLo[j] * srcbuffer[i-l];//lopass channel
-					hi += filterHi[j] * srcbuffer[i-l];//hipass channel
-				}
-			} else {//boundary
-				for (int j=0; j<taps; j++) {
-					int arg = MAX(0,MIN(i+skip*(offset-j),srclen-1));//clamped BC's
-					lo += filterLo[j] * srcbuffer[arg];//lopass channel
-					hi += filterHi[j] * srcbuffer[arg];//hipass channel
-				}
-			}
-			
-			dstLo[(pitch*(i))] = lo;
-			dstHi[(pitch*(i))] = hi;
-		}
-		
-	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
 
 	template<typename T>
-	void wavelet_level<T>::SynthesisFilter (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, float *filterLo, 
-												 float *filterHi, int taps, int offset, int pitch, int dstlen) {
-		
-		/* Basic convolution code
-		 * Applies an FIR filter 'filter' with filter length 'taps', 
-		 * aligning the 'offset' element of the filter with
-		 * the input pixel, and skipping 'skip' pixels between taps 
-		 *
-		 */
-		
-		
-		// load into buffer
-		
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
-		
-		for (size_t i=0, j=0; i<srclen; i++, j+=pitch) {
-			bufferLo[i]=srcLo[j];
-			bufferHi[i]=srcHi[j];
-		}
-		
-		int shift=skip*(taps-offset-1);
-		for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
-			float tot=0;
-			if (i>skip*taps && i<(srclen-skip*taps)) {//bulk
-				for (int j=0, l=-shift; j<taps; j++, l+=skip) {
-					tot += (filterLo[j] * bufferLo[i-l] + filterHi[j] * bufferHi[i-l]);
-				}
-			} else {//boundary
-				for (int j=0, l=-shift; j<taps; j++, l+=skip) {
-					int arg = MAX(0,MIN((i-l),srclen-1));//clamped BC's
-					tot += (filterLo[j] * bufferLo[arg] + filterHi[j] * bufferHi[arg]);
-				}
-			}
-			
-			dst[pitch*(i-m_pad)] = tot;
+	void wavelet_level<T>::AnalysisFilterHaarHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen) {
 
-		}
-		
-	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	template<typename T>
-	void wavelet_level<T>::AnalysisFilterHaar (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen) {
-		
 		/* Basic convolution code
 		 * Applies a Haar filter 
-		 *
-		 */										
-		
+		*/										
+		for(int j=0;j<pitch;j++) {
+			for(int i = 0; i < (srclen - skip); i++) {
+				dstLo[j*srclen+i] = 0.5f*(srcbuffer[j*srclen+i] + srcbuffer[j*srclen+i+skip]);
+				dstHi[j*srclen+i] = 0.5f*(srcbuffer[j*srclen+i] - srcbuffer[j*srclen+i+skip]);
+			}
+			for(size_t i = max(srclen-skip,skip); i < (srclen); i++) {
+				dstLo[j*srclen+i] = 0.5f*(srcbuffer[j*srclen+i] + srcbuffer[j*srclen+i-skip]);
+				dstHi[j*srclen+i] = 0.5f*(srcbuffer[j*srclen+i] - srcbuffer[j*srclen+i-skip]);
+			}
+		}
+	}
+
+	template<typename T> void wavelet_level<T>::AnalysisFilterHaarVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen) {
+
+	/* Basic convolution code
+	 * Applies a Haar filter 
+	*/
 		for(int i = 0; i < (srclen - skip); i++) {
-			dstLo[(pitch*(i))] = 0.5*(srcbuffer[i] + srcbuffer[i+skip]);
-			dstHi[(pitch*(i))] = 0.5*(srcbuffer[i] - srcbuffer[i+skip]);
+			for(int j=0;j<pitch;j++) {
+				dstLo[(pitch*(i))+j] = 0.5f*(srcbuffer[i*pitch+j] + srcbuffer[(i+skip)*pitch+j]);
+				dstHi[(pitch*(i))+j] = 0.5f*(srcbuffer[i*pitch+j] - srcbuffer[(i+skip)*pitch+j]);
+			}
 		}
-		// Start the loop at max(srclen-skip,skip) to avoid buffer underrun
 		for(size_t i = max(srclen-skip,skip); i < (srclen); i++) {
-			dstLo[(pitch*(i))] = 0.5*(srcbuffer[i] + srcbuffer[i-skip]);
-			dstHi[(pitch*(i))] = 0.5*(srcbuffer[i] - srcbuffer[i-skip]);
-		}
-																		 
+			for(int j=0;j<pitch;j++) {
+				dstLo[(pitch*(i))+j] = 0.5f*(srcbuffer[i*pitch+j] + srcbuffer[(i-skip)*pitch+j]);
+				dstHi[(pitch*(i))+j] = 0.5f*(srcbuffer[i*pitch+j] - srcbuffer[(i-skip)*pitch+j]);
+			}
+		}
 	}
-	
+
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
 	
-	template<typename T>
-	void wavelet_level<T>::SynthesisFilterHaar (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, int pitch, int dstlen) {
-		
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int dstlen) {
+
 		/* Basic convolution code
 		 * Applies a Haar filter 
 		 *
 		 */
-		
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
-		
-		for (size_t i=0, j=0; i<srclen; i++, j+=pitch) {
-			bufferLo[i]=srcLo[j];
-			bufferHi[i]=srcHi[j];
-		}
-		
-		for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
-			dst[pitch*(i-m_pad)] = 0.5*(bufferLo[i] + bufferHi[i] + bufferLo[i-skip] - bufferHi[i-skip]);			
-		}
-		
-		for(size_t i = (m_pad); i < (m_pad+skip); i++) {
-			dst[pitch*(i-m_pad)] = (bufferLo[i] + bufferHi[i]);			
+
+		for (int k=0; k<m_h2; k++) {
+			for(size_t i = (m_pad); i < (m_pad+skip); i++) {
+				dst[k*m_w+(i-m_pad)] = (srcLo[k*m_w2+i] + srcHi[k*m_w2+i]);			
+			}
+			for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
+				dst[k*m_w+(i-m_pad)] = 0.5f*(srcLo[k*m_w2+i] + srcHi[k*m_w2+i] + srcLo[k*m_w2+i-skip] - srcHi[k*m_w2+i-skip]);			
+			}
 		}
 	}
+
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int pitch, int dstlen) {
 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-
-
+		/* Basic convolution code
+		 * Applies a Haar filter 
+		 *
+		 */
+
+		for(size_t i = (m_pad); i < (m_pad+skip); i++) {
+			for(int j=0;j<pitch;j++)
+				dst[pitch*(i-m_pad)+j] = (srcLo[i*pitch+j] + srcHi[i*pitch+j]);			
+		}
+		for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
+			for(int j=0;j<pitch;j++)
+				dst[pitch*(i-m_pad)+j] = 0.5f*(srcLo[i*pitch+j] + srcHi[i*pitch+j] + srcLo[(i-skip)*pitch+j] - srcHi[(i-skip)*pitch+j]);			
+		}
+	}
+
 	template<typename T>
-	void wavelet_level<T>::AnalysisFilterSubsamp (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-												int taps, int offset, int pitch, int srclen) {
-		
+	void wavelet_level<T>::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *filterHi, 
+												int taps, int offset, int pitch, int srclen, int m_w2) {
+
 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
 		 * aligning the 'offset' element of the filter with
 		 * the input pixel, and skipping 'skip' pixels between taps 
 		 * Output is subsampled by two
 		 */
-		
+
 		// calculate coefficients
-		
-		for(int i = 0; i < (srclen); i+=2) {
-			float lo=0,hi=0;
-			if (i>skip*taps && i<srclen-skip*taps) {//bulk
-				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-					lo += filterLo[j] * srcbuffer[i-l];//lopass channel
-					hi += filterHi[j] * srcbuffer[i-l];//hipass channel
+		for(int k=0;k<pitch;k++) {
+			for(int i = 0; i < srclen; i+=2) {
+				float lo = 0.f, hi = 0.f;
+				if (LIKELY(i>skip*taps && i<srclen-skip*taps)) {//bulk
+					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+						float src = srcbuffer[k*srclen+i-l];
+						lo += filterLo[j] * src;//lopass channel
+						hi += filterHi[j] * src;//hipass channel
+					}
+				} else {
+					for (int j=0; j<taps; j++) {
+						int arg = max(0,min(i+skip*(offset-j),srclen-1));//clamped BC's
+						lo += filterLo[j] * srcbuffer[k*srclen+arg];//lopass channel
+						hi += filterHi[j] * srcbuffer[k*srclen+arg];//hipass channel
+					}
+				}
+				dstLo[k*m_w2+((i/2))] = lo;
+				dstHi[k*m_w2+((i/2))] = hi;
+			}
+		}
+	}
+
+	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, 
+													int taps, int offset, int pitch, int srclen) {
+			/* Basic convolution code
+			 * Applies an FIR filter 'filter' with filter length 'taps', 
+			 * aligning the 'offset' element of the filter with
+			 * the input pixel, and skipping 'skip' pixels between taps 
+			 * Output is subsampled by two
+			 */
+
+			// calculate coefficients
+
+		for(int i = 0; i < srclen; i+=2) {
+			if (LIKELY(i>skip*taps && i<srclen-skip*taps)) {//bulk
+				for (int k=0; k<pitch; k++) {
+					float lo = 0.f, hi = 0.f;
+					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+						lo += filterLo[j] * srcbuffer[(i-l)*pitch+k];//lopass channel
+						hi += filterHi[j] * srcbuffer[(i-l)*pitch+k];//hipass channel
+					}
+					dstLo[(pitch*(i/2))+k] = lo;
+					dstHi[(pitch*(i/2))+k] = hi;
 				}
 			} else {//boundary
-				for (int j=0; j<taps; j++) {
-					int arg = MAX(0,MIN(i+skip*(offset-j),srclen-1));//clamped BC's
-					lo += filterLo[j] * srcbuffer[arg];//lopass channel
-					hi += filterHi[j] * srcbuffer[arg];//hipass channel
+				for (int k=0; k<pitch; k++) {
+					float lo = 0.f, hi = 0.f;
+					for (int j=0; j<taps; j++) {
+						int arg = max(0,min(i+skip*(offset-j),srclen-1))*pitch+k;//clamped BC's
+						lo += filterLo[j] * srcbuffer[arg];//lopass channel
+						hi += filterHi[j] * srcbuffer[arg];//hipass channel
+					}
+					dstLo[(pitch*(i/2))+k] = lo;
+					dstHi[(pitch*(i/2))+k] = hi;
 				}
 			}
-			
-			dstLo[(pitch*(i/2))] = lo;
-			dstHi[(pitch*(i/2))] = hi;
-		}
-		
+		}
 	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	template<typename T>
-	void wavelet_level<T>::SynthesisFilterSubsamp (T * srcLo, T * srcHi, T * dst, T *bufferLo, T *bufferHi, 
-												 float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen) {
-		
+
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int dstlen) {
+
 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
 		 * aligning the 'offset' element of the filter with
 		 * the input pixel, and skipping 'skip' pixels between taps 
 		 * Output is subsampled by two
 		 */
-		
-		
-		
+
 		// calculate coefficients
-		
 		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
-		
-		//fill a buffer with a given row/column of data
-		for (size_t i=0, j=0; i<srclen; i++, j+=pitch) {
-			bufferLo[i]=srcLo[j];
-			bufferHi[i]=srcHi[j];
+		int shift = skip*(taps-offset-1);//align filter with data
+
+		for (int k=0; k<m_h2; k++) {
+			for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
+				float tot=0.f;
+				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+				int i_src = (i+shift)/2;
+				int begin = (i+shift)%2;
+				if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						tot += ((filterLo[j] * srcLo[k*m_w2+i_src-l] + filterHi[j] * srcHi[k*m_w2+i_src-l]));
+					}
+				} else {//boundary
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srclen-1));//clamped BC's
+						tot += ((filterLo[j] * srcLo[k*m_w2+arg] + filterHi[j] * srcHi[k*m_w2+arg]));
+					}
+				}
+				dst[k*m_w+(i-m_pad)] = 2.f * tot;
+			}
 		}
-				
-		int shift=skip*(taps-offset-1);//align filter with data
+	}
+
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int pitch, int dstlen) {
+
+			/* Basic convolution code
+			 * Applies an FIR filter 'filter' with filter length 'taps', 
+			 * aligning the 'offset' element of the filter with
+			 * the input pixel, and skipping 'skip' pixels between taps 
+			 * Output is subsampled by two
+			 */
+
+		// calculate coefficients
+		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
+		int shift=skip*(taps-offset-1);//align filter with data
+
 		for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
-			
-			float tot=0;
-			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
 			int i_src = (i+shift)/2;
 			int begin = (i+shift)%2;
-			if (i>skip*taps && i<(srclen-skip*taps)) {//bulk
-				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-					tot += 2*((filterLo[j] * bufferLo[i_src-l] + filterHi[j] * bufferHi[i_src-l]));
-				}
+			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+			if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
+				for (int k=0; k<pitch; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						tot += ((filterLo[j] * srcLo[(i_src-l)*pitch+k] + filterHi[j] * srcHi[(i_src-l)*pitch+k]));
+					}
+					dst[pitch*(i-m_pad)+k] = 2.f * tot;
+				}
 			} else {//boundary
-				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-					int arg = MAX(0,MIN((i_src-l),srclen-1));//clamped BC's
-					tot += 2*((filterLo[j] * bufferLo[arg] + filterHi[j] * bufferHi[arg]));
+				for (int k=0; k<pitch; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srclen-1))*pitch+k;//clamped BC's
+						tot += ((filterLo[j] * srcLo[arg] + filterHi[j] * srcHi[arg]));
+					}
+					dst[pitch*(i-m_pad)+k] = 2.f * tot;
 				}
-			}
-			
-			dst[pitch*(i-m_pad)] = tot;
-			
+			}
 		}
-		
 	}
-	
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	
-	template<typename T>
-	void wavelet_level<T>::AnalysisFilterSubsampHaar (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen) {
-		
-		/* Basic convolution code
-		 * Applies a Haar filter
-		 * Output is subsampled by two
-		 */
-		
-		// calculate coefficients
-		
-		for(size_t i = 0; i < (srclen - skip); i+=2) {
-			dstLo[(pitch*(i/2))] = 0.5*(srcbuffer[i] + srcbuffer[i+skip]);
-			dstHi[(pitch*(i/2))] = 0.5*(srcbuffer[i] - srcbuffer[i+skip]);
-		}
-		
-		for(size_t i = (srclen-skip)-((srclen-skip)&1); i < (srclen); i+=2) {
-			dstLo[(pitch*(i/2))] = 0.5*(srcbuffer[i] + srcbuffer[i-skip]);
-			dstHi[(pitch*(i/2))] = 0.5*(srcbuffer[i] - srcbuffer[i-skip]);
-		}
-		
-	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	
-	template<typename T>
-	void wavelet_level<T>::SynthesisFilterSubsampHaar (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen) {
-		
-		/* Basic convolution code
-		 * Applies a Haar filter 
-		 * Input was subsampled by two
-		 */
-		
-		
-		// calculate coefficients
-				
-		//TODO: this code is buggy...
-		for (int n=0; n<skip; n++) {
-			for (size_t i = m_pad; i < (dstlen+m_pad-2*skip); i+=2*skip) {
-				dst[pitch*(i-m_pad+n)] = srcLo[i/2+n]+srcHi[i/2+n];
-				dst[pitch*(i-m_pad+skip+n)] = srcLo[i/2+n]-srcHi[i/2+n];
-			}
-		}
-		
-		if ((dstlen+m_pad-2*skip)<dstlen-1) {
-			for (int n=0; n<skip; n++) {
-				for (size_t i=(dstlen+m_pad-2*skip); i<dstlen+m_pad-1; i++) {
-					dst[pitch*(dstlen-m_pad+n)] = srcLo[i/2+n]+srcHi[i/2+n];
-				}
-			}
-		}
-		
-		
-	}
-	
 
+	template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, float *filterV, float *filterH, int taps, int offset) { 
 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	
-	template<typename T> template<typename E>
-	void wavelet_level<T>::decompose_level(E *src, float *filterV, float *filterH, int taps, int offset) { 
-		
 		T *tmpLo = new T[m_w*m_h2];
 		T *tmpHi = new T[m_w*m_h2];
-		
-		T *buffer = new T[MAX(m_w,m_h)+2*m_pad+skip];
-		
-		/* filter along columns */
-//OpenMP here		
-		for (int j=0; j<m_w; j++) {
-			loadbuffer(src+j, buffer, m_w/*pitch*/, m_h/*srclen*/);//pad a column of data and load it to buffer
-			if (subsamp_out) {
-				AnalysisFilterSubsamp (buffer, tmpLo+j, tmpHi+j, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/);
-				//AnalysisFilterSubsampHaar (buffer, tmpLo+j, tmpHi+j, m_w, m_h);
-			} else {
-				//AnalysisFilter (buffer, tmpLo+j, tmpHi+j, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/);
-				AnalysisFilterHaar (buffer, tmpLo+j, tmpHi+j, m_w, m_h);
-			}
+
+		/* filter along rows and columns */
+		if(subsamp_out) {
+			AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/);
+			AnalysisFilterSubsampHorizontal (tmpLo, wavcoeffs[0], wavcoeffs[1], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2);
+			AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2);
+		} else {
+			AnalysisFilterHaarVertical (src, tmpLo, tmpHi, m_w, m_h);
+			AnalysisFilterHaarHorizontal (tmpLo, wavcoeffs[0], wavcoeffs[1], m_h, m_w);
+			AnalysisFilterHaarHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], m_h, m_w);
 		}
-		
-		/* filter along rows */
-//OpenMP here		
-		for (int i=0; i<m_h2; i++) {
-			loadbuffer(tmpLo+i*m_w, buffer, 1/*pitch*/, m_w/*srclen*/);//pad a row of data and load it to buffer
-			if (subsamp_out) {
-				AnalysisFilterSubsamp (buffer, wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, filterH, filterH+taps, taps, offset, 1/*output_pitch*/, m_w/*srclen*/);
-				//AnalysisFilterSubsampHaar (buffer, wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, 1, m_w);
-			} else {
-				//AnalysisFilter (buffer, wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, filterH, filterH+taps, taps, offset, 1/*output_pitch*/, m_w/*srclen*/);
-				AnalysisFilterHaar (buffer, wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, 1, m_w);
-			}
-			loadbuffer(tmpHi+i*m_w, buffer, 1/*pitch*/, m_w/*srclen*/);
-			if (subsamp_out) {
-				AnalysisFilterSubsamp (buffer, wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, filterH, filterH+taps, taps, offset, 1/*output_pitch*/, m_w/*srclen*/);
-				//AnalysisFilterSubsampHaar (buffer, wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, 1, m_w);
-			} else {
-				//AnalysisFilter (buffer, wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, filterH, filterH+taps, taps, offset, 1/*output_pitch*/, m_w/*srclen*/);
-				AnalysisFilterHaar (buffer, wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, 1, m_w);
-			}
-		}
-				
+
 		delete[] tmpLo;
 		delete[] tmpHi;
-		delete[] buffer;
-	}
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
-	
-	template<typename T> template<typename E>
-	void wavelet_level<T>::reconstruct_level(E *dst, float *filterV, float *filterH, int taps, int offset) { 
-		
-		T *tmpLo = new T[m_w*m_h2];
-		T *tmpHi = new T[m_w*m_h2];
-		
-		int buflen = MAX(m_w2,m_h2);
-		float *bufferLo = new float[buflen];
-		float *bufferHi = new float[buflen];
-		
-		/* filter along rows */
-//OpenMP here		
-		for (int i=0; i<m_h2; i++) {
-			
-			if (subsamp_out) {
-				SynthesisFilterSubsamp (wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, tmpLo+i*m_w, bufferLo, bufferHi,
-										filterH, filterH+taps, taps, offset, 1/*pitch*/, m_w/*dstlen*/);
-				SynthesisFilterSubsamp (wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, tmpHi+i*m_w, bufferLo, bufferHi,
-										filterH, filterH+taps, taps, offset, 1/*pitch*/, m_w/*dstlen*/);
-				//SynthesisFilterSubsampHaar (wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, tmpLo+i*m_w, 1, m_w);//TODO: this is buggy
-				//SynthesisFilterSubsampHaar (wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, tmpHi+i*m_w, 1, m_w);
-			} else {
-				//SynthesisFilter (wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, tmpLo+i*m_w, bufferLo, bufferHi,
-				//				 filterH, filterH+taps, taps, offset, 1/*pitch*/, m_w/*dstlen*/);
-				//SynthesisFilter (wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, tmpHi+i*m_w, bufferLo, bufferHi,
-				//				 filterH, filterH+taps, taps, offset, 1/*pitch*/, m_w/*dstlen*/);
-				SynthesisFilterHaar (wavcoeffs[0]+i*m_w2, wavcoeffs[1]+i*m_w2, tmpLo+i*m_w, bufferLo, bufferHi, 1, m_w);
-				SynthesisFilterHaar (wavcoeffs[2]+i*m_w2, wavcoeffs[3]+i*m_w2, tmpHi+i*m_w, bufferLo, bufferHi, 1, m_w);
-			}
-		}
-		
-		/* filter along columns */
-//OpenMP here		
-		for (int j=0; j<m_w; j++) {
-			if (subsamp_out) {
-				SynthesisFilterSubsamp (tmpLo+j, tmpHi+j, dst+j, bufferLo, bufferHi,
-										filterV, filterV+taps, taps, offset, m_w/*pitch*/, m_h/*dstlen*/);
-				//SynthesisFilterSubsampHaar (tmpLo+j, tmpHi+j, dst+j, m_w, m_h);
-			} else {
-				//SynthesisFilter (tmpLo+j, tmpHi+j, dst+j, bufferLo, bufferHi,
-				//				 filterV, filterV+taps, taps, offset, m_w/*pitch*/, m_h/*dstlen*/);
-				SynthesisFilterHaar (tmpLo+j, tmpHi+j, dst+j, bufferLo, bufferHi, m_w, m_h);
-			}
-		}
-		
-		
-		delete[] tmpLo;
-		delete[] tmpHi;
-		delete[] bufferLo;
-		delete[] bufferHi;
-		
 	}
 
-	
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	template<typename T> template<typename E> void wavelet_level<T>::reconstruct_level(E* tmpLo, E* tmpHi, E *dst, float *filterV, float *filterH, int taps, int offset) { 
+
+		/* filter along rows and columns */
+		if (subsamp_out) {
+			SynthesisFilterSubsampHorizontal (wavcoeffs[0], wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
+			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
+			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterV, filterV+taps, taps, offset, m_w/*pitch*/, m_h/*dstlen*/);
+		} else {
+			SynthesisFilterHaarHorizontal (wavcoeffs[0], wavcoeffs[1], tmpLo, m_w);
+			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w);
+			SynthesisFilterHaarVertical (tmpLo, tmpHi, dst, m_w, m_h);
+		}
+	}
 	
 };
 
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index e02de9e38..5391a67f3 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -23,7 +23,6 @@
 #include "refreshmap.h"
 #include "rt_math.h"
 #include "colortemp.h"
-//#include "../rtgui/cropguilistener.h"
 
 // "ceil" rounding
 #define SKIPS(a,b) ((a) / (b) + ((a) % (b) > 0))
@@ -168,8 +167,7 @@ void Crop::update (int todo) {
 		float autoNRmax = (float) settings->nrautomax;//
 		float autohigh = (float) settings->nrhigh;//
 		
-		DirPyrDenoiseParams denoiseParams = params.dirpyrDenoise;
-		denoiseParams.getCurves(noiseLCurve, noiseCCurve);
+		params.dirpyrDenoise.getCurves(noiseLCurve, noiseCCurve);
 
 		int tilesize;
 		int overlap;
@@ -204,20 +202,18 @@ void Crop::update (int todo) {
 		for(int cX=0;cX<numtiles_W;cX++) centerTile_X[cX]= tileWskip/2 + tileWskip*cX;
 		for(int cY=0;cY<numtiles_H;cY++) centerTile_Y[cY]= tileHskip/2 + tileHskip*cY;
 		if(settings->leveldnautsimpl==1){
-		if(denoiseParams.Cmethod=="MAN" || denoiseParams.Cmethod=="PON" )  {
-			PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
-			parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
-		}
-		}
-		else {
-		if(denoiseParams.C2method=="MANU")  {
-			PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
-			parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
-		}
-		
+			if(params.dirpyrDenoise.Cmethod=="MAN" || params.dirpyrDenoise.Cmethod=="PON" )  {
+				PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
+				parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
+			}
+		} else {
+			if(params.dirpyrDenoise.C2method=="MANU")  {
+				PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
+				parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
+			}
 		}
 		/*
-		if(denoiseParams.Cmethod=="PON") {
+		if(params.dirpyrDenoise.Cmethod=="PON") {
 			MyTime t1dce,t2dce;
 			t1dce.set();
 
@@ -227,7 +223,7 @@ void Crop::update (int todo) {
 			origCropPart = new Imagefloat (crW, crH);//allocate memory
 		
 		int skipP=1;
-		if (skip==1 && denoiseParams.enabled) {//evaluate Noise
+		if (skip==1 && params.dirpyrDenoise.enabled) {//evaluate Noise
 		
 			for(int wcr=0;wcr<numtiles_W;wcr++) {
 				for(int hcr=0;hcr<numtiles_H;hcr++) {
@@ -251,7 +247,7 @@ void Crop::update (int todo) {
 
 					float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L;
 					chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f;chromina=0.f; sigma=0.f;
-					parent->ipf.RGB_denoise_info(provi, provi, provicalc, parent->imgsrc->isRAW(), denoiseParams, params.defringe, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve,lldenoiseutili, noiseCCurve,ccdenoiseutili, chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L);
+					parent->ipf.RGB_denoise_info(provi, provi, provicalc, parent->imgsrc->isRAW(), params.dirpyrDenoise, params.defringe, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve,lldenoiseutili, noiseCCurve,ccdenoiseutili, chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L);
 					//printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
 					if(maxredaut > maxblueaut) {
 						maxr=(maxredaut-chaut)/(autoNRmax/2.f);
@@ -281,11 +277,10 @@ void Crop::update (int todo) {
 		}
 		*/
 		
-		if((settings->leveldnautsimpl==1 && denoiseParams.Cmethod=="PRE") || (settings->leveldnautsimpl==0 && denoiseParams.C2method=="PREV")) {
-		
+		if((settings->leveldnautsimpl==1 && params.dirpyrDenoise.Cmethod=="PRE") || (settings->leveldnautsimpl==0 && params.dirpyrDenoise.C2method=="PREV")) {
 			PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
 			parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
-			if((!isDetailWindow) && parent->adnListener && skip==1 && denoiseParams.enabled) {
+			if((!isDetailWindow) && parent->adnListener && skip==1 && params.dirpyrDenoise.enabled) {
 				float lowdenoise=1.f;
 				int levaut=settings->leveldnaut;
 				if(levaut==1)  //Standard
@@ -310,85 +305,72 @@ void Crop::update (int todo) {
 				if(settings->leveldnv ==2) {crW=int(tileWskip/2);crH=int(tileHskip/2);}
 				if(settings->leveldnv ==3) {crW=tileWskip-10;crH=tileHskip-10;}
 
+				float adjustr=1.f;	
+				if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}
+				else if (params.icm.working=="Adobe RGB")  {adjustr = 1.f/1.3f;}
+				else if (params.icm.working=="sRGB")       {adjustr = 1.f/1.3f;}
+				else if (params.icm.working=="WideGamut")  {adjustr =1.f/1.1f;}
+				else if (params.icm.working=="Beta RGB")   {adjustr =1.f/1.2f;}
+				else if (params.icm.working=="BestRGB")    {adjustr =1.f/1.2f;}
+				else if (params.icm.working=="BruceRGB")   {adjustr =1.f/1.2f;}
+				if(parent->adnListener)
+					parent->adnListener->noiseTilePrev (centerTile_X[poscenterX], centerTile_Y[poscenterY],CenterPreview_X,CenterPreview_Y,crW, trafw*skip);
+			// I have tried "blind" some solutions..to move review ...but GUI is not my truc !
+			//	int W,H;
+			//	cropgl->cropMoved (centerTile_X[poscenterX],centerTile_Y[poscenterY] , W, H);
+			 //   cropImageListener->setPosition (int x, int y, bool update=true);
+			 //   bool update;
+			 //   cropImageListener->setPosition (centerTile_X[poscenterX],centerTile_Y[poscenterY] , true);
+				//setCropSizes (centerTile_X[poscenterX], centerTile_Y[poscenterY], trafw*skip,trafh*skip , skip, true);
 
-				if (skip==1 && denoiseParams.enabled) {
-					float adjustr=1.f;	
-					if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}
-					else if (params.icm.working=="Adobe RGB")  {adjustr = 1.f/1.3f;}
-					else if (params.icm.working=="sRGB")       {adjustr = 1.f/1.3f;}
-					else if (params.icm.working=="WideGamut")  {adjustr =1.f/1.1f;}
-					else if (params.icm.working=="Beta RGB")   {adjustr =1.f/1.2f;}
-					else if (params.icm.working=="BestRGB")    {adjustr =1.f/1.2f;}
-					else if (params.icm.working=="BruceRGB")   {adjustr =1.f/1.2f;}
-					if(parent->adnListener) parent->adnListener->noiseTilePrev (centerTile_X[poscenterX], centerTile_Y[poscenterY],CenterPreview_X,CenterPreview_Y,crW, trafw*skip);
-				// I have tried "blind" some solutions..to move review ...but GUI is not my truc !
-				//	int W,H;
-				//	cropgl->cropMoved (centerTile_X[poscenterX],centerTile_Y[poscenterY] , W, H);
-				 //   cropImageListener->setPosition (int x, int y, bool update=true);
-				 //   bool update;
-				 //   cropImageListener->setPosition (centerTile_X[poscenterX],centerTile_Y[poscenterY] , true);
-					//setCropSizes (centerTile_X[poscenterX], centerTile_Y[poscenterY], trafw*skip,trafh*skip , skip, true);
-
-					Imagefloat *provi;
-					Imagefloat *provicalc;
-					provi = new Imagefloat (cropw, croph);//
-						if(origCrop !=  provi)
-							origCrop->copyData(provi);
-					provicalc = new Imagefloat (cropw, croph);//for Luminance denoise curve
-						if(origCrop !=  provicalc)
-							origCrop->copyData(provicalc);
-			
-					parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
-			
-					float maxr=0.f;
-					float maxb=0.f;
-					float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc;
-					int Nb;
-					
-					chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f;minredaut=0.f; minblueaut=0.f;
-					LUTf gamcurve(65536,0);
-					float gam, gamthresh, gamslope;
-					parent->ipf.RGB_denoise_infoGamCurve(denoiseParams, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope);
-
-					parent->ipf.RGB_denoise_info(provi, provi, provicalc, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut,nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
+				Imagefloat *provicalc = new Imagefloat (cropw, croph);//for Luminance denoise curve
+				origCrop->copyData(provicalc);
+				parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
+				float maxr=0.f;
+				float maxb=0.f;
+				float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc;
+				int Nb;
+				
+				chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f;minredaut=0.f; minblueaut=0.f;
+				LUTf gamcurve(65536,0);
+				float gam, gamthresh, gamslope;
+				parent->ipf.RGB_denoise_infoGamCurve(params.dirpyrDenoise, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope);
+				parent->ipf.RGB_denoise_info(origCrop, provicalc, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, params.dirpyrDenoise, parent->imgsrc->getDirPyrDenoiseExpComp(), chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut,nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc, true);
 //					printf("redy=%f skin=%f pcskin=%f\n",redyel, skinc,nsknc);
 //					printf("DCROP skip=%d cha=%4.0f Nb=%d red=%4.0f bl=%4.0f redM=%4.0f bluM=%4.0f  L=%4.0f sigL=%4.0f Ch=%4.0f Si=%4.0f\n",skip, chaut,Nb, redaut,blueaut, maxredaut, maxblueaut, lumema, sigma_L, chromina, sigma);
-					float multip=1.f;
-					if(!parent->imgsrc->isRAW()) multip=2.f;//take into account gamma for TIF / JPG approximate value...not good for gamma=1
-					
-					float maxmax=max(maxredaut,maxblueaut);	
-					float delta;
-					int mode=0;
-				//	float redyel, skinc, nsknc;
-					int lissage=settings->leveldnliss;
-					parent->ipf.calcautodn_info (chaut, delta, Nb, levaut, maxmax, lumema, chromina, mode, lissage, redyel, skinc, nsknc);
-					
-					
-					if(maxredaut > maxblueaut) {
-					//	maxr=(maxredaut-chaut)/((autoNRmax*multip*adjustr)/2.f);
-						maxr=(delta)/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
-						if(minblueaut <= minredaut  && minblueaut < chaut) maxb=(-chaut+minblueaut)/(autoNRmax*multip*adjustr*lowdenoise);
-					} 
-					else {
-					//	maxb=(maxblueaut-chaut)/((autoNRmax*multip*adjustr)/2.f);
-						maxb=(delta)/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
-						if(minredaut <= minblueaut  && minredaut < chaut) maxr=(-chaut+minredaut)/(autoNRmax*multip*adjustr*lowdenoise);
-					}//maxb mxr - empirical evaluation red / blue
-					
+				float multip=1.f;
+				if(!parent->imgsrc->isRAW()) multip=2.f;//take into account gamma for TIF / JPG approximate value...not good for gamma=1
 				
-					denoiseParams.chroma=chaut/(autoNR*multip*adjustr*lowdenoise);
-					denoiseParams.redchro=maxr;
-					denoiseParams.bluechro=maxb;
-					parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
-					
-					delete provicalc;
-					delete provi;
-				}	
+				float maxmax=max(maxredaut,maxblueaut);	
+				float delta;
+				int mode=0;
+			//	float redyel, skinc, nsknc;
+				int lissage=settings->leveldnliss;
+				parent->ipf.calcautodn_info (chaut, delta, Nb, levaut, maxmax, lumema, chromina, mode, lissage, redyel, skinc, nsknc);
+				
+				
+				if(maxredaut > maxblueaut) {
+				//	maxr=(maxredaut-chaut)/((autoNRmax*multip*adjustr)/2.f);
+					maxr=(delta)/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
+					if(minblueaut <= minredaut  && minblueaut < chaut) maxb=(-chaut+minblueaut)/(autoNRmax*multip*adjustr*lowdenoise);
+				} 
+				else {
+				//	maxb=(maxblueaut-chaut)/((autoNRmax*multip*adjustr)/2.f);
+					maxb=(delta)/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
+					if(minredaut <= minblueaut  && minredaut < chaut) maxr=(-chaut+minredaut)/(autoNRmax*multip*adjustr*lowdenoise);
+				}//maxb mxr - empirical evaluation red / blue
+				
+			
+				params.dirpyrDenoise.chroma=chaut/(autoNR*multip*adjustr*lowdenoise);
+				params.dirpyrDenoise.redchro=maxr;
+				params.dirpyrDenoise.bluechro=maxb;
+				parent->adnListener->chromaChanged(params.dirpyrDenoise.chroma, params.dirpyrDenoise.redchro, params.dirpyrDenoise.bluechro);
+				
+				delete provicalc;
 			}
 		}
 
-		if((settings->leveldnautsimpl==1 && denoiseParams.Cmethod=="AUT")  || (settings->leveldnautsimpl==0 && denoiseParams.C2method=="AUTO")) {
-		
+		if(skip==1 && params.dirpyrDenoise.enabled && ((settings->leveldnautsimpl==1 && params.dirpyrDenoise.Cmethod=="AUT")  || (settings->leveldnautsimpl==0 && params.dirpyrDenoise.C2method=="AUTO"))) {
 			MyTime t1aue,t2aue;
 			t1aue.set();
 		
@@ -403,68 +385,63 @@ void Crop::update (int todo) {
 			int levaut=settings->leveldnaut;
 			if(levaut==1)  //Standard
 				lowdenoise=0.7f;
-		
 
-			if (skip==1 && denoiseParams.enabled) {//evaluate Noise
 			LUTf gamcurve(65536,0);
 			float gam, gamthresh, gamslope;
-			parent->ipf.RGB_denoise_infoGamCurve(denoiseParams, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope);
+			parent->ipf.RGB_denoise_infoGamCurve(params.dirpyrDenoise, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope);
 			int Nb[9];
+#ifdef _OPENMP
+#pragma omp parallel			
+#endif
+{
+			Imagefloat *origCropPart;//init auto noise		
+			origCropPart = new Imagefloat (crW, crH);//allocate memory
+			Imagefloat *provicalc;
+			provicalc = new Imagefloat (crW, crH);			
 
-			#pragma omp parallel			
-			{	
-				Imagefloat *origCropPart;//init auto noise		
-				origCropPart = new Imagefloat (crW, crH);//allocate memory
-				Imagefloat *provi;
-				Imagefloat *provicalc;
-				provi = new Imagefloat (crW, crH);
-				provicalc = new Imagefloat (crW, crH);			
-			
-				int  coordW[3];//coordonate of part of image to mesure noise
-				int  coordH[3];
-				int begW=50;
-				int begH=50;
-				coordW[0]=begW;coordW[1]=widIm/2-crW/2;coordW[2]=widIm-crW-begW;
-				coordH[0]=begH;coordH[1]=heiIm/2-crH/2;coordH[2]=heiIm-crH-begH;
-				#pragma omp for schedule(dynamic) collapse(2) nowait								
-				for(int wcr=0;wcr<=2;wcr++) {
-					for(int hcr=0;hcr<=2;hcr++) {
-						PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, skipP);
-						parent->imgsrc->getImage (parent->currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
-			
-						if(origCropPart !=  provi)
-							origCropPart->copyData(provi);
-						if(origCropPart !=  provicalc)
-							origCropPart->copyData(provicalc);
-						parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
-						//float maxr=0.f;
-						//float maxb=0.f;
-						float pondcorrec=1.0f;
-						float chaut=0.f, redaut=0.f, blueaut=0.f, maxredaut=0.f, maxblueaut=0.f, minredaut=0.f, minblueaut=0.f, nresi=0.f, highresi=0.f, chromina=0.f, sigma=0.f, lumema=0.f, sigma_L=0.f, redyel=0.f, skinc=0.f, nsknc=0.f;
-						int nb=0;
+			int  coordW[3];//coordonate of part of image to mesure noise
+			int  coordH[3];
+			int begW=50;
+			int begH=50;
+			coordW[0]=begW;coordW[1]=widIm/2-crW/2;coordW[2]=widIm-crW-begW;
+			coordH[0]=begH;coordH[1]=heiIm/2-crH/2;coordH[2]=heiIm-crH-begH;
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic) collapse(2) nowait								
+#endif
+			for(int wcr=0;wcr<=2;wcr++) {
+				for(int hcr=0;hcr<=2;hcr++) {
+					PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, skipP);
+					parent->imgsrc->getImage (parent->currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
+
+					if(origCropPart !=  provicalc)
+						origCropPart->copyData(provicalc);
+					parent->imgsrc->convertColorSpace(provicalc, params.icm, parent->currWB, params.raw);//for denoise luminance curve
+					//float maxr=0.f;
+					//float maxb=0.f;
+					float pondcorrec=1.0f;
+					float chaut=0.f, redaut=0.f, blueaut=0.f, maxredaut=0.f, maxblueaut=0.f, minredaut=0.f, minblueaut=0.f, nresi=0.f, highresi=0.f, chromina=0.f, sigma=0.f, lumema=0.f, sigma_L=0.f, redyel=0.f, skinc=0.f, nsknc=0.f;
+					int nb=0;
 //						chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f; chromina=0.f; sigma=0.f;
-						parent->ipf.RGB_denoise_info(provi, provi, provicalc, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
+					parent->ipf.RGB_denoise_info(origCropPart, provicalc, parent->imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, params.dirpyrDenoise, parent->imgsrc->getDirPyrDenoiseExpComp(), chaut, nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
+
+				//printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
+					Nb[hcr*3 + wcr] = nb;
+					ch_M[hcr*3 + wcr]=pondcorrec*chaut;
+					max_r[hcr*3 + wcr]=pondcorrec*maxredaut;
+					max_b[hcr*3 + wcr]=pondcorrec*maxblueaut;		
+					min_r[hcr*3 + wcr]=pondcorrec*minredaut;
+					min_b[hcr*3 + wcr]=pondcorrec*minblueaut;
+					lumL[hcr*3 + wcr]=lumema;	
+					chromC[hcr*3 + wcr]=chromina;	
+					ry[hcr*3 + wcr]=redyel;	
+					sk[hcr*3 + wcr]=skinc;	
+					pcsk[hcr*3 + wcr]=nsknc;	
 					
-					
-					//printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
-						Nb[hcr*3 + wcr] = nb;
-						ch_M[hcr*3 + wcr]=pondcorrec*chaut;
-						max_r[hcr*3 + wcr]=pondcorrec*maxredaut;
-						max_b[hcr*3 + wcr]=pondcorrec*maxblueaut;		
-						min_r[hcr*3 + wcr]=pondcorrec*minredaut;
-						min_b[hcr*3 + wcr]=pondcorrec*minblueaut;
-						lumL[hcr*3 + wcr]=lumema;	
-						chromC[hcr*3 + wcr]=chromina;	
-						ry[hcr*3 + wcr]=redyel;	
-						sk[hcr*3 + wcr]=skinc;	
-						pcsk[hcr*3 + wcr]=nsknc;	
-						
-					}
-				} 
+				}
+			} 
 			delete provicalc;				
-			delete provi;			
 			delete origCropPart;
-			}
+}
 			float chM=0.f;
 			float MaxR=0.f;
 			float MaxB=0.f;
@@ -480,86 +457,82 @@ void Crop::update (int todo) {
 			float MaxBMoy=0.f;
 			float MinRMoy=0.f;
 			float MinBMoy=0.f;
-			
+
 			float multip=1.f;
 			if(!parent->imgsrc->isRAW()) multip=2.f;//take into account gamma for TIF / JPG approximate value...not good fot gamma=1
 			float adjustr=1.f;	
-				if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}// 
-				else if (params.icm.working=="Adobe RGB")  {adjustr = 1.f/1.3f;}
-				else if (params.icm.working=="sRGB")       {adjustr = 1.f/1.3f;}
-				else if (params.icm.working=="WideGamut")  {adjustr =1.f/1.1f;}
-				else if (params.icm.working=="Beta RGB")   {adjustr =1.f/1.2f;}
-				else if (params.icm.working=="BestRGB")    {adjustr =1.f/1.2f;}
-				else if (params.icm.working=="BruceRGB")   {adjustr =1.f/1.2f;}
+			if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}// 
+			else if (params.icm.working=="Adobe RGB")  {adjustr = 1.f/1.3f;}
+			else if (params.icm.working=="sRGB")       {adjustr = 1.f/1.3f;}
+			else if (params.icm.working=="WideGamut")  {adjustr =1.f/1.1f;}
+			else if (params.icm.working=="Beta RGB")   {adjustr =1.f/1.2f;}
+			else if (params.icm.working=="BestRGB")    {adjustr =1.f/1.2f;}
+			else if (params.icm.working=="BruceRGB")   {adjustr =1.f/1.2f;}
 
-				float maxmax;
-				float minmin;				
-				float delta[9];
-				int mode=1;
-				float redyel, skinc, nsknc;
-				int lissage=settings->leveldnliss;
-				for(int k=0;k<9;k++) {
-					maxmax=max(max_r[k],max_b[k]);				
-					parent->ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax,lumL[k],chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]);
-				//	printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]);
-				}
-				for(int k=0;k<9;k++) {
-					if(max_r[k] > max_b[k]) {
-						minmin=min(min_r[k],min_b[k]);
-						Max_R[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
-						Min_B[k]= -(ch_M[k]- min_b[k])/(autoNRmax*multip*adjustr*lowdenoise);
-						Max_B[k]=0.f;
-						Min_R[k]=0.f;
-					} 
-					else {
-						minmin=min(min_r[k],min_b[k]);	
-						Max_B[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
-						Min_R[k]=- (ch_M[k]-min_r[k])	/ (autoNRmax*multip*adjustr*lowdenoise);
-						Min_B[k]=0.f;
-						Max_R[k]=0.f;
-						
-					}
-				}
-				
-				for(int k=0;k<9;k++) {
-				//	printf("ch_M= %f Max_R=%f Max_B=%f min_r=%f min_b=%f\n",ch_M[k],Max_R[k], Max_B[k],Min_R[k], Min_B[k]);
-					chM+=ch_M[k];
-					MaxBMoy+=Max_B[k];
-					MaxRMoy+=Max_R[k];
-					MinRMoy+=Min_R[k];
-					MinBMoy+=Min_B[k];
-					
-					if(Max_R[k]>MaxR) MaxR=Max_R[k];
-					if(Max_B[k]>MaxB) MaxB=Max_B[k];
-					if(Min_R[k]<MinR) MinR=Min_R[k];
-					if(Min_B[k]<MinB) MinB=Min_B[k];
-				
-				}
-				chM/=9;
-				MaxBMoy/=9;
-				MaxRMoy/=9;
-				MinBMoy/=9;
-				MinRMoy/=9;
-				
-				if(MaxR > MaxB) {
-					maxr=MaxRMoy + (MaxR-MaxRMoy)*0.66f;//#std Dev
-					//maxb=MinB;
-					maxb=MinBMoy + (MinB-MinBMoy)*0.66f;
-					
+			float maxmax;
+			float minmin;				
+			float delta[9];
+			int mode=1;
+			float redyel, skinc, nsknc;
+			int lissage=settings->leveldnliss;
+			for (int k=0;k<9;k++) {
+				maxmax=max(max_r[k],max_b[k]);				
+				parent->ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax,lumL[k],chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]);
+			//	printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]);
+			}
+			for (int k=0;k<9;k++) {
+				if(max_r[k] > max_b[k]) {
+					minmin=min(min_r[k],min_b[k]);
+					Max_R[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
+					Min_B[k]= -(ch_M[k]- min_b[k])/(autoNRmax*multip*adjustr*lowdenoise);
+					Max_B[k]=0.f;
+					Min_R[k]=0.f;
 				} 
 				else {
-					maxb=MaxBMoy + (MaxB-MaxBMoy)*0.66f;
-					maxr=MinRMoy + (MinR-MinRMoy)*0.66f;
+					minmin=min(min_r[k],min_b[k]);	
+					Max_B[k]=(delta[k])/((autoNRmax*multip*adjustr*lowdenoise)/2.f);
+					Min_R[k]=- (ch_M[k]-min_r[k])	/ (autoNRmax*multip*adjustr*lowdenoise);
+					Min_B[k]=0.f;
+					Max_R[k]=0.f;
 				}
-			
+			}
+
+			for (int k=0;k<9;k++) {
+			//	printf("ch_M= %f Max_R=%f Max_B=%f min_r=%f min_b=%f\n",ch_M[k],Max_R[k], Max_B[k],Min_R[k], Min_B[k]);
+				chM+=ch_M[k];
+				MaxBMoy+=Max_B[k];
+				MaxRMoy+=Max_R[k];
+				MinRMoy+=Min_R[k];
+				MinBMoy+=Min_B[k];
+
+				if(Max_R[k]>MaxR) MaxR=Max_R[k];
+				if(Max_B[k]>MaxB) MaxB=Max_B[k];
+				if(Min_R[k]<MinR) MinR=Min_R[k];
+				if(Min_B[k]<MinB) MinB=Min_B[k];
+			}
+			chM/=9;
+			MaxBMoy/=9;
+			MaxRMoy/=9;
+			MinBMoy/=9;
+			MinRMoy/=9;
+
+			if(MaxR > MaxB) {
+				maxr=MaxRMoy + (MaxR-MaxRMoy)*0.66f;//#std Dev
+				//maxb=MinB;
+				maxb=MinBMoy + (MinB-MinBMoy)*0.66f;
+			}
+			else {
+				maxb=MaxBMoy + (MaxB-MaxBMoy)*0.66f;
+				maxr=MinRMoy + (MinR-MinRMoy)*0.66f;
+			}
+
 //					printf("DCROP skip=%d cha=%f red=%f bl=%f \n",skip, chM,maxr,maxb);
-					denoiseParams.chroma=chM/(autoNR*multip*adjustr);
-					denoiseParams.redchro=maxr;
-					denoiseParams.bluechro=maxb;
-					if(parent->adnListener) {
-						parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
-					}
-		}	
+			params.dirpyrDenoise.chroma=chM/(autoNR*multip*adjustr);
+			params.dirpyrDenoise.redchro=maxr;
+			params.dirpyrDenoise.bluechro=maxb;
+			if(parent->adnListener) {
+				parent->adnListener->chromaChanged(params.dirpyrDenoise.chroma, params.dirpyrDenoise.redchro, params.dirpyrDenoise.bluechro);
+			}
 		if (settings->verbose) {
 			t2aue.set();
 			printf("Info denoise auto performed in %d usec:\n", t2aue.etime(t1aue));
@@ -567,13 +540,14 @@ void Crop::update (int todo) {
 			
 		//end evaluate noise
 		}
-	//	if(denoiseParams.Cmethod=="AUT" || denoiseParams.Cmethod=="PON") {//reinit origCrop after Auto
-		if((settings->leveldnautsimpl==1 && denoiseParams.Cmethod=="AUT")  || (settings->leveldnautsimpl==0 && denoiseParams.C2method=="AUTO")) {//reinit origCrop after Auto
+	//	if(params.dirpyrDenoise.Cmethod=="AUT" || params.dirpyrDenoise.Cmethod=="PON") {//reinit origCrop after Auto
+		if((settings->leveldnautsimpl==1 && params.dirpyrDenoise.Cmethod=="AUT")  || (settings->leveldnautsimpl==0 && params.dirpyrDenoise.C2method=="AUTO")) {//reinit origCrop after Auto
 			PreviewProps pp (trafx, trafy, trafw*skip, trafh*skip, skip);
 			parent->imgsrc->getImage (parent->currWB, tr, origCrop, pp, params.toneCurve, params.icm, params.raw );
 		}
+		DirPyrDenoiseParams denoiseParams = params.dirpyrDenoise;
 
-		if(denoiseParams.Lmethod == "CUR") {
+		if(params.dirpyrDenoise.Lmethod == "CUR") {
 			if(noiseLCurve)
 				denoiseParams.luma = 0.5f;	//very small value to init process - select curve or slider
 			else
@@ -585,8 +559,7 @@ void Crop::update (int todo) {
 
 		if((noiseLCurve || noiseCCurve ) && skip==1 && denoiseParams.enabled)	{//only allocate memory if enabled and skip
 			calclum = new Imagefloat (cropw, croph);//for Luminance denoise curve
-				if(origCrop !=  calclum)
-					origCrop->copyData(calclum);
+			origCrop->copyData(calclum);
 		
 			parent->imgsrc->convertColorSpace(calclum, params.icm, parent->currWB, params.raw);//for denoise luminance curve
 		}
@@ -598,15 +571,13 @@ void Crop::update (int todo) {
 
 				float chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi;
                 parent->ipf.RGB_denoise(kall, origCrop, origCrop, calclum, ch_M, max_r, max_b, parent->imgsrc->isRAW(), /*Roffset,*/ denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi);
-				if(parent->adnListener) parent->adnListener->noiseChanged(nresi, highresi);	
-				if(settings->leveldnautsimpl==1) {
-				if((denoiseParams.Cmethod=="AUT" || denoiseParams.Cmethod=="PRE") && (parent->adnListener)) // force display value of sliders
-					parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
-				}
-				else {
-				if((denoiseParams.C2method=="AUTO" || denoiseParams.C2method=="PREV") && (parent->adnListener)) // force display value of sliders
-					parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
-				
+				if (parent->adnListener) parent->adnListener->noiseChanged(nresi, highresi);	
+				if (settings->leveldnautsimpl == 1) {
+					if ((denoiseParams.Cmethod == "AUT" || denoiseParams.Cmethod == "PRE") && (parent->adnListener)) // force display value of sliders
+						parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
+				} else {
+					if((denoiseParams.C2method == "AUTO" || denoiseParams.C2method == "PREV") && (parent->adnListener)) // force display value of sliders
+						parent->adnListener->chromaChanged(denoiseParams.chroma, denoiseParams.redchro, denoiseParams.bluechro);
 				}
 				
 			}
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index ef72c6ad6..5fda98920 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -35,7 +35,6 @@
 #include "iccmatrices.h"
 #include "color.h"
 #include "calc_distort.h"
-#include "cplx_wavelet_dec.h"
 #include "boxblur.h"
 #include "rt_math.h"
 #include "EdgePreservingDecomposition.h"
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index e287c3eb8..3bc10fc0b 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -31,7 +31,6 @@
 #include "LUT.h"
 #include "lcp.h"
 #include "curves.h"
-#include <fftw3.h>
 #include "cplx_wavelet_dec.h"
 #include "editbuffer.h"
 
@@ -193,7 +192,7 @@ class ImProcFunctions {
 		float blueresid;
 //		float maxredresid;//used by noise_residual
 //		float maxblueresid;//used by noise_residual
-		int comptlevel;
+//		int comptlevel;
 
 		static void initCache ();
 		static void cleanupCache ();
@@ -276,8 +275,8 @@ class ImProcFunctions {
 		void Tile_calc (int tilesize, int overlap, int kall, int imwidth, int imheight, int &numtiles_W, int &numtiles_H, int &tilewidth, int &tileheight, int &tileWskip, int &tileHskip);
 
 		void RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp,const NoiseCurve & ctNoisCurve , const NoiseCurve & ctNoisCCcurve , float &chaut, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &nresi, float &highresi);
-		void RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope);
-		void RGB_denoise_info(Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp,const NoiseCurve & ctNoisCurve, const NoiseCurve & ctNoisCCcurve, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut,float &nresi, float &highresi, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc);
+		void RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, const bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope);
+		void RGB_denoise_info(Imagefloat * src, Imagefloat * calclum, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut,float &nresi, float &highresi, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc, bool multiThread = false);
 		void RGBtile_denoise (float * fLblox, int hblproc, float noisevar_L, float * nbrwt, float * blurbuffer );	//for DCT
 		void RGBoutput_tile_row (float *Lbloxrow, float ** Ldetail, float ** tilemask_out, int height, int width, int top );
 		//void WaveletDenoise(cplx_wavelet_decomposition &DualTreeCoeffs, float noisevar );
@@ -285,28 +284,29 @@ class ImProcFunctions {
 	//	void WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
 	//						   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab, wavelet_decomposition &wch, NoiImage * noi );
 		void WaveletDenoiseAll(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-							   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch);
-		void WaveletDenoiseAll_info(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-							   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut,float &minredaut, float & minblueaut, int schoice, bool autoch, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
-							   float &maxchred, float &maxchblue,float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue);
+							   wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf);
+		void WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a, 
+							   wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb, LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut,float &minredaut, float & minblueaut, int schoice, bool autoch, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc,
+							   float &maxchred, float &maxchblue,float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread);
 							   
 		void WaveletDenoiseAll_BiShrink(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, 
-										wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb,  float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch);
+										wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL, float *mad_aa, float *mad_bb,  float noisevar_abr, float noisevar_abb, LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int schoice, bool autoch, bool perf);
 		//void BiShrink(float * ReCoeffs, float * ImCoeffs, float * ReParents, float * ImParents, 
 		//			  int W, int H, int level, int padding, float noisevar);
 		//void Shrink(float ** WavCoeffs, int W, int H, int level, float noisevar);
 	//	void ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
 	//				   int W_L, int H_L, int W_ab, int H_ab, int W_h, int H_h, int skip_L, int skip_ab, int skip_h, float noisevar_L, float noisevar_ab, float **WavCoeffs_h, LabImage * noi);
 
-		void ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
-					   int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL,  float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, const NoiseCurve & dnNoisCurve,  const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int callby, bool autoch, float * madaa = NULL, float * madab = NULL, float * madaL = NULL, bool madCalculated= false);
-		void ShrinkAll_info(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
-					   int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_LL,  float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, const NoiseCurve & dnNoisCurve, const NoiseCurve & dnNoisCCcurve, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int callby, bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc, 
-					   float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, float * madaa = NULL, float * madab = NULL, float * madaL = NULL, bool madCalculated= false);
+		void ShrinkAll(float ** WavCoeffs_L, float ** WavCoeffs_a, float ** WavCoeffs_b, float *buffer[4], int level, 
+					   int W_L, int H_L, int W_ab, int H_ab, int skip_L, int skip_ab, float noisevar_L, float **noisevarlum, float **noisevarchrom, int width, int height, float *mad_LL,  float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, const NoiseCurve & dnNoisCurve,  const NoiseCurve & dnNoisCCcurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, int callby, bool autoch, bool perf, float * madaa = NULL, float * madab = NULL, float * madaL = NULL, bool madCalculated= false);
+		void ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b, int level, 
+					   int W_ab, int H_ab, int skip_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, int width, int height, float *mad_aa, float *mad_bb, float noisevar_abr, float noisevar_abb,LabImage * noi, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, bool autoch, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel,float &skinc, float &nsknc, 
+					   float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool perf, bool multiThread);
 		void Noise_residual(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b, int width, int height, float &chresid, float &chmaxredresid,float &chmaxblueresid, float &chresidred, float & chresidblue, float resid, float residblue, float residred, float maxredresid, float maxblueresid, float nbresid);
 		void calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc);
 		float MadMax(float * HH_Coeffs, int &max, int datalen); 
-		float Mad(float * HH_Coeffs, int datalen, int * histo);
+		float Mad(float * DataList, const int datalen);
+		float MadRgb(float * DataList, const int datalen);
 		
 		// pyramid equalizer
 		void dirpyr_equalizer    (float ** src, float ** dst, int srcwidth, int srcheight, float ** l_a, float ** l_b, float ** dest_a, float ** dest_b, const double * mult, const double dirpyrThreshold, const double skinprot, const bool gamutlab, float b_l, float t_l, float t_r, float b_r,  int choice, int scale);//Emil's directional pyramid equalizer
diff --git a/rtengine/opthelper.h b/rtengine/opthelper.h
index 477e435ae..752ad586b 100644
--- a/rtengine/opthelper.h
+++ b/rtengine/opthelper.h
@@ -25,7 +25,7 @@
 	#ifdef __SSE2__
 		#include "sleefsseavx.c"
 		#ifdef __GNUC__
-			#ifdef WIN32
+			#if defined(WIN32) && !defined( __x86_64__ )
 				// needed for actual versions of GCC with 32-Bit Windows
 				#define SSEFUNCTION __attribute__((force_align_arg_pointer))
 			#else
@@ -37,7 +37,7 @@
 	#else
 		#ifdef __SSE__
 			#ifdef __GNUC__
-				#ifdef WIN32
+				#if defined(WIN32) && !defined( __x86_64__ )
 					// needed for actual versions of GCC with 32-Bit Windows
 					#define SSEFUNCTION __attribute__((force_align_arg_pointer))
 				#else
@@ -55,9 +55,14 @@
 		#define RESTRICT 	__restrict__
 		#define LIKELY(x)   __builtin_expect (!!(x), 1)
 		#define UNLIKELY(x) __builtin_expect (!!(x), 0)
+		#define ALIGNED64 __attribute__ ((aligned (64)))
+		#define ALIGNED16 __attribute__ ((aligned (16)))
+
 	#else
 		#define RESTRICT
 		#define LIKELY(x)    (x)
 		#define UNLIKELY(x)  (x)
+		#define ALIGNED64
+		#define ALIGNED16
 	#endif
 #endif
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 315775b1a..145ea04b3 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -904,7 +904,7 @@ static Glib::ustring relativePathIfInside(Glib::ustring procparams_fname, bool f
     return prefix + embedded_fname.substr(dir1.length());
 }
 
-int ProcParams::save (Glib::ustring fname, Glib::ustring fname2, bool fnameAbsolute, ParamsEdited* pedited) const {
+int ProcParams::save (Glib::ustring fname, Glib::ustring fname2, bool fnameAbsolute, ParamsEdited* pedited) {
 
     if (!fname.length() && !fname2.length())
         return 0;
@@ -1279,8 +1279,17 @@ int ProcParams::save (Glib::ustring fname, Glib::ustring fname2, bool fnameAbsol
     if (!pedited || pedited->dirpyrDenoise.chroma)  keyFile.set_double ("Directional Pyramid Denoising", "Chroma",  dirpyrDenoise.chroma);
     if (!pedited || pedited->dirpyrDenoise.dmethod)  keyFile.set_string  ("Directional Pyramid Denoising", "Method",  dirpyrDenoise.dmethod);
     if (!pedited || pedited->dirpyrDenoise.Lmethod)  keyFile.set_string  ("Directional Pyramid Denoising", "LMethod",  dirpyrDenoise.Lmethod);
-    if (!pedited || pedited->dirpyrDenoise.Cmethod)  keyFile.set_string  ("Directional Pyramid Denoising", "CMethod",  dirpyrDenoise.Cmethod);
-    if (!pedited || pedited->dirpyrDenoise.C2method)  keyFile.set_string  ("Directional Pyramid Denoising", "C2Method",  dirpyrDenoise.C2method);
+    // never save 'auto chroma preview mode' to pp3
+	if (!pedited || pedited->dirpyrDenoise.Cmethod) {
+		if(dirpyrDenoise.Cmethod=="PRE")
+			dirpyrDenoise.Cmethod = "MAN";
+		keyFile.set_string  ("Directional Pyramid Denoising", "CMethod",  dirpyrDenoise.Cmethod);
+    }
+    if (!pedited || pedited->dirpyrDenoise.C2method) {
+    	if(dirpyrDenoise.C2method=="PREV")
+			dirpyrDenoise.C2method = "MANU";
+		keyFile.set_string  ("Directional Pyramid Denoising", "C2Method",  dirpyrDenoise.C2method);
+    }
     if (!pedited || pedited->dirpyrDenoise.smethod)  keyFile.set_string  ("Directional Pyramid Denoising", "SMethod",  dirpyrDenoise.smethod);
     if (!pedited || pedited->dirpyrDenoise.medmethod)  keyFile.set_string  ("Directional Pyramid Denoising", "MedMethod",  dirpyrDenoise.medmethod);
     if (!pedited || pedited->dirpyrDenoise.rgbmethod)  keyFile.set_string  ("Directional Pyramid Denoising", "RGBMethod",  dirpyrDenoise.rgbmethod);
@@ -1955,7 +1964,12 @@ if (keyFile.has_group ("Directional Pyramid Denoising")) {//TODO: No longer an a
     if (keyFile.has_key ("Directional Pyramid Denoising", "Method"))     {dirpyrDenoise.dmethod  = keyFile.get_string  ("Directional Pyramid Denoising", "Method"); if (pedited) pedited->dirpyrDenoise.dmethod = true; }
     if (keyFile.has_key ("Directional Pyramid Denoising", "LMethod"))     {dirpyrDenoise.Lmethod  = keyFile.get_string  ("Directional Pyramid Denoising", "LMethod"); if (pedited) pedited->dirpyrDenoise.Lmethod = true; }
     if (keyFile.has_key ("Directional Pyramid Denoising", "CMethod"))     {dirpyrDenoise.Cmethod  = keyFile.get_string  ("Directional Pyramid Denoising", "CMethod"); if (pedited) pedited->dirpyrDenoise.Cmethod = true; }
-    if (keyFile.has_key ("Directional Pyramid Denoising", "C2Method"))     {dirpyrDenoise.C2method  = keyFile.get_string  ("Directional Pyramid Denoising", "C2Method"); if (pedited) pedited->dirpyrDenoise.C2method = true; }
+    // never load 'auto chroma preview mode' from pp3
+	if(dirpyrDenoise.Cmethod=="PRE")
+		dirpyrDenoise.Cmethod = "MAN";
+	if (keyFile.has_key ("Directional Pyramid Denoising", "C2Method"))     {dirpyrDenoise.C2method  = keyFile.get_string  ("Directional Pyramid Denoising", "C2Method"); if (pedited) pedited->dirpyrDenoise.C2method = true; }
+	if(dirpyrDenoise.C2method=="PREV")
+		dirpyrDenoise.C2method = "MANU";
     if (keyFile.has_key ("Directional Pyramid Denoising", "SMethod"))     {dirpyrDenoise.smethod  = keyFile.get_string  ("Directional Pyramid Denoising", "SMethod"); if (pedited) pedited->dirpyrDenoise.smethod = true; }
     if (keyFile.has_key ("Directional Pyramid Denoising", "MedMethod"))     {dirpyrDenoise.medmethod  = keyFile.get_string  ("Directional Pyramid Denoising", "MedMethod"); if (pedited) pedited->dirpyrDenoise.medmethod = true; }
     if (keyFile.has_key ("Directional Pyramid Denoising", "MethodMed"))     {dirpyrDenoise.methodmed  = keyFile.get_string  ("Directional Pyramid Denoising", "MethodMed"); if (pedited) pedited->dirpyrDenoise.methodmed = true; }
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index 79dd5f0e6..2105d8a23 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1029,7 +1029,7 @@ class ProcParams {
         * @param pedited pointer to a ParamsEdited object (optional) to store which values has to be saved
         * @return Error code (=0 if all supplied filenames where created correctly)
         */
-        int     save        (Glib::ustring fname, Glib::ustring fname2 = "", bool fnameAbsolute = true, ParamsEdited* pedited=NULL) const;
+        int     save        (Glib::ustring fname, Glib::ustring fname2 = "", bool fnameAbsolute = true, ParamsEdited* pedited=NULL);
       /**
         * Loads the parameters from a file.
         * @param fname the name of the file
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index 141f22014..fefcdafee 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -194,9 +194,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 		{
 			Imagefloat *origCropPart;//init auto noise
 			origCropPart = new Imagefloat (crW, crH);//allocate memory
-			Imagefloat *provi;
 			Imagefloat *provicalc;
-			provi = new Imagefloat (crW, crH);
 			provicalc = new Imagefloat (crW, crH);	
 			int skipP=1;
 			#pragma omp for schedule(dynamic) collapse(2) nowait		
@@ -206,8 +204,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 						int beg_tileH=hcr*tileHskip + tileHskip/2.f - crH/2.f;
 						PreviewProps ppP (beg_tileW , beg_tileH, crW, crH, skipP);
 						imgsrc->getImage (currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw );
-						if(origCropPart !=  provi)
-							origCropPart->copyData(provi);
 						if(origCropPart !=  provicalc)
 						origCropPart->copyData(provicalc);
 						imgsrc->convertColorSpace(provicalc, params.icm, currWB, params.raw);//for denoise luminance curve
@@ -217,7 +213,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 						float chaut, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc;
 						int Nb;
 						chaut=0.f;redaut=0.f; blueaut=0.f; maxredaut=0.f; maxblueaut=0.f;chromina=0.f; sigma=0.f;
-						ipf.RGB_denoise_info(provi, provi, provicalc, imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, params.dirpyrDenoise, imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
+						ipf.RGB_denoise_info(origCropPart, provicalc, imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope, params.dirpyrDenoise, imgsrc->getDirPyrDenoiseExpComp(), chaut, Nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
 						float multip=1.f;
 						float adjustr=1.f;	
 							if      (params.icm.working=="ProPhoto")   {adjustr =1.f;}//
@@ -258,7 +254,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 				}
 		
 			delete provicalc;
-			delete provi;		
 			delete origCropPart;
 		}	
 
@@ -355,9 +350,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 			{
 				Imagefloat *origCropPart;//init auto noise		
 				origCropPart = new Imagefloat (crW, crH);//allocate memory
-				Imagefloat *provi;
 				Imagefloat *provicalc;
-				provi = new Imagefloat (crW, crH);
 				provicalc = new Imagefloat (crW, crH);			
 
 				#pragma omp for schedule(dynamic) collapse(2) nowait				
@@ -365,12 +358,11 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 					for(int hcr=0;hcr<=2;hcr++) {
 						PreviewProps ppP (coordW[wcr] , coordH[hcr], crW, crH, 1);
 						imgsrc->getImage (currWB, tr, origCropPart, ppP, params.toneCurve, params.icm, params.raw);
-						origCropPart->copyData(provi);
 						origCropPart->copyData(provicalc);
 						imgsrc->convertColorSpace(provicalc, params.icm, currWB, params.raw);//for denoise luminance curve
 						int nb = 0;
 						float chaut=0.f, redaut=0.f, blueaut=0.f, maxredaut=0.f, maxblueaut=0.f, minredaut=0.f, minblueaut=0.f, nresi=0.f, highresi=0.f, chromina=0.f, sigma=0.f, lumema=0.f, sigma_L=0.f, redyel=0.f, skinc=0.f, nsknc=0.f;
-						ipf.RGB_denoise_info(provi, provi, provicalc, imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope,  params.dirpyrDenoise, imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
+						ipf.RGB_denoise_info(origCropPart, provicalc, imgsrc->isRAW(), gamcurve, gam, gamthresh, gamslope,  params.dirpyrDenoise, imgsrc->getDirPyrDenoiseExpComp(), chaut, nb, redaut, blueaut, maxredaut, maxblueaut, minredaut, minblueaut, nresi, highresi, chromina, sigma, lumema, sigma_L, redyel, skinc, nsknc);
 						Nb[hcr*3 + wcr] = nb;
 						ch_M[hcr*3 + wcr] = chaut;
 						max_r[hcr*3 + wcr] = maxredaut;
@@ -385,7 +377,6 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p
 					}
 				} 
 				delete provicalc;
-				delete provi;			
 				delete origCropPart;
 			}
 			float chM=0.f;
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index 64a89d1d6..219f9cf82 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -901,8 +901,11 @@ static INLINE vdouble xlog1p(vdouble a) {
 typedef struct {
   vfloat x, y;
 } vfloat2;
-
+#if defined( __FMA__ ) && defined( __x86_64__ ) && defined(WIN32)  // experimental
+static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); }
+#else
 static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); }
+#endif
 static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
 static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
 
@@ -1242,8 +1245,8 @@ static INLINE vfloat xexpf(vfloat d) {
   vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
   vfloat s, u;
 
-  s = vaddf(d, vmulf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
-  s = vaddf(s, vmulf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
+  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
 
   u = vcast_vf_f(0.00136324646882712841033936f);
   u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
@@ -1251,7 +1254,7 @@ static INLINE vfloat xexpf(vfloat d) {
   u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
   u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
 
-  u = vaddf(vcast_vf_f(1.0f), vaddf(s, vmulf(vmulf(s, s), u)));
+  u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
 
   u = vldexpf(u, q);