diff --git a/rtengine/PF_correct_RT.cc b/rtengine/PF_correct_RT.cc
index 343d6c935..eef63463e 100644
--- a/rtengine/PF_correct_RT.cc
+++ b/rtengine/PF_correct_RT.cc
@@ -6,6 +6,7 @@
 //
 //
 // code dated: November 24, 2010
+// optimized: September 2013, Ingo Weyrich
 //
 //	PF_correct_RT.cc is free software: you can redistribute it and/or modify
 //	it under the terms of the GNU General Public License as published by
@@ -23,21 +24,565 @@
 ////////////////////////////////////////////////////////////////
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
 #include "gauss.h"
 #include "improcfun.h"
 #include "sleef.c"
 #include "mytime.h"
 #include "../rtgui/myflatcurve.h"
+#include "rt_math.h"
+
+#ifdef __SSE2__
+#include "sleefsseavx.c"
+#endif
 
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
-#include "rt_math.h"
-#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
+using namespace std;
 
+namespace rtengine {
+extern const Settings* settings;
+
+#if defined( __SSE2__ ) && defined( WIN32 )
+__attribute__((force_align_arg_pointer)) void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radius, int thresh)
+#else
+void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radius, int thresh)
+#endif
+{
+	const int halfwin = ceil(2*radius)+1;
+
+	FlatCurve* chCurve = NULL;
+	if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
+		chCurve = new FlatCurve(params->defringe.huecurve);
+
+	// local variables
+	const int width=src->W, height=src->H;
+	//temporary array to store chromaticity
+	float (*fringe);
+	fringe = (float (*)) malloc (height * width * sizeof(*fringe));
+
+	LabImage * tmp1;
+	tmp1 = new LabImage(width, height);
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+	{
+	AlignedBufferMP<double> buffer(max(src->W,src->H));
+	gaussHorizontal<float> (src->a, tmp1->a, buffer, src->W, src->H, radius);
+	gaussHorizontal<float> (src->b, tmp1->b, buffer, src->W, src->H, radius);
+	gaussVertical<float>   (tmp1->a, tmp1->a, buffer, src->W, src->H, radius);
+	gaussVertical<float>   (tmp1->b, tmp1->b, buffer, src->W, src->H, radius);
+	}
+
+float chromave=0.0f;
+
+#ifdef __SSE2__
+	if( chCurve ) {
+// vectorized precalculation of the atan2 values
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+		{
+			int j;
+#ifdef _OPENMP
+#pragma omp for
+#endif
+			for(int i = 0; i < height; i++ ) {
+				for(j = 0; j < width-3; j+=4)
+					_mm_storeu_ps(&fringe[i*width+j], xatan2f(LVFU(src->b[i][j]),LVFU(src->a[i][j])));
+				for(; j < width; j++)
+					fringe[i*width+j]=xatan2f(src->b[i][j],src->a[i][j]);
+			}
+		}
+	}
+#endif
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	float chromaChfactor = 1.0f;
+#ifdef _OPENMP
+#pragma omp for reduction(+:chromave)
+#endif
+	for(int i = 0; i < height; i++ ) {
+		for(int j = 0; j < width; j++) {
+			if (chCurve) {
+#ifdef __SSE2__
+				// use the precalculated atan values
+				float HH=fringe[i*width+j];
+#else
+				// no precalculated values without SSE => calculate
+				float HH=xatan2f(src->b[i][j],src->a[i][j]);
+#endif
+				double hr;
+				Color::huelab_to_huehsv (HH, hr);
+				float chparam = float((chCurve->getVal(hr)-0.5f) * 2.0f);//get C=f(H)
+				if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
+					chromaChfactor=1.0f+chparam;
+			}
+			float chroma = SQR(chromaChfactor*(src->a[i][j]-tmp1->a[i][j]))+SQR(chromaChfactor*(src->b[i][j]-tmp1->b[i][j]));//modulate chroma function hue
+			chromave += chroma;
+			fringe[i*width+j]=chroma;
+		}
+	}
+}
+
+	chromave /= (height*width);
+	float threshfactor = SQR(thresh/33.f)*chromave*5.0f;
+
+
+// now chromave is calculated, so we postprocess fringe to reduce the number of divisions in future
+#ifdef __SSE2__
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	__m128 sumv = _mm_set1_ps( chromave );
+	__m128 onev = _mm_set1_ps( 1.0f );
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int j=0; j < width*height-3; j+=4)
+		_mm_storeu_ps( &fringe[j], onev/(LVFU(fringe[j])+sumv));
+}
+	for(int j=width*height - (width*height)%4; j < width*height; j++)
+		fringe[j] = 1.f/(fringe[j]+chromave);
+#else
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+		for(int j = 0; j < width*height; j++)
+			fringe[j] = 1.f/(fringe[j]+chromave);
+#endif
+
+	// because we changed the values of fringe we also have to recalculate threshfactor
+	threshfactor = 1.0f/(threshfactor + chromave);
+
+// Issue 1674:
+// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
+// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
+// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
+// choice for the chunk_size than 16
+// Issue 1972: Split this loop in three parts to avoid most of the min and max-operations
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic,16)
+#endif
+	for(int i = 0; i < height; i++ ) {
+		int j;
+		for(j = 0; j < halfwin-1; j++) {
+			tmp1->a[i][j] = src->a[i][j];
+			tmp1->b[i][j] = src->b[i][j];
+			//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
+			/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
+				/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=0; j1<j+halfwin; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*src->a[i1][j1];
+						btot += wt*src->b[i1][j1];
+						norm += wt;
+					}
+				tmp1->a[i][j] = atot/norm;
+				tmp1->b[i][j] = btot/norm;
+			}
+		}
+		for(; j < width-halfwin+1; j++) {
+			tmp1->a[i][j] = src->a[i][j];
+			tmp1->b[i][j] = src->b[i][j];
+			//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
+			/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
+				/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*src->a[i1][j1];
+						btot += wt*src->b[i1][j1];
+						norm += wt;
+					}
+				tmp1->a[i][j] = atot/norm;
+				tmp1->b[i][j] = btot/norm;
+			}
+		}
+		for(; j < width; j++) {
+			tmp1->a[i][j] = src->a[i][j];
+			tmp1->b[i][j] = src->b[i][j];
+			//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
+			/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
+				/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<width; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*src->a[i1][j1];
+						btot += wt*src->b[i1][j1];
+						norm += wt;
+					}
+				tmp1->a[i][j] = atot/norm;
+				tmp1->b[i][j] = btot/norm;
+			}
+		}
+	}//end of ab channel averaging
+
+	if(src != dst)
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+		for(int i = 0; i < height; i++ ) {
+			for(int j = 0; j < width; j++) {
+				dst->L[i][j] = src->L[i][j];
+			}
+		}
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	for(int i = 0; i < height; i++ ) {
+		for(int j = 0; j < width; j++) {
+			dst->a[i][j] = tmp1->a[i][j];
+			dst->b[i][j] = tmp1->b[i][j];
+		}
+	}
+
+
+	delete tmp1;
+	if(chCurve) delete chCurve;
+	free(fringe);
+}
+#if defined( __SSE2__ ) && defined( WIN32 )
+__attribute__((force_align_arg_pointer)) void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * dst, double radius, int thresh)
+#else
+void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * dst, double radius, int thresh)
+#endif
+{
+	const int halfwin = ceil(2*radius)+1;
+
+	FlatCurve* chCurve = NULL;
+	if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
+		chCurve = new FlatCurve(params->defringe.huecurve);
+
+	// local variables
+	const int width=src->W, height=src->H;
+	const float piid=3.14159265f/180.f;
+	const float eps2=0.01f;
+
+	//temporary array to store chromaticity
+	float (*fringe);
+	fringe = (float (*)) malloc (height * width * sizeof(*fringe));
+
+	float** sraa;
+	sraa = new float*[height];
+	for (int i=0; i<height; i++)
+		sraa[i] = new float[width];
+
+	float** srbb;
+	srbb = new float*[height];
+	for (int i=0; i<height; i++)
+		srbb[i] = new float[width];
+
+	float** tmaa;
+	tmaa = new float*[height];
+	for (int i=0; i<height; i++)
+		tmaa[i] = new float[width];
+
+	float** tmbb;
+	tmbb = new float*[height];
+	for (int i=0; i<height; i++)
+		tmbb[i] = new float[width];
+					
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	float2 sincosval;
+#ifdef __SSE2__
+	int j;
+	vfloat2 sincosvalv;
+	__m128 piidv = _mm_set1_ps(piid);
+#endif // __SSE2__
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for (int i=0; i<height; i++) {
+#ifdef __SSE2__
+		for (j=0; j<width-3; j+=4) {
+			sincosvalv = xsincosf(piidv*LVFU(src->h_p[i][j]));
+			_mm_storeu_ps(&sraa[i][j],LVFU(src->C_p[i][j])*sincosvalv.y);
+			_mm_storeu_ps(&srbb[i][j],LVFU(src->C_p[i][j])*sincosvalv.x);
+		}		
+		for (; j<width; j++) {
+			sincosval = xsincosf(piid*src->h_p[i][j]);
+			sraa[i][j]=src->C_p[i][j]*sincosval.y;			
+			srbb[i][j]=src->C_p[i][j]*sincosval.x;
+		}		
+#else
+		for (int j=0; j<width; j++) {
+			sincosval = xsincosf(piid*src->h_p[i][j]);
+			sraa[i][j]=src->C_p[i][j]*sincosval.y;			
+			srbb[i][j]=src->C_p[i][j]*sincosval.x;
+		}		
+#endif
+	}
+}
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+	{
+	AlignedBufferMP<double> buffer(max(src->W,src->H));
+	gaussHorizontal<float> (sraa, tmaa, buffer, src->W, src->H, radius);
+	gaussHorizontal<float> (srbb, tmbb, buffer, src->W, src->H, radius);
+	gaussVertical<float>   (tmaa, tmaa, buffer, src->W, src->H, radius);
+	gaussVertical<float>   (tmbb, tmbb, buffer, src->W, src->H, radius);
+	}
+
+float chromave=0.0f;
+
+#ifdef __SSE2__
+if( chCurve ) {
+// vectorized precalculation of the atan2 values
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	int j;
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int i = 0; i < height; i++ ) {
+		for(j = 0; j < width-3; j+=4)
+			_mm_storeu_ps(&fringe[i*width+j], xatan2f(LVFU(srbb[i][j]),LVFU(sraa[i][j])));
+		for(; j < width; j++)
+			fringe[i*width+j]=xatan2f(srbb[i][j],sraa[i][j]);
+	}
+}
+}
+#endif
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	float chromaChfactor = 1.0f;
+#ifdef _OPENMP
+#pragma omp for reduction(+:chromave)
+#endif
+	for(int i = 0; i < height; i++ ) {
+		for(int j = 0; j < width; j++) {
+			if (chCurve) {
+#ifdef __SSE2__
+				// use the precalculated atan values
+				float HH=fringe[i*width+j];
+#else
+				// no precalculated values without SSE => calculate
+				float HH=xatan2f(srbb[i][j],sraa[i][j]);
+#endif
+				double hr;
+				Color::huelab_to_huehsv (HH, hr);	//approximation Cam / Lab
+				float chparam = float((chCurve->getVal(hr)-0.5f) * 2.0f);//get C=f(H)
+				if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
+				chromaChfactor=1.0f+chparam;
+			}
+			float chroma = SQR(chromaChfactor*(sraa[i][j]-tmaa[i][j]))+SQR(chromaChfactor*(srbb[i][j]-tmbb[i][j]));//modulate chroma function hue
+			chromave += chroma;
+			fringe[i*width+j]=chroma;
+		}
+	}
+}
+
+	chromave /= (height*width);
+	float threshfactor = SQR(thresh/33.f)*chromave*5.0f; // Calculated once to eliminate mult inside the next loop
+
+// now chromave is calculated, so we postprocess fringe to reduce the number of divisions in future
+#ifdef __SSE2__
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	__m128 sumv = _mm_set1_ps( chromave + eps2 );
+	__m128 onev = _mm_set1_ps( 1.0f );
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int j=0; j < width*height-3; j+=4)
+		_mm_storeu_ps( &fringe[j], onev/(LVFU(fringe[j])+sumv));
+}
+	for(int j=width*height - (width*height)%4; j < width*height; j++)
+		fringe[j] = 1.f/(fringe[j]+chromave+eps2);
+#else
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+		for(int j = 0; j < width*height; j++)
+			fringe[j] = 1.f/(fringe[j]+chromave+eps2);
+#endif
+
+	// because we changed the values of fringe we also have to recalculate threshfactor
+	threshfactor = 1.0f/(threshfactor + chromave + eps2);
+
+// Issue 1674:
+// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
+// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
+// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
+// choice for the chunk_size than 16
+// Issue 1972: Split this loop in three parts to avoid most of the min and max-operations
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic,16)
+#endif
+	for(int i = 0; i < height; i++ ) {
+		int j;
+		for(j = 0; j < halfwin-1; j++) {
+			tmaa[i][j] = sraa[i][j];
+			tmbb[i][j] = srbb[i][j];
+
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=0; j1<j+halfwin; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*sraa[i1][j1];
+						btot += wt*srbb[i1][j1];
+						norm += wt;
+					}
+				if(norm > 0.f){	
+					tmaa[i][j] = (atot/norm);
+					tmbb[i][j] = (btot/norm);
+				}
+			}
+		}
+		for(; j < width-halfwin+1; j++) {
+			tmaa[i][j] = sraa[i][j];
+			tmbb[i][j] = srbb[i][j];
+
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*sraa[i1][j1];
+						btot += wt*srbb[i1][j1];
+						norm += wt;
+					}
+				if(norm > 0.f){	
+					tmaa[i][j] = (atot/norm);
+					tmbb[i][j] = (btot/norm);
+				}
+			}
+		}
+		for(; j < width; j++) {
+			tmaa[i][j] = sraa[i][j];
+			tmbb[i][j] = srbb[i][j];
+
+			if (fringe[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<width; j1++) {
+						//neighborhood average of pixels weighted by chrominance
+						wt = fringe[i1*width+j1];
+						atot += wt*sraa[i1][j1];
+						btot += wt*srbb[i1][j1];
+						norm += wt;
+					}
+				if(norm > 0.f){	
+					tmaa[i][j] = (atot/norm);
+					tmbb[i][j] = (btot/norm);
+				}
+			}
+		}
+	} //end of ab channel averaging
+
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+#ifdef __SSE2__
+	int j;
+	__m128 interav, interbv;
+	__m128 piidv = _mm_set1_ps(piid);
+#endif
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int i = 0; i < height; i++ ) {
+#ifdef __SSE2__
+		for(j = 0; j < width-3; j+=4) {
+			_mm_storeu_ps( &dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
+			interav = LVFU(tmaa[i][j]);
+			interbv = LVFU(tmbb[i][j]);
+			_mm_storeu_ps(&dst->h_p[i][j],(xatan2f(interbv,interav))/piidv);
+			_mm_storeu_ps(&dst->C_p[i][j],_mm_sqrt_ps(SQRV(interbv)+SQRV(interav)));
+		}
+		for(; j < width; j++) {
+			dst->sh_p[i][j] = src->sh_p[i][j];
+			float intera = tmaa[i][j];
+			float interb = tmbb[i][j];
+			dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
+			dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
+		}
+#else
+		for(int j = 0; j < width; j++) {
+			dst->sh_p[i][j] = src->sh_p[i][j];
+			float intera = tmaa[i][j];
+			float interb = tmbb[i][j];
+			dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
+			dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
+		}
+#endif
+	}
+}
+
+	for (int i=0; i<height; i++)
+		delete [] sraa[i];
+	delete [] sraa;
+	for (int i=0; i<height; i++)
+		delete [] srbb[i];
+	delete [] srbb;
+	for (int i=0; i<height; i++)
+		delete [] tmaa[i];
+	delete [] tmaa;
+	for (int i=0; i<height; i++)
+		delete [] tmbb[i];
+	delete [] tmbb;
+
+    if(chCurve) delete chCurve;
+
+    free(fringe);
+}
+
+
+#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
 
 #define med3(a0,a1,a2,a3,a4,a5,a6,a7,a8,median) { \
 pp[0]=a0; pp[1]=a1; pp[2]=a2; pp[3]=a3; pp[4]=a4; pp[5]=a5; pp[6]=a6; pp[7]=a7; pp[8]=a8; \
@@ -49,366 +594,113 @@ PIX_SORT(pp[3],pp[6]); PIX_SORT(pp[1],pp[4]); PIX_SORT(pp[2],pp[5]); \
 PIX_SORT(pp[4],pp[7]); PIX_SORT(pp[4],pp[2]); PIX_SORT(pp[6],pp[4]); \
 PIX_SORT(pp[4],pp[2]); median=pp[4];} //pp4 = median
 
-using namespace std;
-
-namespace rtengine {
-extern const Settings* settings;
-
-void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radius, int thresh) {
-	int halfwin = ceil(2*radius)+1;
-
-#include "rt_math.h"
-
-	FlatCurve* chCurve = NULL;
-	if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
-		chCurve = new FlatCurve(params->defringe.huecurve);
-
-	// local variables
-	int width=src->W, height=src->H;
-	//temporary array to store chromaticity
-	int (*fringe);
-	fringe = (int (*)) calloc ((height)*(width), sizeof *fringe);
-
-	LabImage * tmp1;
-	tmp1 = new LabImage(width, height);
-
-#ifdef _OPENMP
-#pragma omp parallel
+#if defined( __SSE2__ ) && defined( WIN32 )
+__attribute__((force_align_arg_pointer)) void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, double radius, int thresh, int mode)
+#else
+void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, double radius, int thresh, int mode)
 #endif
-	{
-		AlignedBufferMP<double> buffer(max(src->W,src->H));
-
-		gaussHorizontal<float> (src->a, tmp1->a, buffer, src->W, src->H, radius);
-		gaussHorizontal<float> (src->b, tmp1->b, buffer, src->W, src->H, radius);
-		gaussVertical<float>   (tmp1->a, tmp1->a, buffer, src->W, src->H, radius);
-		gaussVertical<float>   (tmp1->b, tmp1->b, buffer, src->W, src->H, radius);
-
-//		gaussHorizontal<float> (src->L, tmp1->L, buffer, src->W, src->H, radius);
-//		gaussVertical<float>   (tmp1->L, tmp1->L, buffer, src->W, src->H, radius);
-	}
-
-float chromave=0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+:chromave)
-#endif
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			float HH=xatan2f(src->b[i][j],src->a[i][j]);
-				double hr;
-				float chromaChfactor=1.0f;
-				if (chCurve) {
-					Color::huelab_to_huehsv (HH, hr);
-					float chparam = float((chCurve->getVal(hr)-0.5f) * 2.0f);//get C=f(H)
-					if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
-					chromaChfactor=1.0f+chparam;
-				}
-			float chroma = SQR(chromaChfactor*(src->a[i][j]-tmp1->a[i][j]))+SQR(chromaChfactor*(src->b[i][j]-tmp1->b[i][j]));//modulate chroma function hue
-			chromave += chroma;
-			fringe[i*width+j]=chroma;
-		}
-	}
-	chromave /= (height*width);
-	float threshfactor = SQR(thresh/33.f)*chromave*5.0f;
-//	printf("Chro %f \n",chromave);
-
-// Issue 1674:
-// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
-// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
-// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
-// choice for the chunk_size than 16
-#ifdef _OPENMP
-#pragma omp parallel for schedule(dynamic,16)
-#endif
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			tmp1->a[i][j] = src->a[i][j];
-			tmp1->b[i][j] = src->b[i][j];
-			//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
-			/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
-				/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
-			if (fringe[i*width+j]>threshfactor) {
-				float atot=0.f;
-				float btot=0.f;
-				float norm=0.f;
-				float wt;
-				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
-					for (int j1=max(0,j-halfwin+1); j1<min(width,j+halfwin); j1++) {
-						//neighborhood average of pixels weighted by chrominance
-						wt = 1.f/(fringe[i1*width+j1]+chromave);
-						atot += wt*src->a[i1][j1];
-						btot += wt*src->b[i1][j1];
-						norm += wt;
-					}
-				tmp1->a[i][j] = (int)(atot/norm);
-				tmp1->b[i][j] = (int)(btot/norm);
-			}//end of ab channel averaging
-		}
-	}
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			dst->L[i][j] = src->L[i][j];
-			dst->a[i][j] = tmp1->a[i][j];
-			dst->b[i][j] = tmp1->b[i][j];
-		}
-	}
-
-	delete tmp1;
-	if(chCurve) delete chCurve;
-	free(fringe);
-}
-
-void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * dst, double radius, int thresh) {
-	int halfwin = ceil(2*radius)+1;
-
-#include "rt_math.h"
-	FlatCurve* chCurve = NULL;
-	if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
-		chCurve = new FlatCurve(params->defringe.huecurve);
-
-	// local variables
-	int width=src->W, height=src->H;
-	float piid=3.14159265f/180.f;
-	static float eps2=0.01f;
-	//temporary array to store chromaticity
-	int (*fringe);
-	fringe = (int (*)) calloc ((height)*(width), sizeof *fringe);
-
-	float** sraa;
-		sraa = new float*[height];
-		for (int i=0; i<height; i++)
-			sraa[i] = new float[width];
-/*
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-		for (int i=0; i<height; i++)
-			for (int j=0; j<width; j++) {
-				sraa[i][j]=src->C_p[i][j]*cos(piid*src->h_p[i][j]);
-			}
-			*/
-	float** tmaa;
-		tmaa = new float*[height];
-		for (int i=0; i<height; i++)
-			tmaa[i] = new float[width];
-
-	float** srbb;
-	srbb = new float*[height];
-		for (int i=0; i<height; i++)
-			srbb[i] = new float[width];
-					
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-		for (int i=0; i<height; i++)
-			for (int j=0; j<width; j++) {
-				float2 sincosval = xsincosf(piid*src->h_p[i][j]);
-				sraa[i][j]=src->C_p[i][j]*sincosval.y;			
-				srbb[i][j]=src->C_p[i][j]*sincosval.x;
-			}
-	float** tmbb;
-		tmbb = new float*[height];
-	for (int i=0; i<height; i++)
-			tmbb[i] = new float[width];
-
-	/*float** tmL;
-		tmL = new float*[height];
-	for (int i=0; i<height; i++)
-			tmL[i] = new float[width];
-*/
-
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-	{
-		AlignedBufferMP<double> buffer(max(src->W,src->H));
-		gaussHorizontal<float> (sraa, tmaa, buffer, src->W, src->H, radius);
-		gaussHorizontal<float> (srbb, tmbb, buffer, src->W, src->H, radius);
-		gaussVertical<float>   (tmaa, tmaa, buffer, src->W, src->H, radius);
-		gaussVertical<float>   (tmbb, tmbb, buffer, src->W, src->H, radius);
-	//	gaussHorizontal<float> (src->sh_p, tmL, buffer, src->W, src->H, radius);
-	//	gaussVertical<float>   (tmL, tmL, buffer, src->W, src->H, radius);
-
-	}
-
-float chromave=0;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+:chromave)
-#endif
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-				float HH=xatan2f(srbb[i][j],sraa[i][j]); //piid*src->h_p[i][j];
-				float chromaChfactor=1.0f;
-				double hr;
-				if (chCurve) {
-					Color::huelab_to_huehsv (HH, hr);	//approximation Cam / Lab
-					float chparam = float((chCurve->getVal(hr)-0.5f) * 2.0f);//get C=f(H)
-					if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
-					chromaChfactor=1.0f+chparam;
-				}
-				float chroma = SQR(chromaChfactor*(sraa[i][j]-tmaa[i][j]))+SQR(chromaChfactor*(srbb[i][j]-tmbb[i][j]));//modulate chroma function hue
-		
-		
-		
-		//	float chroma =SQR(sraa[i][j]-tmaa[i][j])+SQR(srbb[i][j]-tmbb[i][j]);
-			chromave += chroma;
-			fringe[i*width+j]=chroma;
-		}
-	}
-	chromave /= (height*width);
-	float threshfactor = SQR(thresh/33.f)*chromave*5.0f; // Calculated once to eliminate mult inside the next loop
-	//	printf("Chromave CAM %f \n",chromave);
-
-// Issue 1674:
-// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
-// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
-// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
-// choice for the chunk_size than 16
-#ifdef _OPENMP
-#pragma omp parallel for schedule(dynamic,16)
-#endif
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			tmaa[i][j] = sraa[i][j];
-			tmbb[i][j] = srbb[i][j];
-
-			//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
-			/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
-				/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
-			if (fringe[i*width+j]>threshfactor) {
-				float atot=0.f;
-				float btot=0.f;
-				float norm=0.f;
-				float wt;
-				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
-					for (int j1=max(0,j-halfwin+1); j1<min(width,j+halfwin); j1++) {
-						//neighborhood average of pixels weighted by chrominance
-						wt = 1.f/(fringe[i1*width+j1]+chromave+eps2);
-						atot += wt*sraa[i1][j1];
-						btot += wt*srbb[i1][j1];
-						norm += wt;
-					}
-				if(norm > 0.f){	
-				tmaa[i][j] = (atot/norm);
-				tmbb[i][j] = (btot/norm);
-				}
-			}//end of ab channel averaging
-		}
-	}
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-
-	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			dst->sh_p[i][j] = src->sh_p[i][j];
-			float intera = tmaa[i][j];
-			float interb = tmbb[i][j];
-			dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
-			dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
-		}
-	}
-                for (int i=0; i<height; i++)
-                    delete [] sraa[i];
-                delete [] sraa;
-                for (int i=0; i<height; i++)
-                    delete [] srbb[i];
-                delete [] srbb;
-                for (int i=0; i<height; i++)
-                    delete [] tmaa[i];
-                delete [] tmaa;
-                for (int i=0; i<height; i++)
-                    delete [] tmbb[i];
-                delete [] tmbb;
-           /*     for (int i=0; i<height; i++)
-                    delete [] tmL[i];
-                delete [] tmL;
-           */
-    if(chCurve) delete chCurve;
-
-    free(fringe);
-}
-void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, double radius, int thresh, int mode) {
-	#include "rt_math.h"
-
-	int halfwin = ceil(2*radius)+1;
+{
+	const int halfwin = ceil(2*radius)+1;
     MyTime t1,t2;
     t1.set();
-	//bool algogauss = settings->ciebadpixgauss;
-	int width=src->W, height=src->H;
-	float piid=3.14159265f/180.f;
+
+	const int width=src->W, height=src->H;
+	const float piid=3.14159265f/180.f;
 	float shfabs, shmed;
+
 	int i1, j1, tot;
-	static float eps = 1.0f;
-	static float eps2 =0.01f;
+	const float eps = 1.0f;
+	const float eps2 = 0.01f;
 	float shsum, dirsh, norm, sum;	
 
 	float** sraa;
-		sraa = new float*[height];
-		for (int i=0; i<height; i++)
-			sraa[i] = new float[width];
-
-	float** tmaa;
-		tmaa = new float*[height];
-		for (int i=0; i<height; i++)
-			tmaa[i] = new float[width];
+	sraa = new float*[height];
+	for (int i=0; i<height; i++)
+		sraa[i] = new float[width];
 
 	float** srbb;
 	srbb = new float*[height];
-		for (int i=0; i<height; i++)
-			srbb[i] = new float[width];
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-		for (int i=0; i<height; i++)
-			for (int j=0; j<width; j++) {
-				float2 sincosval = xsincosf(piid*src->h_p[i][j]);
-				sraa[i][j]=src->C_p[i][j]*sincosval.y;			
-				srbb[i][j]=src->C_p[i][j]*sincosval.x;
-			}		
-	float** tmbb;
-		tmbb = new float*[height];
 	for (int i=0; i<height; i++)
-			tmbb[i] = new float[width];
-    float ** badpix = new float *[height];
+		srbb[i] = new float[width];
+
+	float** tmaa;
+	tmaa = new float*[height];
+	for (int i=0; i<height; i++)
+		tmaa[i] = new float[width];
+
+	float** tmbb;
+	tmbb = new float*[height];
+	for (int i=0; i<height; i++)
+		tmbb[i] = new float[width];
+
+	float* badpix = (float*)malloc(width*height*sizeof(float));
+
 	float** tmL;
 		tmL = new float*[height];
 	for (int i=0; i<height; i++) {
 			tmL[i] = new float[width];
-			badpix[i] = new float [width];
-
 		}
+
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	float2 sincosval;
+#ifdef __SSE2__
+	int j;
+	vfloat2 sincosvalv;
+	__m128 piidv = _mm_set1_ps(piid);
+#endif // __SSE2__
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for (int i=0; i<height; i++) {
+#ifdef __SSE2__
+		for (j=0; j<width-3; j+=4) {
+			sincosvalv = xsincosf(piidv*LVFU(src->h_p[i][j]));
+			_mm_storeu_ps(&sraa[i][j],LVFU(src->C_p[i][j])*sincosvalv.y);
+			_mm_storeu_ps(&srbb[i][j],LVFU(src->C_p[i][j])*sincosvalv.x);
+		}		
+		for (; j<width; j++) {
+			sincosval = xsincosf(piid*src->h_p[i][j]);
+			sraa[i][j]=src->C_p[i][j]*sincosval.y;			
+			srbb[i][j]=src->C_p[i][j]*sincosval.x;
+		}		
+#else
+		for (int j=0; j<width; j++) {
+			sincosval = xsincosf(piid*src->h_p[i][j]);
+			sraa[i][j]=src->C_p[i][j]*sincosval.y;			
+			srbb[i][j]=src->C_p[i][j]*sincosval.x;
+		}		
+#endif
+	}
+}
+
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
 	{
-		AlignedBufferMP<double> buffer(max(src->W,src->H));
-		//chroma a and b
-		if(mode==2) {//choice of gaussian blur 
+	AlignedBufferMP<double> buffer(max(src->W,src->H));
+	//chroma a and b
+	if(mode==2) {//choice of gaussian blur 
 		gaussHorizontal<float> (sraa, tmaa, buffer, src->W, src->H, radius);
 		gaussHorizontal<float> (srbb, tmbb, buffer, src->W, src->H, radius);
 		gaussVertical<float>   (tmaa, tmaa, buffer, src->W, src->H, radius);
 		gaussVertical<float>   (tmbb, tmbb, buffer, src->W, src->H, radius);
-		}
-		//luma sh_p
-		gaussHorizontal<float> (src->sh_p, tmL, buffer, src->W, src->H, 2.0);//low value to avoid artifacts
-		gaussVertical<float>   (tmL, tmL, buffer, src->W, src->H, 2.0);
-
 	}
+	//luma sh_p
+	gaussHorizontal<float> (src->sh_p, tmL, buffer, src->W, src->H, 2.0);//low value to avoid artifacts
+	gaussVertical<float>   (tmL, tmL, buffer, src->W, src->H, 2.0);
+	}
+
 if(mode==1){	//choice of median
 #pragma omp parallel
 	{
-#pragma omp for	
+	int ip,in,jp,jn;
+	float pp[9],temp;
+#pragma omp for	nowait  //nowait because next loop inside this parallel region is independent on this one
 	for (int i=0; i<height; i++) {
-		int ip,in,jp,jn;
-		float pp[9],temp;
 		if (i<2) {ip=i+2;} else {ip=i-2;}
 		if (i>height-3) {in=i-2;} else {in=i+2;}
 		for (int j=0; j<width; j++) {
@@ -419,8 +711,6 @@ if(mode==1){	//choice of median
 	}
 #pragma omp for		
 	for (int i=0; i<height; i++) {
-		int ip,in,jp,jn;
-		float pp[9],temp;
 		if (i<2) {ip=i+2;} else {ip=i-2;}
 		if (i>height-3) {in=i-2;} else {in=i+2;}
 		for (int j=0; j<width; j++) {
@@ -430,39 +720,100 @@ if(mode==1){	//choice of median
 		}
 	}
 	}
-}	
-	
+}
+
 //luma badpixels	
-float sh_thr = 4.5f;//low value for luma sh_p to avoid artifacts
-float shthr = sh_thr / 24.0f;	      
+	const float sh_thr = 4.5f;//low value for luma sh_p to avoid artifacts
+	const float shthr = sh_thr / 24.0f;	      
 
 #ifdef _OPENMP
-  #pragma omp parallel for private(shfabs, shmed,i1,j1)
+#pragma omp parallel
 #endif
-	for (int i=0; i < height; i++)
-		for (int j=0; j < width; j++) {
+{
+	int j;
+#ifdef __SSE2__
+	__m128 shfabsv, shmedv;
+	__m128 shthrv = _mm_set1_ps(shthr);
+	__m128 onev = _mm_set1_ps(1.0f);
+#endif // __SSE2__
+#ifdef _OPENMP
+#pragma omp for private(shfabs, shmed,i1,j1)
+#endif
+	for (int i=0; i < height; i++) {
+		for (j=0; j < 2; j++) {
 			shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
-			for (i1=max(0,i-2), shmed=0; i1<=min(i+2,height-1); i1++ )
-				for (j1=max(0,j-2); j1<=min(j+2,width-1); j1++ ) {
+			shmed=0.0f;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=0; j1<=j+2; j1++ ) {
 					shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
 				}
-			badpix[i][j] = (shfabs>((shmed-shfabs)*shthr));
+			badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
+		}
+		
+#ifdef __SSE2__
+		for (; j < width-5; j+=4) {
+			shfabsv = vabsf(LVFU(src->sh_p[i][j])-LVFU(tmL[i][j]));
+			shmedv = _mm_setzero_ps();
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<=j+2; j1++ ) {
+					shmedv += vabsf(LVFU(src->sh_p[i1][j1])-LVFU(tmL[i1][j1]));
+				}
+			_mm_storeu_ps( &badpix[i*width+j], vself(vmaskf_gt(shfabsv,(shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
+		}
+		for (; j < width-2; j++) {
+			shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
+			shmed=0.0f;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<=j+2; j1++ ) {
+					shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
+				}
+			badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
+		}
+#else
+		for (; j < width-2; j++) {
+			shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
+			shmed=0.0f;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<=j+2; j1++ ) {
+					shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
+				}
+			badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
 		}
-#ifdef _OPENMP
-  #pragma omp parallel for private(shsum,norm,dirsh,sum,i1,j1) schedule(dynamic,16)
 #endif
-	for (int i=0; i < height; i++)
-		for (int j=0; j < width; j++) {
-			if (!badpix[i][j]) continue;
+		for (; j < width; j++) {
+			shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
+			shmed=0.0f;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<width; j1++ ) {
+					shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
+				}
+			badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
+		}
+	}
+}
+
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	int j;
+#ifdef _OPENMP
+#pragma omp for private(shsum,norm,dirsh,sum,i1,j1) schedule(dynamic,16)
+#endif
+	for (int i=0; i < height; i++) {
+		for (j=0; j < 2; j++) {
+			if (!badpix[i*width+j]) continue;
 			norm=0.0f;
 			shsum=0.0f;
 			sum=0.0f;
 			tot=0;
-			for (i1=max(0,i-2), shmed=0; i1<=min(i+2,height-1); i1++ )
-				for (j1=max(0,j-2); j1<=min(j+2,width-1); j1++ ) {
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=0; j1<=j+2; j1++ ) {
 					if (i1==i && j1==j) continue;
-					if (badpix[i1][j1]) continue;
-					sum += src->sh_p[i1][j1]; tot++;
+					if (badpix[i1*width+j1]) continue;
+					sum += src->sh_p[i1][j1];
+					tot++;
 					dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
 					shsum += dirsh*src->sh_p[i1][j1];
 					norm += dirsh;
@@ -474,8 +825,57 @@ float shthr = sh_thr / 24.0f;
 				if(tot > 0) src->sh_p[i][j]=sum / tot;
 				}
 		}
+		for (; j < width-2; j++) {
+			if (!badpix[i*width+j]) continue;
+			norm=0.0f;
+			shsum=0.0f;
+			sum=0.0f;
+			tot=0;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<=j+2; j1++ ) {
+					if (i1==i && j1==j) continue;
+					if (badpix[i1*width+j1]) continue;
+					sum += src->sh_p[i1][j1];
+					tot++;
+					dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
+					shsum += dirsh*src->sh_p[i1][j1];
+					norm += dirsh;
+			}
+			if (norm > 0.f) {
+				src->sh_p[i][j]=shsum/norm;
+			}
+			else {
+				if(tot > 0) src->sh_p[i][j]=sum / tot;
+				}
+		}
+		for (; j < width; j++) {
+			if (!badpix[i*width+j]) continue;
+			norm=0.0f;
+			shsum=0.0f;
+			sum=0.0f;
+			tot=0;
+			for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
+				for (j1=j-2; j1<width; j1++ ) {
+					if (i1==i && j1==j) continue;
+					if (badpix[i1*width+j1]) continue;
+					sum += src->sh_p[i1][j1];
+					tot++;
+					dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
+					shsum += dirsh*src->sh_p[i1][j1];
+					norm += dirsh;
+			}
+			if (norm > 0.f) {
+				src->sh_p[i][j]=shsum/norm;
+			}
+			else {
+				if(tot > 0) src->sh_p[i][j]=sum / tot;
+				}
+		}
+	}
+}
 // end luma badpixels		
 
+
 // begin chroma badpixels
 float chrommed=0.f;
 #ifdef _OPENMP
@@ -485,28 +885,108 @@ float chrommed=0.f;
 		for(int j = 0; j < width; j++) {
 			float chroma =SQR(sraa[i][j]-tmaa[i][j])+SQR(srbb[i][j]-tmbb[i][j]);
 			chrommed += chroma;
-			badpix[i][j]=chroma;
+			badpix[i*width+j]=chroma;
 		}
 	}
 	chrommed /= (height*width);
 	float threshfactor = (thresh*chrommed)/33.f;   
 
+// now chrommed is calculated, so we postprocess badpix to reduce the number of divisions in future
+#ifdef __SSE2__
 #ifdef _OPENMP
-#pragma omp parallel for schedule(dynamic,16)
+#pragma omp parallel
+#endif
+{
+	int j;
+	__m128 sumv = _mm_set1_ps( chrommed + eps2 );
+	__m128 onev = _mm_set1_ps( 1.0f );
+#ifdef _OPENMP
+#pragma omp for
+#endif
+	for(int i = 0; i<height; i++) {
+		for(j=0; j < width-3; j+=4)
+			_mm_storeu_ps( &badpix[i*width+j], onev/(LVFU(badpix[i*width+j])+sumv));
+		for(; j < width; j++)
+			badpix[i*width+j] = 1.f/(badpix[i*width+j]+chrommed+eps2);
+	}
+}
+#else
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	for(int i = 0; i<height; i++)
+		for(int j = 0; j < width; j++)
+			badpix[i*width+j] = 1.f/(badpix[i*width+j]+chrommed+eps2);
+#endif
+
+	// because we changed the values of badpix we also have to recalculate threshfactor
+	threshfactor = 1.0f/(threshfactor + chrommed + eps2);
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+{
+	int j;
+#ifdef _OPENMP
+#pragma omp for schedule(dynamic,16)
 #endif
 	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
+		for(j = 0; j < halfwin; j++) {
 			tmaa[i][j] = sraa[i][j];
 			tmbb[i][j] = srbb[i][j];
 
-			if (badpix[i][j]>threshfactor) {
+			if (badpix[i*width+j]<threshfactor) {
 				float atot=0.f;
 				float btot=0.f;
 				float norm=0.f;
 				float wt;
 				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
-					for (int j1=max(0,j-halfwin+1); j1<min(width,j+halfwin); j1++) {
-						wt = 1.f/(badpix[i1][j1]+chrommed+eps2);
+					for (int j1=0; j1<j+halfwin; j1++) {
+						wt = badpix[i1*width+j1];
+						atot += wt*sraa[i1][j1];
+						btot += wt*srbb[i1][j1];
+						norm += wt;
+					}
+				if(norm > 0.f){	
+				tmaa[i][j] = (atot/norm);
+				tmbb[i][j] = (btot/norm);
+				}
+			}
+		}
+		for(; j < width-halfwin; j++) {
+			tmaa[i][j] = sraa[i][j];
+			tmbb[i][j] = srbb[i][j];
+
+			if (badpix[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
+						wt = badpix[i1*width+j1];
+						atot += wt*sraa[i1][j1];
+						btot += wt*srbb[i1][j1];
+						norm += wt;
+					}
+				if(norm > 0.f){	
+				tmaa[i][j] = (atot/norm);
+				tmbb[i][j] = (btot/norm);
+				}
+			}
+		}
+		for(; j < width; j++) {
+			tmaa[i][j] = sraa[i][j];
+			tmbb[i][j] = srbb[i][j];
+
+			if (badpix[i*width+j]<threshfactor) {
+				float atot=0.f;
+				float btot=0.f;
+				float norm=0.f;
+				float wt;
+				for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
+					for (int j1=j-halfwin+1; j1<width; j1++) {
+						wt = badpix[i1*width+j1];
 						atot += wt*sraa[i1][j1];
 						btot += wt*srbb[i1][j1];
 						norm += wt;
@@ -518,37 +998,73 @@ float chrommed=0.f;
 			}
 		}
 	}
+}
 
 #ifdef _OPENMP
-#pragma omp parallel for
+#pragma omp parallel
+#endif
+{
+#ifdef __SSE2__
+	int j;
+	__m128 interav, interbv;
+	__m128 piidv = _mm_set1_ps(piid);
+#endif
+#ifdef _OPENMP
+#pragma omp for
 #endif
 	for(int i = 0; i < height; i++ ) {
-		for(int j = 0; j < width; j++) {
-			dst->sh_p[i][j] = src->sh_p[i][j];
+#ifdef __SSE2__
+		for(j = 0; j < width-3; j+=4) {
+			interav = LVFU(tmaa[i][j]);
+			interbv = LVFU(tmbb[i][j]);
+			_mm_storeu_ps(&dst->h_p[i][j],(xatan2f(interbv,interav))/piidv);
+			_mm_storeu_ps(&dst->C_p[i][j],_mm_sqrt_ps(SQRV(interbv)+SQRV(interav)));
+		}
+		for(; j < width; j++) {
 			float intera = tmaa[i][j];
 			float interb = tmbb[i][j];
 			dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
 			dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
 		}
+#else
+		for(int j = 0; j < width; j++) {
+			float intera = tmaa[i][j];
+			float interb = tmbb[i][j];
+			dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
+			dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
+		}
+#endif
 	}
-                for (int i=0; i<height; i++)
-                    delete [] sraa[i];
-                delete [] sraa;
-                for (int i=0; i<height; i++)
-                    delete [] srbb[i];
-                delete [] srbb;
-                for (int i=0; i<height; i++)
-                    delete [] tmaa[i];
-                delete [] tmaa;
-                for (int i=0; i<height; i++)
-                    delete [] tmbb[i];
-                delete [] tmbb;
-                for (int i=0; i<height; i++){
-                    delete [] tmL[i];
-					delete [] badpix[i];
-				}
-                delete [] tmL;
-                delete [] badpix;
+}
+
+	if(src != dst) {
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+		for(int i = 0; i < height; i++ )
+			for(int j = 0; j < width; j++)
+				dst->sh_p[i][j] = src->sh_p[i][j];
+	}
+
+
+	for (int i=0; i<height; i++)
+		delete [] sraa[i];
+	delete [] sraa;
+	for (int i=0; i<height; i++)
+		delete [] srbb[i];
+	delete [] srbb;
+	for (int i=0; i<height; i++)
+		delete [] tmaa[i];
+	delete [] tmaa;
+	for (int i=0; i<height; i++)
+		delete [] tmbb[i];
+	delete [] tmbb;
+	for (int i=0; i<height; i++){
+		delete [] tmL[i];
+	}
+	delete [] tmL;
+
+	free(badpix);
 			
     t2.set();
     if( settings->verbose )
@@ -559,4 +1075,3 @@ float chrommed=0.f;
 }
 #undef PIX_SORT
 #undef med3
-