diff --git a/rtengine/cfa_linedn_RT.cc b/rtengine/cfa_linedn_RT.cc
index 59cba81b8..d49b53f74 100644
--- a/rtengine/cfa_linedn_RT.cc
+++ b/rtengine/cfa_linedn_RT.cc
@@ -27,8 +27,8 @@
 
 #define CLASS
 
-
-/*#include <ctype.h>
+/*
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <float.h>
@@ -59,22 +59,24 @@ void RawImageSource::CLASS cfa_linedn(float noise)
 {  
 	// local variables
 	int height=H, width=W;
-	int top, left, row, col;
-	int rr, cc, indx, i, j;
-	int ex, ey; 
 	int verbose=1;
 	
 	const float clip_pt = 0.8*initialGain* 65535.0;
 	
 	float eps=1e-5;			//tolerance to avoid dividing by zero
 	
-	float gauss[5] = {0.20416368871516755, 0.18017382291138087, 0.1238315368057753, 0.0662822452863612, 0.02763055063889883};
-	float rolloff[8] = {0, 0.135335, 0.249352, 0.411112, 0.606531, 0.800737, 0.945959, 1}; //gaussian with sigma=3
-	float window[8] = {0, .25, .75, 1, 1, .75, .25, 0}; //sine squared
-	float noisevar, linehvar, linevvar, coeffsq;
+	const float gauss[5] = {0.20416368871516755, 0.18017382291138087, 0.1238315368057753, 0.0662822452863612, 0.02763055063889883};
+	const float rolloff[8] = {0, 0.135335, 0.249352, 0.411112, 0.606531, 0.800737, 0.945959, 1}; //gaussian with sigma=3
+	const float window[8] = {0, .25, .75, 1, 1, .75, .25, 0}; //sine squared
 	
-	float aarr[8][8], *dctblock[8];
-    for (i = 0; i < 8; i++) dctblock[i] = aarr[i];
+	
+	float aarr[4][8][8], *bbrr[4][8], **dctblock[4];
+	for (int j=0; j<4; j++) {
+		for (int i = 0; i < 8; i++) 
+			bbrr[j][i] = aarr[j][i];
+		dctblock[j] = bbrr[j];
+	}
+
 
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	
@@ -84,124 +86,193 @@ void RawImageSource::CLASS cfa_linedn(float noise)
 	}
 	
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	float noisevar=SQR(3*noise*65535); // _noise_ (as a fraction of saturation) is input to the algorithm
+	volatile double progress = 0.0;
 	
+#pragma omp parallel
+	{	
 
-	noisevar=SQR(3*noise*65535); // _noise_ (as a fraction of saturation) is input to the algorithm
 	float *cfain= new float[TS*TS];
 	float *cfablur= new float[TS*TS];
 	float *cfadiff= new float[TS*TS];
 	float *cfadn= new float[TS*TS];
 
+/*
+	char			*buffer;			// TS*TS*16
+	float        	(*cfain);			// TS*TS*4
+	float         	(*cfablur);		// TS*TS*4
+	float         	(*cfadiff);		// TS*TS*4
+	float         	(*cfadn);			// TS*TS*4
+	buffer = (char *) calloc((3*sizeof(float)+sizeof(int))*TS*TS,1);// 16
+//	merror(buffer,"cfa_linedn()");
+//	memset(buffer,0,16*TS*TS);
+	cfain			= (float (*))		buffer; //pointers to rows of array
+ 	cfablur			= (float (*))			(buffer +  sizeof(float)*TS*TS); //4
+	cfadiff			= (float (*))			(buffer +  2*sizeof(float)*TS*TS);//8
+	cfadn			= (float (*))			(buffer +  3*sizeof(float)*TS*TS);//12
+*/
 	// Main algorithm: Tile loop
-	for (top=0; top < height-16; top += TS-32)
-		for (left=0; left < width-16; left += TS-32) {
+#pragma omp for schedule(dynamic) nowait
+	for (int top=0; top < height-16; top += TS-32)
+		for (int left=0; left < width-16; left += TS-32) {
+	
 			int bottom = MIN( top+TS,height);
 			int right  = MIN(left+TS, width);
 			int numrows = bottom - top;
 			int numcols = right - left;
-			
+			int indx1;
 			// load CFA data; data should be in linear gamma space, before white balance multipliers are applied
-			for (rr=top; rr < top+numrows; rr++)
-				for (cc=left, indx=(rr-top)*TS; cc < left+numcols; cc++, indx++) {
+			for (int rr=top; rr < top+numrows; rr++)
+				for (int cc=left, indx=(rr-top)*TS; cc < left+numcols; cc++, indx++) {
+			
 					cfain[indx] = rawData[rr][cc];
 				}
 			//pad the block to a multiple of 16 on both sides
 			
 			if (numcols < TS) {
-				indx=numcols % 16;
-				for (i=0; i<(16-indx); i++) 
-					for (rr=0; rr<numrows; rr++) 
+				indx1=numcols % 16;
+				for (int i=0; i<(16-indx1); i++) 
+					for (int rr=0; rr<numrows; rr++) 
+			
 						cfain[(rr)*TS+numcols+i+1]=cfain[(rr)*TS+numcols-i];
-				numcols += 16-indx;
+				numcols += 16-indx1;
 			}
 			
 			if (numrows < TS) {
-				indx=numrows % 16;
-				for (i=0; i<(16-indx); i++)
-					for (cc=0; cc<numcols; cc++)
+				indx1=numrows % 16;
+				for (int i=0; i<(16-indx1); i++)
+					for (int cc=0; cc<numcols; cc++)
+		
 						cfain[(numrows+i+1)*TS+cc]=cfain[(numrows-i)*TS+cc];
-				numrows += 16-indx;
+				numrows += 16-indx1;
 			}
 			
 			//The cleaning algorithm starts here
 			
 			
 			//gaussian blur of CFA data
-			for (rr=8; rr < numrows-8; rr++)
-				for (indx=rr*TS; indx < rr*TS+numcols; indx++) {
+			for (int rr=8; rr < numrows-8; rr++)
+				for (int indx=rr*TS; indx < rr*TS+numcols; indx++) {
 					
 					cfablur[indx]=gauss[0]*cfain[indx];
-					for (i=1; i<5; i++) {
+					for (int i=1; i<5; i++) {
 						cfablur[indx] += gauss[i]*(cfain[indx-(2*i)*TS]+cfain[indx+(2*i)*TS]);
 					}
 				}
-			for (rr=8; rr < numrows-8; rr++)
-				for (indx=rr*TS+8; indx < rr*TS+numcols-8; indx++) {
+			for (int rr=8; rr < numrows-8; rr++)
+				for (int indx=rr*TS+8; indx < rr*TS+numcols-8; indx++) {
 					
 					cfadn[indx] = gauss[0]*cfablur[indx];
-					for (i=1; i<5; i++) {
+					for (int i=1; i<5; i++) {
 						cfadn[indx] += gauss[i]*(cfablur[indx-2*i]+cfablur[indx+2*i]);
 					}
 					cfadiff[indx]=cfain[indx]-cfadn[indx]; // hipass cfa data
 				}
 			
 			//begin block DCT
-			for (ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
-				for (ex=0; ex<2; ex++) 
-					for (rr=8+ey; rr < numrows-22; rr+=8) // (rr,cc) shift by 8 to overlap blocks
-						for (cc=8+ex; cc < numcols-22; cc+=8) {
+			float linehvar[4], linevvar[4], noisefactor[4][8][2], coeffsq;
+#pragma omp critical
+		{	
+			for (int rr=8; rr < numrows-22; rr+=8) // (rr,cc) shift by 8 to overlap blocks
+				for (int cc=8; cc < numcols-22; cc+=8) {
+
+					for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
+						for (int ex=0; ex<2; ex++) {
+	
 							//grab an 8x8 block of a given RGGB channel
-							for (i=0; i<8; i++) 
-								for (j=0; j<8; j++) {
-									dctblock[i][j]=cfadiff[(rr+2*i)*TS+cc+2*j];
+							for (int i=0; i<8; i++) 
+								for (int j=0; j<8; j++) {
+									dctblock[2*ey+ex][i][j]=cfadiff[(rr+2*i+ey)*TS+cc+2*j+ex];
+		
 								}
 							
-							 ddct8x8s(-1, dctblock); //forward DCT
+							ddct8x8s(-1, dctblock[2*ey+ex]); //forward DCT
+						}
+					for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
+						for (int ex=0; ex<2; ex++) {
+							linehvar[2*ey+ex]=linevvar[2*ey+ex]=0;
+							for (int i=4; i<8; i++) {
+								linehvar[2*ey+ex] += SQR(dctblock[2*ey+ex][0][i]);
+								linevvar[2*ey+ex] += SQR(dctblock[2*ey+ex][i][0]);
 							 
-							linehvar=linevvar=0;
-							for (i=4; i<8; i++) {
-								linehvar += SQR(dctblock[0][i]);
-								linevvar += SQR(dctblock[i][0]);
 							}
 							//Wiener filter for line denoising; roll off low frequencies
-							if (noisevar>linehvar) {
-								for (i=1; i<8; i++) {
-									coeffsq=SQR(dctblock[0][i]);
-									dctblock[0][i] *= coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
+							for (int i=1; i<8; i++) {
+								coeffsq = SQR(dctblock[2*ey+ex][i][0]);//vertical
+								noisefactor[2*ey+ex][i][0] = coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
+								coeffsq = SQR(dctblock[2*ey+ex][0][i]);//horizontal
+								noisefactor[2*ey+ex][i][1] = coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
+								// noisefactor labels are [RGGB subarray][row/col position][0=vert,1=hor]
+					
 								}
 							}
-							if (noisevar>linevvar) {
-								for (i=1; i<8; i++) {
-									coeffsq=SQR(dctblock[i][0]);
-									dctblock[i][0] *= coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
+					//horizontal lines
+					if (4*noisevar>(linehvar[0]+linehvar[1])) {//horizontal lines
+						for (int i=1; i<8; i++) {
+							dctblock[0][0][i] *= 0.5*(noisefactor[0][i][1]+noisefactor[1][i][1]);//or should we use MIN???
+							dctblock[1][0][i] *= 0.5*(noisefactor[0][i][1]+noisefactor[1][i][1]);//or should we use MIN???
+						}
+					}
+					if (4*noisevar>(linehvar[2]+linehvar[3])) {//horizontal lines
+						for (int i=1; i<8; i++) {
+							dctblock[2][0][i] *= 0.5*(noisefactor[2][i][1]+noisefactor[3][i][1]);//or should we use MIN???
+							dctblock[3][0][i] *= 0.5*(noisefactor[2][i][1]+noisefactor[3][i][1]);//or should we use MIN???
 								}
 							}
 							
-							ddct8x8s(1, dctblock); //inverse DCT
+					//vertical lines
+					if (4*noisevar>(linevvar[0]+linevvar[2])) {//vertical lines
+						for (int i=1; i<8; i++) {
+							dctblock[0][i][0] *= 0.5*(noisefactor[0][i][0]+noisefactor[2][i][0]);//or should we use MIN???
+							dctblock[2][i][0] *= 0.5*(noisefactor[0][i][0]+noisefactor[2][i][0]);//or should we use MIN???
+						}
+					}
+					if (4*noisevar>(linevvar[1]+linevvar[3])) {//vertical lines
+						for (int i=1; i<8; i++) {
+							dctblock[1][i][0] *= 0.5*(noisefactor[1][i][0]+noisefactor[3][i][0]);//or should we use MIN???
+							dctblock[3][i][0] *= 0.5*(noisefactor[1][i][0]+noisefactor[3][i][0]);//or should we use MIN???
+						}
+					}
+					
+					for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
+						for (int ex=0; ex<2; ex++) {
+							ddct8x8s(1, dctblock[2*ey+ex]); //inverse DCT
 							
 							//multiply by window fn and add to output (cfadn)
-							for (i=0; i<8; i++) 
-								for (j=0; j<8; j++) {
-									cfadn[(rr+2*i)*TS+cc+2*j] += window[i]*window[j]*dctblock[i][j];
+							for (int i=0; i<8; i++) 
+								for (int j=0; j<8; j++) {
+									cfadn[(rr+2*i+ey)*TS+cc+2*j+ex] += window[i]*window[j]*dctblock[2*ey+ex][i][j];
+
 								}
 						}	
-
+			}
+		}
 			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 			// copy smoothed results back to image matrix
-			for (rr=16; rr < numrows-16; rr++) {
-				row = rr + top; 
-				for (col=16+left, indx=rr*TS+16; indx < rr*TS+numcols-16; indx++, col++) {
+			for (int rr=16; rr < numrows-16; rr++) {
+				int row = rr + top; 
+				for (int col=16+left, indx=rr*TS+16; indx < rr*TS+numcols-16; indx++, col++) {
+
 					if (rawData[row][col]<clip_pt && cfadn[indx]<clip_pt) 
 					   rawData[row][col] = CLIP((int)(cfadn[indx]+ 0.5));
 				}
 			}
-			if(plistener) plistener->setProgress(fabs((float)top/height));
+			progress+=(double)((TS-32)*(TS-32))/(height*width);
+			if (progress>1.0)
+			{
+				progress=1.0;
+			}
+			if(plistener) plistener->setProgress(progress);
+			
+			//if(plistener) plistener->setProgress(fabs((float)top/height));
 		}
 	// clean up
 	delete [] cfain;
 	delete [] cfablur;
 	delete [] cfadiff;
 	delete [] cfadn;
+	//free(buffer);
+	}
 
 }
 #undef TS