From f7b0dc429c3c69cf8748827ab540de22bc0351a2 Mon Sep 17 00:00:00 2001
From: Ingo <heckflosse@i-weyrich.de>
Date: Fri, 12 Jul 2013 16:28:46 +0200
Subject: [PATCH] Optimization for Chromatic Abberation Correction, Issue 1923

---
 rtengine/CA_correct_RT.cc | 624 ++++++++++++++++++++------------------
 1 file changed, 337 insertions(+), 287 deletions(-)

diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc
index f144bca57..8e3fbcdd9 100644
--- a/rtengine/CA_correct_RT.cc
+++ b/rtengine/CA_correct_RT.cc
@@ -52,10 +52,10 @@ int RawImageSource::LinEqSolve(int nDim, float* pfMatr, float* pfVect, float* pf
 	
 	for(k=0; k<(nDim-1); k++) {// base row of matrix
 		// search of line with max element
-		fMaxElem = fabs( pfMatr[k*nDim + k] );
+		fMaxElem = fabsf( pfMatr[k*nDim + k] );
 		m = k;
 		for (i=k+1; i<nDim; i++) {
-			if(fMaxElem < fabs(pfMatr[i*nDim + k]) ) {
+			if(fMaxElem < fabsf(pfMatr[i*nDim + k]) ) {
 				fMaxElem = pfMatr[i*nDim + k];
 				m = i;
 			}
@@ -102,22 +102,58 @@ int RawImageSource::LinEqSolve(int nDim, float* pfMatr, float* pfVect, float* pf
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
-void RawImageSource::CA_correct_RT(double cared, double cablue) { 
-
+void RawImageSource::CA_correct_RT(double cared, double cablue) {
+// multithreaded by Ingo Weyrich
 #define TS 256		// Tile size
-//#define border 8
-//#define border2	16
+#define TSH 128		// Half Tile size
+#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
+
+	volatile double progress = 0.0;
+	if(plistener) plistener->setProgress (progress);
 	
-	#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
 
 	// local variables
 	int width=W, height=H;
 	//temporary array to store simple interpolation of G
 	float (*Gtmp);
 	Gtmp = (float (*)) calloc ((height)*(width), sizeof *Gtmp);
-	
+
+	// temporary array to avoid race conflicts, only every second pixel needs to be saved here
+	float (*RawDataTmp);
+	RawDataTmp = (float*) malloc( height * width * sizeof(float)/2);
+
+	float	blockave[2][3]={{0,0,0},{0,0,0}}, blocksqave[2][3]={{0,0,0},{0,0,0}}, blockdenom[2][3]={{0,0,0},{0,0,0}}, blockvar[2][3];
+
+	// Because we can't break parallel processing, we need a switch do handle the errors
+	bool processpasstwo = true;
+
+	//block CA shift values and weight assigned to block
+	char		*buffer1;				// vblsz*hblsz*(3*2+1)
+	float		(*blockwt);				// vblsz*hblsz
+	float		(*blockshifts)[3][2];	// vblsz*hblsz*3*2 
+
 	const int border=8;
 	const int border2=16;
+
+	int vz1, hz1;
+	if((height+border2)%(TS-border2)==0) vz1=1; else vz1=0;
+    if((width+border2)%(TS-border2)==0) hz1=1; else hz1=0;
+
+	int vblsz, hblsz;
+    vblsz=ceil((float)(height+border2)/(TS-border2)+2+vz1);
+    hblsz=ceil((float)(width+border2)/(TS-border2)+2+hz1);
+
+	buffer1 = (char *) malloc(vblsz*hblsz*(3*2+1)*sizeof(float));
+	//merror(buffer1,"CA_correct()");
+	memset(buffer1,0,vblsz*hblsz*(3*2+1)*sizeof(float));
+	// block CA shifts
+	blockwt		= (float (*))			(buffer1);
+	blockshifts	= (float (*)[3][2])		(buffer1+(vblsz*hblsz*sizeof(float)));
+	
+	float	polymat[3][2][256], shiftmat[3][2][16], fitparams[3][2][16];
+
+#pragma omp parallel shared(Gtmp,width,height,blockave,blocksqave,blockdenom,blockvar,blockwt,blockshifts,polymat,shiftmat,fitparams)
+{
 	//order of 2d polynomial fit (polyord), and numpar=polyord^2
 	int polyord=4, numpar=16;
 	//number of blocks used in the fit
@@ -134,14 +170,14 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	int offset[2][3];
 	int	shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
 	//number of tiles in the image
-	int vblsz, hblsz, vblock, hblock, vz1, hz1;
+	int vblock, hblock;
 	//int verbose=1;
 	//flag indicating success or failure of polynomial fit
 	int res;
 	//shifts to location of vertical and diagonal neighbors
-	const int v1=TS, v2=2*TS, /* v3=3*TS,*/ v4=4*TS;//, p1=-TS+1, p2=-2*TS+2, p3=-3*TS+3, m1=TS+1, m2=2*TS+2, m3=3*TS+3;
+	const int v1=TS, v2=2*TS, v3=3*TS, v4=4*TS;//, p1=-TS+1, p2=-2*TS+2, p3=-3*TS+3, m1=TS+1, m2=2*TS+2, m3=3*TS+3;
 	
-	float eps=1e-5, eps2=1e-10;	//tolerance to avoid dividing by zero
+	float eps=1e-5f, eps2=1e-10f;	//tolerance to avoid dividing by zero
 	
 	//adaptive weights for green interpolation
 	float	wtu, wtd, wtl, wtr;
@@ -150,7 +186,6 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	//measured CA shift parameters for a tile
 	float	CAshift[2][3];
 	//polynomial fit coefficients
-	float	polymat[3][2][256], shiftmat[3][2][16], fitparams[3][2][16];
 	//residual CA shift amount within a plaquette
 	float	shifthfrac[3], shiftvfrac[3];
 	//temporary storage for median filter
@@ -161,8 +196,9 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	float	Ginthfloor, Ginthceil, Gint, RBint, gradwt;
 	//interpolated color difference at edge of plaquette
 	float	grbdiffinthfloor, grbdiffinthceil, grbdiffint, grbdiffold;
-	//data for evaluation of block CA shift variance
-	float	blockave[2][3]={{0,0,0},{0,0,0}}, blocksqave[2][3]={{0,0,0},{0,0,0}}, blockdenom[2][3]={{0,0,0},{0,0,0}}, blockvar[2][3];
+	//per thread data for evaluation of block CA shift variance
+	float	blockavethr[2][3]={{0,0,0},{0,0,0}}, blocksqavethr[2][3]={{0,0,0},{0,0,0}}, blockdenomthr[2][3]={{0,0,0},{0,0,0}};//, blockvarthr[2][3];
+
 	//low and high pass 1D filters of G in vertical/horizontal directions
 	float	glpfh, glpfv;
 	
@@ -176,7 +212,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	
 	char		*buffer;			// TS*TS*16
 	//rgb data in a tile
-	float         (*rgb)[3];		// TS*TS*12
+	float* rgb[3];
 	//color differences
 	float         (*grbdiff);		// TS*TS*4
 	//green interpolated to optical sample points for R/B
@@ -197,64 +233,51 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	
 	/* assign working space; this would not be necessary
 	 if the algorithm is part of the larger pre-interpolation processing */
-	buffer = (char *) malloc(11*sizeof(float)*TS*TS);
+	buffer = (char *) malloc(3*sizeof(float)*TS*TS + 8*sizeof(float)*TS*TSH + 10*64 + 64);
 	//merror(buffer,"CA_correct()");
-	memset(buffer,0,11*sizeof(float)*TS*TS);
+	memset(buffer,0,3*sizeof(float)*TS*TS + 8*sizeof(float)*TS*TSH + 10*64 + 64);
 	
-	// rgb array
-	rgb         = (float (*)[3])		buffer;
-	grbdiff		= (float (*))			(buffer +	3*sizeof(float)*TS*TS);
-	gshift		= (float (*))			(buffer +	4*sizeof(float)*TS*TS);
-	rbhpfh		= (float (*))			(buffer +	5*sizeof(float)*TS*TS);
-	rbhpfv		= (float (*))			(buffer +	6*sizeof(float)*TS*TS);
-	rblpfh		= (float (*))			(buffer +	7*sizeof(float)*TS*TS);
-	rblpfv		= (float (*))			(buffer +	8*sizeof(float)*TS*TS);
-	grblpfh		= (float (*))			(buffer +	9*sizeof(float)*TS*TS);
-	grblpfv		= (float (*))			(buffer +	10*sizeof(float)*TS*TS);
+	char 	*data;
+	data 	= buffer;
 
+//  buffers aligned to size of cacheline
+//	data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
 	
-	if((height+border2)%(TS-border2)==0) vz1=1; else vz1=0;
-    if((width+border2)%(TS-border2)==0) hz1=1; else hz1=0;
-    
-    vblsz=ceil((float)(height+border2)/(TS-border2)+2+vz1);
-    hblsz=ceil((float)(width+border2)/(TS-border2)+2+hz1);
 	
-	//block CA shift values and weight assigned to block
-	char		*buffer1;				// vblsz*hblsz*(3*2+1)
-	float		(*blockwt);				// vblsz*hblsz
-	float		(*blockshifts)[3][2];	// vblsz*hblsz*3*2 
-	//float blockshifts[1000][3][2]; //fixed memory allocation
-	//float blockwt[1000]; //fixed memory allocation
-	
-	buffer1 = (char *) malloc(vblsz*hblsz*(3*2+1)*sizeof(float));
-	//merror(buffer1,"CA_correct()");
-	memset(buffer1,0,vblsz*hblsz*(3*2+1)*sizeof(float));
-	// block CA shifts
-	blockwt		= (float (*))			(buffer1);
-	blockshifts	= (float (*)[3][2])		(buffer1+(vblsz*hblsz*sizeof(float)));
-		
-	int vctr=0, hctr=0;
+	// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
+	rgb[0]      = (float (*))		data;
+	rgb[1]		= (float (*))		(data + 1*sizeof(float)*TS*TS + 1*64);
+	rgb[2]		= (float (*))		(data + 2*sizeof(float)*TS*TS + 2*64);
+	grbdiff		= (float (*))			(data +	3*sizeof(float)*TS*TS + 3*64);
+	gshift		= (float (*))			(data +	3*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 4*64);
+	rbhpfh		= (float (*))			(data +	4*sizeof(float)*TS*TS + 5*64);
+	rbhpfv		= (float (*))			(data +	4*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 6*64);
+	rblpfh		= (float (*))			(data +	5*sizeof(float)*TS*TS + 7*64);
+	rblpfv		= (float (*))			(data +	5*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 8*64);
+	grblpfh		= (float (*))			(data +	6*sizeof(float)*TS*TS + 9*64);
+	grblpfv		= (float (*))			(data +	6*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 10*64);
+   
 	
 	if (cared==0 && cablue==0) {
 	// Main algorithm: Tile loop
-	//#pragma omp parallel for shared(image,height,width) private(top,left,indx,indx1) schedule(dynamic)
-	for (top=-border, vblock=1; top < height; top += TS-border2, vblock++) {
-		hctr=0;
-		vctr++;
-		for (left=-border, hblock=1; left < width; left += TS-border2, hblock++) {
-			hctr++;
+#pragma omp for collapse(2) schedule(dynamic) nowait
+	for (top=-border ; top < height; top += TS-border2) 
+		for (left=-border; left < width; left += TS-border2) {
+			vblock = ((top+border)/(TS-border2))+1;
+			hblock = ((left+border)/(TS-border2))+1;
 			int bottom = min(top+TS,height+border);
 			int right  = min(left+TS, width+border);
 			int rr1 = bottom - top;
 			int cc1 = right - left;
 			//t1_init = clock();
-			// rgb from input CFA data
-			// rgb values should be floating point number between 0 and 1 
-			// after white balance multipliers are applied 
 			if (top<0) {rrmin=border;} else {rrmin=0;}
 			if (left<0) {ccmin=border;} else {ccmin=0;}
 			if (bottom>height) {rrmax=height-top;} else {rrmax=rr1;}
 			if (right>width) {ccmax=width-left;} else {ccmax=cc1;}
+
+			// rgb from input CFA data
+			// rgb values should be floating point number between 0 and 1 
+			// after white balance multipliers are applied 
 			
 			for (rr=rrmin; rr < rrmax; rr++)
 				for (row=rr+top, cc=ccmin; cc < ccmax; cc++) {
@@ -262,7 +285,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 					c = FC(rr,cc);
 					indx=row*width+col;
 					indx1=rr*TS+cc;
-					rgb[indx1][c] = (rawData[row][col])/65535.0f;
+					rgb[c][indx1] = (rawData[row][col])/65535.0f;
 					//rgb[indx1][c] = image[indx][c]/65535.0f;//for dcraw implementation
 				}
 			
@@ -272,14 +295,14 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=ccmin; cc<ccmax; cc++) {
 						c = FC(rr,cc);
-						rgb[rr*TS+cc][c] = rgb[(border2-rr)*TS+cc][c];
+						rgb[c][rr*TS+cc] = rgb[c][(border2-rr)*TS+cc];
 					}
 			}
 			if (rrmax<rr1) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=ccmin; cc<ccmax; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+cc][c] = (rawData[(height-rr-2)][left+cc])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][left+cc])/65535.0f;
 						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+left+cc][c])/65535.0f;//for dcraw implementation
 					}
 			}
@@ -287,14 +310,14 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=rrmin; rr<rrmax; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[rr*TS+cc][c] = rgb[rr*TS+border2-cc][c];
+						rgb[c][rr*TS+cc] = rgb[c][rr*TS+border2-cc];
 					}
 			}
 			if (ccmax<cc1) {
 				for (rr=rrmin; rr<rrmax; rr++)
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[rr*TS+ccmax+cc][c] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
+						rgb[c][rr*TS+ccmax+cc] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
 						//rgb[rr*TS+ccmax+cc][c] = (image[(top+rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 					}
 			}
@@ -304,7 +327,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rr)*TS+cc][c] = (rawData[border2-rr][border2-cc])/65535.0f;
+						rgb[c][(rr)*TS+cc] = (rawData[border2-rr][border2-cc])/65535.0f;
 						//rgb[(rr)*TS+cc][c] = (rgb[(border2-rr)*TS+(border2-cc)][c]);//for dcraw implementation
 					}
 			}
@@ -312,7 +335,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+ccmax+cc][c] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+ccmax+cc] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
 						//rgb[(rrmax+rr)*TS+ccmax+cc][c] = (image[(height-rr-2)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 					}
 			}
@@ -320,7 +343,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rr)*TS+ccmax+cc][c] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
+						rgb[c][(rr)*TS+ccmax+cc] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
 						//rgb[(rr)*TS+ccmax+cc][c] = (image[(border2-rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 					}
 			}
@@ -328,7 +351,7 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+cc][c] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
 						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+(border2-cc)][c])/65535.0f;//for dcraw implementation
 					}
 			}
@@ -352,16 +375,16 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 					
 					if (c!=1) {
 						//compute directional weights using image gradients
-						wtu=1/SQR(eps+fabs(rgb[(rr+1)*TS+cc][1]-rgb[(rr-1)*TS+cc][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr-2)*TS+cc][c])+fabs(rgb[(rr-1)*TS+cc][1]-rgb[(rr-3)*TS+cc][1]));
-						wtd=1/SQR(eps+fabs(rgb[(rr-1)*TS+cc][1]-rgb[(rr+1)*TS+cc][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr+2)*TS+cc][c])+fabs(rgb[(rr+1)*TS+cc][1]-rgb[(rr+3)*TS+cc][1]));
-						wtl=1/SQR(eps+fabs(rgb[(rr)*TS+cc+1][1]-rgb[(rr)*TS+cc-1][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr)*TS+cc-2][c])+fabs(rgb[(rr)*TS+cc-1][1]-rgb[(rr)*TS+cc-3][1]));
-						wtr=1/SQR(eps+fabs(rgb[(rr)*TS+cc-1][1]-rgb[(rr)*TS+cc+1][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr)*TS+cc+2][c])+fabs(rgb[(rr)*TS+cc+1][1]-rgb[(rr)*TS+cc+3][1]));
+						wtu=1.0/SQR(eps+fabsf(rgb[1][indx+v1]-rgb[1][indx-v1])+fabsf(rgb[c][indx]-rgb[c][indx-v2])+fabsf(rgb[1][indx-v1]-rgb[1][indx-v3]));
+						wtd=1.0/SQR(eps+fabsf(rgb[1][indx-v1]-rgb[1][indx+v1])+fabsf(rgb[c][indx]-rgb[c][indx+v2])+fabsf(rgb[1][indx+v1]-rgb[1][indx+v3]));
+						wtl=1.0/SQR(eps+fabsf(rgb[1][indx+1]-rgb[1][indx-1])+fabsf(rgb[c][indx]-rgb[c][indx-2])+fabsf(rgb[1][indx-1]-rgb[1][indx-3]));
+						wtr=1.0/SQR(eps+fabsf(rgb[1][indx-1]-rgb[1][indx+1])+fabsf(rgb[c][indx]-rgb[c][indx+2])+fabsf(rgb[1][indx+1]-rgb[1][indx+3]));
 						
 						//store in rgb array the interpolated G value at R/B grid points using directional weighted average
-						rgb[indx][1]=(wtu*rgb[indx-v1][1]+wtd*rgb[indx+v1][1]+wtl*rgb[indx-1][1]+wtr*rgb[indx+1][1])/(wtu+wtd+wtl+wtr);
+						rgb[1][indx]=(wtu*rgb[1][indx-v1]+wtd*rgb[1][indx+v1]+wtl*rgb[1][indx-1]+wtr*rgb[1][indx+1])/(wtu+wtd+wtl+wtr);
 					}
 					if (row>-1 && row<height && col>-1 && col<width)
-						Gtmp[row*width + col] = rgb[indx][1];
+						Gtmp[row*width + col] = rgb[1][indx];
 				}
 			
 			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -370,64 +393,62 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (cc=4+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < cc1-4; cc+=2, indx+=2) {
 					
 					
-					rbhpfv[indx] = fabs(fabs((rgb[indx][1]-rgb[indx][c])-(rgb[indx+v4][1]-rgb[indx+v4][c])) +
-										fabs((rgb[indx-v4][1]-rgb[indx-v4][c])-(rgb[indx][1]-rgb[indx][c])) -
-										fabs((rgb[indx-v4][1]-rgb[indx-v4][c])-(rgb[indx+v4][1]-rgb[indx+v4][c])));
-					rbhpfh[indx] = fabs(fabs((rgb[indx][1]-rgb[indx][c])-(rgb[indx+4][1]-rgb[indx+4][c])) +
-										fabs((rgb[indx-4][1]-rgb[indx-4][c])-(rgb[indx][1]-rgb[indx][c])) -
-										fabs((rgb[indx-4][1]-rgb[indx-4][c])-(rgb[indx+4][1]-rgb[indx+4][c])));
+					rbhpfv[indx>>1] = fabsf(fabsf((rgb[1][indx]-rgb[c][indx])-(rgb[1][indx+v4]-rgb[c][indx+v4])) +
+										fabsf((rgb[1][indx-v4]-rgb[c][indx-v4])-(rgb[1][indx]-rgb[c][indx])) -
+										fabsf((rgb[1][indx-v4]-rgb[c][indx-v4])-(rgb[1][indx+v4]-rgb[c][indx+v4])));
+					rbhpfh[indx>>1] = fabsf(fabsf((rgb[1][indx]-rgb[c][indx])-(rgb[1][indx+4]-rgb[c][indx+4])) +
+										fabsf((rgb[1][indx-4]-rgb[c][indx-4])-(rgb[1][indx]-rgb[c][indx])) -
+										fabsf((rgb[1][indx-4]-rgb[c][indx-4])-(rgb[1][indx+4]-rgb[c][indx+4])));
 					
-					/*ghpfv = fabs(fabs(rgb[indx][1]-rgb[indx+v4][1])+fabs(rgb[indx][1]-rgb[indx-v4][1]) -
-					 fabs(rgb[indx+v4][1]-rgb[indx-v4][1]));
-					 ghpfh = fabs(fabs(rgb[indx][1]-rgb[indx+4][1])+fabs(rgb[indx][1]-rgb[indx-4][1]) -
-					 fabs(rgb[indx+4][1]-rgb[indx-4][1]));
-					 rbhpfv[indx] = fabs(ghpfv - fabs(fabs(rgb[indx][c]-rgb[indx+v4][c])+fabs(rgb[indx][c]-rgb[indx-v4][c]) -
-					 fabs(rgb[indx+v4][c]-rgb[indx-v4][c])));
-					 rbhpfh[indx] = fabs(ghpfh - fabs(fabs(rgb[indx][c]-rgb[indx+4][c])+fabs(rgb[indx][c]-rgb[indx-4][c]) -
-					 fabs(rgb[indx+4][c]-rgb[indx-4][c])));*/
+					/*ghpfv = fabsf(fabsf(rgb[indx][1]-rgb[indx+v4][1])+fabsf(rgb[indx][1]-rgb[indx-v4][1]) -
+					 fabsf(rgb[indx+v4][1]-rgb[indx-v4][1]));
+					 ghpfh = fabsf(fabsf(rgb[indx][1]-rgb[indx+4][1])+fabsf(rgb[indx][1]-rgb[indx-4][1]) -
+					 fabsf(rgb[indx+4][1]-rgb[indx-4][1]));
+					 rbhpfv[indx] = fabsf(ghpfv - fabsf(fabsf(rgb[indx][c]-rgb[indx+v4][c])+fabsf(rgb[indx][c]-rgb[indx-v4][c]) -
+					 fabsf(rgb[indx+v4][c]-rgb[indx-v4][c])));
+					 rbhpfh[indx] = fabsf(ghpfh - fabsf(fabsf(rgb[indx][c]-rgb[indx+4][c])+fabsf(rgb[indx][c]-rgb[indx-4][c]) -
+					 fabsf(rgb[indx+4][c]-rgb[indx-4][c])));*/
 					
-					glpfv = 0.25*(2*rgb[indx][1]+rgb[indx+v2][1]+rgb[indx-v2][1]);
-					glpfh = 0.25*(2*rgb[indx][1]+rgb[indx+2][1]+rgb[indx-2][1]);
-					rblpfv[indx] = eps+fabs(glpfv - 0.25*(2*rgb[indx][c]+rgb[indx+v2][c]+rgb[indx-v2][c]));
-					rblpfh[indx] = eps+fabs(glpfh - 0.25*(2*rgb[indx][c]+rgb[indx+2][c]+rgb[indx-2][c]));
-					grblpfv[indx] = glpfv + 0.25*(2*rgb[indx][c]+rgb[indx+v2][c]+rgb[indx-v2][c]);
-					grblpfh[indx] = glpfh + 0.25*(2*rgb[indx][c]+rgb[indx+2][c]+rgb[indx-2][c]);
+					glpfv = 0.25*(2.0*rgb[1][indx]+rgb[1][indx+v2]+rgb[1][indx-v2]);
+					glpfh = 0.25*(2.0*rgb[1][indx]+rgb[1][indx+2]+rgb[1][indx-2]);
+					rblpfv[indx>>1] = eps+fabsf(glpfv - 0.25*(2.0*rgb[c][indx]+rgb[c][indx+v2]+rgb[c][indx-v2]));
+					rblpfh[indx>>1] = eps+fabsf(glpfh - 0.25*(2.0*rgb[c][indx]+rgb[c][indx+2]+rgb[c][indx-2]));
+					grblpfv[indx>>1] = glpfv + 0.25*(2.0*rgb[c][indx]+rgb[c][indx+v2]+rgb[c][indx-v2]);
+					grblpfh[indx>>1] = glpfh + 0.25*(2.0*rgb[c][indx]+rgb[c][indx+2]+rgb[c][indx-2]);
 				}
 			
 			// along line segments, find the point along each segment that minimizes the color variance
 			// averaged over the tile; evaluate for up/down and left/right away from R/B grid point 
 			for (rr=8; rr < rr1-8; rr++)
 				for (cc=8+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < cc1-8; cc+=2, indx+=2) {
-					
+				
 					areawt[0][c]=areawt[1][c]=0;
-					
+				
 					//in linear interpolation, color differences are a quadratic function of interpolation position;
 					//solve for the interpolation position that minimizes color difference variance over the tile
-					
+				
 					//vertical
-					gdiff=0.3125*(rgb[indx+TS][1]-rgb[indx-TS][1])+0.09375*(rgb[indx+TS+1][1]-rgb[indx-TS+1][1]+rgb[indx+TS-1][1]-rgb[indx-TS-1][1]);
-					deltgrb=(rgb[indx][c]-rgb[indx][1]);
-					
-					gradwt=fabs(0.25*rbhpfv[indx]+0.125*(rbhpfv[indx+2]+rbhpfv[indx-2]) )*(grblpfv[indx-v2]+grblpfv[indx+v2])/(eps+0.1*grblpfv[indx-v2]+rblpfv[indx-v2]+0.1*grblpfv[indx+v2]+rblpfv[indx+v2]);
-					
+					gdiff=0.3125*(rgb[1][indx+TS]-rgb[1][indx-TS])+0.09375*(rgb[1][indx+TS+1]-rgb[1][indx-TS+1]+rgb[1][indx+TS-1]-rgb[1][indx-TS-1]);
+					deltgrb=(rgb[c][indx]-rgb[1][indx]);
+				
+					gradwt=fabsf(0.25*rbhpfv[indx>>1]+0.125*(rbhpfv[(indx>>1)+1]+rbhpfv[(indx>>1)-1]) )*(grblpfv[(indx>>1)-v1]+grblpfv[(indx>>1)+v1])/(eps+0.1*grblpfv[(indx>>1)-v1]+rblpfv[(indx>>1)-v1]+0.1*grblpfv[(indx>>1)+v1]+rblpfv[(indx>>1)+v1]);
+				
 					coeff[0][0][c] += gradwt*deltgrb*deltgrb;
 					coeff[0][1][c] += gradwt*gdiff*deltgrb;
 					coeff[0][2][c] += gradwt*gdiff*gdiff;
 					areawt[0][c]+=1;
-					
-					
+
 					//horizontal
-					gdiff=0.3125*(rgb[indx+1][1]-rgb[indx-1][1])+0.09375*(rgb[indx+1+TS][1]-rgb[indx-1+TS][1]+rgb[indx+1-TS][1]-rgb[indx-1-TS][1]);
-					deltgrb=(rgb[indx][c]-rgb[indx][1]);
-					
-					gradwt=fabs(0.25*rbhpfh[indx]+0.125*(rbhpfh[indx+v2]+rbhpfh[indx-v2]) )*(grblpfh[indx-2]+grblpfh[indx+2])/(eps+0.1*grblpfh[indx-2]+rblpfh[indx-2]+0.1*grblpfh[indx+2]+rblpfh[indx+2]);
-					
+					gdiff=0.3125*(rgb[1][indx+1]-rgb[1][indx-1])+0.09375*(rgb[1][indx+1+TS]-rgb[1][indx-1+TS]+rgb[1][indx+1-TS]-rgb[1][indx-1-TS]);
+					deltgrb=(rgb[c][indx]-rgb[1][indx]);
+
+					gradwt=fabsf(0.25*rbhpfh[indx>>1]+0.125*(rbhpfh[(indx>>1)+v1]+rbhpfh[(indx>>1)-v1]) )*(grblpfh[(indx>>1)-1]+grblpfh[(indx>>1)+1])/(eps+0.1*grblpfh[(indx>>1)-1]+rblpfh[(indx>>1)-1]+0.1*grblpfh[(indx>>1)+1]+rblpfh[(indx>>1)+1]);
+
 					coeff[1][0][c] += gradwt*deltgrb*deltgrb;
 					coeff[1][1][c] += gradwt*gdiff*deltgrb;
 					coeff[1][2][c] += gradwt*gdiff*gdiff;
 					areawt[1][c]+=1;
-					
-					
+
 					//	In Mathematica,
 					//  f[x_]=Expand[Total[Flatten[
 					//  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
@@ -515,21 +536,17 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 
 					//data structure = CAshift[vert/hor][color]
 					//j=0=vert, 1=hor
-					
-						
-					offset[j][c]=floor(CAshift[j][c]);
+
 					//offset gives NW corner of square containing the min; j=0=vert, 1=hor
 					
-					if (fabs(CAshift[j][c])<2.0) {
-						blockave[j][c] += CAshift[j][c];
-						blocksqave[j][c] += SQR(CAshift[j][c]);
-						blockdenom[j][c] += 1;
+					if (fabsf(CAshift[j][c])<2.0f) {
+						blockavethr[j][c] += CAshift[j][c];
+						blocksqavethr[j][c] += SQR(CAshift[j][c]);
+						blockdenomthr[j][c] += 1;
 					}
 				}//vert/hor
 			}//color
 			
-			
-			
 			/* CAshift[j][c] are the locations  
 			 that minimize color difference variances; 
 			 This is the approximate _optical_ location of the R/B pixels */
@@ -542,20 +559,38 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				//if (c==0) printf("vblock= %d hblock= %d blockshiftsmedian= %f \n",vblock,hblock,blockshifts[(vblock)*hblsz+hblock][c][0]);
 			}
 
-			if(plistener) plistener->setProgress(0.5*fabs((float)top/height));
-
+			if(plistener) {
+				progress+=(double)((TS-border2)*(TS-border2))/(2*height*width);
+				if (progress>1.0)
+				{
+					progress=1.0;
+				}
+				plistener->setProgress(progress);
+			}
 		}
-	}
+
 	//end of diagnostic pass
-	
+#pragma omp critical	
+{
+	for (j=0; j<2; j++)
+		for (c=0; c<3; c+=2) {
+			blockdenom[j][c] += blockdenomthr[j][c];
+			blocksqave[j][c] += blocksqavethr[j][c];
+			blockave[j][c]   += blockavethr[j][c];
+		}
+}
+#pragma omp barrier
+
+#pragma omp single
+{
 	for (j=0; j<2; j++)
 		for (c=0; c<3; c+=2) {
 			if (blockdenom[j][c]) {
 				blockvar[j][c] = blocksqave[j][c]/blockdenom[j][c]-SQR(blockave[j][c]/blockdenom[j][c]);
 			} else {
+				processpasstwo = false;
 				printf ("blockdenom vanishes \n");
-                                free(buffer1);
-				return;
+				break;
 			}
 		}
 	
@@ -566,120 +601,122 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 	
 	//now prepare for CA correction pass
 	//first, fill border blocks of blockshift array
-	for (vblock=1; vblock<vblsz-1; vblock++) {//left and right sides
-		for (c=0; c<3; c+=2) {
-			for (i=0; i<2; i++) {
-				blockshifts[vblock*hblsz][c][i]=blockshifts[(vblock)*hblsz+2][c][i];
-				blockshifts[vblock*hblsz+hblsz-1][c][i]=blockshifts[(vblock)*hblsz+hblsz-3][c][i];
-			}
-		}
-	}
-	for (hblock=0; hblock<hblsz; hblock++) {//top and bottom sides
-		for (c=0; c<3; c+=2) {
-			for (i=0; i<2; i++) {
-				blockshifts[hblock][c][i]=blockshifts[2*hblsz+hblock][c][i];
-				blockshifts[(vblsz-1)*hblsz+hblock][c][i]=blockshifts[(vblsz-3)*hblsz+hblock][c][i];
-			}
-		}
-	}
-	//end of filling border pixels of blockshift array
-	
-	//initialize fit arrays
-	for (i=0; i<256; i++) {polymat[0][0][i] = polymat[0][1][i] = polymat[2][0][i] = polymat[2][1][i] = 0;}
-	for (i=0; i<16; i++) {shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[2][0][i] = shiftmat[2][1][i] = 0;}
-	
-	for (vblock=1; vblock<vblsz-1; vblock++)
-		for (hblock=1; hblock<hblsz-1; hblock++) {
-			// block 3x3 median of blockshifts for robustness
+	if(processpasstwo) {
+		for (vblock=1; vblock<vblsz-1; vblock++) {//left and right sides
 			for (c=0; c<3; c+=2) {
-				for (dir=0; dir<2; dir++) {
-					p[0] = blockshifts[(vblock-1)*hblsz+hblock-1][c][dir];
-					p[1] = blockshifts[(vblock-1)*hblsz+hblock][c][dir];
-					p[2] = blockshifts[(vblock-1)*hblsz+hblock+1][c][dir];
-					p[3] = blockshifts[(vblock)*hblsz+hblock-1][c][dir];
-					p[4] = blockshifts[(vblock)*hblsz+hblock][c][dir];
-					p[5] = blockshifts[(vblock)*hblsz+hblock+1][c][dir];
-					p[6] = blockshifts[(vblock+1)*hblsz+hblock-1][c][dir];
-					p[7] = blockshifts[(vblock+1)*hblsz+hblock][c][dir];
-					p[8] = blockshifts[(vblock+1)*hblsz+hblock+1][c][dir];
-					PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
-					PIX_SORT(p[0],p[1]); PIX_SORT(p[3],p[4]); PIX_SORT(p[6],p[7]);
-					PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
-					PIX_SORT(p[0],p[3]); PIX_SORT(p[5],p[8]); PIX_SORT(p[4],p[7]);
-					PIX_SORT(p[3],p[6]); PIX_SORT(p[1],p[4]); PIX_SORT(p[2],p[5]);
-					PIX_SORT(p[4],p[7]); PIX_SORT(p[4],p[2]); PIX_SORT(p[6],p[4]);
-					PIX_SORT(p[4],p[2]);
-					blockshifts[(vblock)*hblsz+hblock][c][dir] = p[4];
-					//if (c==0 && dir==0) printf("vblock= %d hblock= %d blockshiftsmedian= %f \n",vblock,hblock,p[4]);
+				for (i=0; i<2; i++) {
+					blockshifts[vblock*hblsz][c][i]=blockshifts[(vblock)*hblsz+2][c][i];
+					blockshifts[vblock*hblsz+hblsz-1][c][i]=blockshifts[(vblock)*hblsz+hblsz-3][c][i];
 				}
-				
-				
-				//if (verbose) fprintf (stderr,_("tile vshift hshift (%d %d %4f %4f)...\n"),vblock, hblock, blockshifts[(vblock)*hblsz+hblock][c][0], blockshifts[(vblock)*hblsz+hblock][c][1]);
-				
-				
-				//now prepare coefficient matrix; use only data points within two std devs of zero
-				if (SQR(blockshifts[(vblock)*hblsz+hblock][c][0])>4.0*blockvar[0][c] || SQR(blockshifts[(vblock)*hblsz+hblock][c][1])>4.0*blockvar[1][c]) continue;
-				numblox[c] += 1;
-				for (dir=0; dir<2; dir++) {
-					for (i=0; i<polyord; i++) {
-						for (j=0; j<polyord; j++) {
-							for (m=0; m<polyord; m++)
-								for (n=0; n<polyord; n++) {
-									polymat[c][dir][numpar*(polyord*i+j)+(polyord*m+n)] += (float)pow((float)vblock,i+m)*pow((float)hblock,j+n)*blockwt[vblock*hblsz+hblock];
-								}
-							shiftmat[c][dir][(polyord*i+j)] += (float)pow((float)vblock,i)*pow((float)hblock,j)*blockshifts[(vblock)*hblsz+hblock][c][dir]*blockwt[vblock*hblsz+hblock];
-						}
-						//if (c==0 && dir==0) {printf("i= %d j= %d shiftmat= %f \n",i,j,shiftmat[c][dir][(polyord*i+j)]);}
-					}//monomials	
-				}//dir 
-				
-			}//c
-		}//blocks
-	
-	numblox[1]=min(numblox[0],numblox[2]);
-	//if too few data points, restrict the order of the fit to linear
-	if (numblox[1]<32) {
-		polyord=2; numpar=4;
-		if (numblox[1]< 10) {
-			printf ("numblox = %d \n",numblox[1]);
-                        free(buffer1);
-			return;
-		}
-	}
-	
-	//fit parameters to blockshifts
-	for (c=0; c<3; c+=2)
-		for (dir=0; dir<2; dir++) {
-			res = LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir]);
-			if (res) {
-				printf ("CA correction pass failed -- can't solve linear equations for color %d direction %d...\n",c,dir);
-                                free(buffer1);
-				return;
 			}
 		}
+		for (hblock=0; hblock<hblsz; hblock++) {//top and bottom sides
+			for (c=0; c<3; c+=2) {
+				for (i=0; i<2; i++) {
+					blockshifts[hblock][c][i]=blockshifts[2*hblsz+hblock][c][i];
+					blockshifts[(vblsz-1)*hblsz+hblock][c][i]=blockshifts[(vblsz-3)*hblsz+hblock][c][i];
+				}
+			}
+		}
+		//end of filling border pixels of blockshift array
+
+		//initialize fit arrays
+		for (i=0; i<256; i++) {polymat[0][0][i] = polymat[0][1][i] = polymat[2][0][i] = polymat[2][1][i] = 0;}
+		for (i=0; i<16; i++) {shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[2][0][i] = shiftmat[2][1][i] = 0;}
+		//#pragma omp for	collapse(2)
+		for (vblock=1; vblock<vblsz-1; vblock++)
+			for (hblock=1; hblock<hblsz-1; hblock++) {
+				// block 3x3 median of blockshifts for robustness
+				for (c=0; c<3; c+=2) {
+					for (dir=0; dir<2; dir++) {
+						p[0] = blockshifts[(vblock-1)*hblsz+hblock-1][c][dir];
+						p[1] = blockshifts[(vblock-1)*hblsz+hblock][c][dir];
+						p[2] = blockshifts[(vblock-1)*hblsz+hblock+1][c][dir];
+						p[3] = blockshifts[(vblock)*hblsz+hblock-1][c][dir];
+						p[4] = blockshifts[(vblock)*hblsz+hblock][c][dir];
+						p[5] = blockshifts[(vblock)*hblsz+hblock+1][c][dir];
+						p[6] = blockshifts[(vblock+1)*hblsz+hblock-1][c][dir];
+						p[7] = blockshifts[(vblock+1)*hblsz+hblock][c][dir];
+						p[8] = blockshifts[(vblock+1)*hblsz+hblock+1][c][dir];
+						PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
+						PIX_SORT(p[0],p[1]); PIX_SORT(p[3],p[4]); PIX_SORT(p[6],p[7]);
+						PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
+						PIX_SORT(p[0],p[3]); PIX_SORT(p[5],p[8]); PIX_SORT(p[4],p[7]);
+						PIX_SORT(p[3],p[6]); PIX_SORT(p[1],p[4]); PIX_SORT(p[2],p[5]);
+						PIX_SORT(p[4],p[7]); PIX_SORT(p[4],p[2]); PIX_SORT(p[6],p[4]);
+						PIX_SORT(p[4],p[2]);
+						blockshifts[(vblock)*hblsz+hblock][c][dir] = p[4];
+						//if (c==0 && dir==0) printf("vblock= %d hblock= %d blockshiftsmedian= %f \n",vblock,hblock,p[4]);
+					}
+
+					//if (verbose) fprintf (stderr,_("tile vshift hshift (%d %d %4f %4f)...\n"),vblock, hblock, blockshifts[(vblock)*hblsz+hblock][c][0], blockshifts[(vblock)*hblsz+hblock][c][1]);
+
+					//now prepare coefficient matrix; use only data points within two std devs of zero
+					if (SQR(blockshifts[(vblock)*hblsz+hblock][c][0])>4.0*blockvar[0][c] || SQR(blockshifts[(vblock)*hblsz+hblock][c][1])>4.0*blockvar[1][c]) continue;
+					numblox[c] += 1;
+					for (dir=0; dir<2; dir++) {
+						for (i=0; i<polyord; i++) {
+							for (j=0; j<polyord; j++) {
+								for (m=0; m<polyord; m++)
+									for (n=0; n<polyord; n++) {
+										polymat[c][dir][numpar*(polyord*i+j)+(polyord*m+n)] += (float)pow((float)vblock,i+m)*pow((float)hblock,j+n)*blockwt[vblock*hblsz+hblock];
+									}
+								shiftmat[c][dir][(polyord*i+j)] += (float)pow((float)vblock,i)*pow((float)hblock,j)*blockshifts[(vblock)*hblsz+hblock][c][dir]*blockwt[vblock*hblsz+hblock];
+							}
+							//if (c==0 && dir==0) {printf("i= %d j= %d shiftmat= %f \n",i,j,shiftmat[c][dir][(polyord*i+j)]);}
+						}//monomials	
+					}//dir 
+					
+				}//c
+			}//blocks
+
+		numblox[1]=min(numblox[0],numblox[2]);
+		//if too few data points, restrict the order of the fit to linear
+		if (numblox[1]<32) {
+			polyord=2; numpar=4;
+			if (numblox[1]< 10) {
+
+				printf ("numblox = %d \n",numblox[1]);
+				processpasstwo = false;
+			}
+		}
+		if(processpasstwo)
+			//fit parameters to blockshifts
+			for (c=0; c<3; c+=2)
+				for (dir=0; dir<2; dir++) {
+					res = LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir]);
+					if (res) {
+						printf ("CA correction pass failed -- can't solve linear equations for color %d direction %d...\n",c,dir);
+						processpasstwo = false;
+					}
+				}
+		}
 	//fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
 	} 
 	//end of initialization for CA correction pass
 	//only executed if cared and cablue are zero
-	
+}
 	// Main algorithm: Tile loop
-	//#pragma omp parallel for shared(image,height,width) private(top,left,indx,indx1) schedule(dynamic)
-	for (top=-border, vblock=1; top < height; top += TS-border2, vblock++)
-		for (left=-border, hblock=1; left < width; left += TS-border2, hblock++) {
+if(processpasstwo) {
+#pragma omp for schedule(dynamic) collapse(2) nowait
+	for (top=-border; top < height; top += TS-border2)
+		for (left=-border; left < width; left += TS-border2) {
+			vblock = ((top+border)/(TS-border2))+1;
+			hblock = ((left+border)/(TS-border2))+1;
 			int bottom = min(top+TS,height+border);
 			int right  = min(left+TS, width+border);
 			int rr1 = bottom - top;
 			int cc1 = right - left;
 			//t1_init = clock();
-			// rgb from input CFA data
-			// rgb values should be floating point number between 0 and 1 
-			// after white balance multipliers are applied 
 			if (top<0) {rrmin=border;} else {rrmin=0;}
 			if (left<0) {ccmin=border;} else {ccmin=0;}
 			if (bottom>height) {rrmax=height-top;} else {rrmax=rr1;}
 			if (right>width) {ccmax=width-left;} else {ccmax=cc1;}
-			
-			
+
+			// rgb from input CFA data
+			// rgb values should be floating point number between 0 and 1 
+			// after white balance multipliers are applied 
+
 			for (rr=rrmin; rr < rrmax; rr++)
 				for (row=rr+top, cc=ccmin; cc < ccmax; cc++) {
 					col = cc+left;
@@ -687,10 +724,10 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 					indx=row*width+col;
 					indx1=rr*TS+cc;	
 					//rgb[indx1][c] = image[indx][c]/65535.0f;
-					rgb[indx1][c] = (rawData[row][col])/65535.0f;
+					rgb[c][indx1] = (rawData[row][col])/65535.0f;
 					//rgb[indx1][c] = image[indx][c]/65535.0f;//for dcraw implementation
 
-					if ((c&1)==0) rgb[indx1][1] = Gtmp[indx];
+					if ((c&1)==0) rgb[1][indx1] = Gtmp[indx];
 				}
 			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 			//fill borders
@@ -698,36 +735,36 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=ccmin; cc<ccmax; cc++) {
 						c = FC(rr,cc);
-						rgb[rr*TS+cc][c] = rgb[(border2-rr)*TS+cc][c];
-						rgb[rr*TS+cc][1] = rgb[(border2-rr)*TS+cc][1];
+						rgb[c][rr*TS+cc] = rgb[c][(border2-rr)*TS+cc];
+						rgb[1][rr*TS+cc] = rgb[1][(border2-rr)*TS+cc];
 					}
 			}
 			if (rrmax<rr1) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=ccmin; cc<ccmax; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+cc][c] = (rawData[(height-rr-2)][left+cc])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][left+cc])/65535.0f;
 						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+left+cc][c])/65535.0f;//for dcraw implementation
 
-						rgb[(rrmax+rr)*TS+cc][1] = Gtmp[(height-rr-2)*width+left+cc];
+						rgb[1][(rrmax+rr)*TS+cc] = Gtmp[(height-rr-2)*width+left+cc];
 					}
 			}
 			if (ccmin>0) {
 				for (rr=rrmin; rr<rrmax; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[rr*TS+cc][c] = rgb[rr*TS+border2-cc][c];
-						rgb[rr*TS+cc][1] = rgb[rr*TS+border2-cc][1];
+						rgb[c][rr*TS+cc] = rgb[c][rr*TS+border2-cc];
+						rgb[1][rr*TS+cc] = rgb[1][rr*TS+border2-cc];
 					}
 			}
 			if (ccmax<cc1) {
 				for (rr=rrmin; rr<rrmax; rr++)
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[rr*TS+ccmax+cc][c] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
+						rgb[c][rr*TS+ccmax+cc] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
 						//rgb[rr*TS+ccmax+cc][c] = (image[(top+rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 
-						rgb[rr*TS+ccmax+cc][1] = Gtmp[(top+rr)*width+(width-cc-2)];
+						rgb[1][rr*TS+ccmax+cc] = Gtmp[(top+rr)*width+(width-cc-2)];
 					}
 			}
 			
@@ -736,65 +773,65 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rr)*TS+cc][c] = (rawData[border2-rr][border2-cc])/65535.0f;
+						rgb[c][(rr)*TS+cc] = (rawData[border2-rr][border2-cc])/65535.0f;
 						//rgb[(rr)*TS+cc][c] = (rgb[(border2-rr)*TS+(border2-cc)][c]);//for dcraw implementation
 
-						rgb[(rr)*TS+cc][1] = Gtmp[(border2-rr)*width+border2-cc];
+						rgb[1][(rr)*TS+cc] = Gtmp[(border2-rr)*width+border2-cc];
 					}
 			}
 			if (rrmax<rr1 && ccmax<cc1) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+ccmax+cc][c] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+ccmax+cc] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
 						//rgb[(rrmax+rr)*TS+ccmax+cc][c] = (image[(height-rr-2)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 
-						rgb[(rrmax+rr)*TS+ccmax+cc][1] = Gtmp[(height-rr-2)*width+(width-cc-2)];
+						rgb[1][(rrmax+rr)*TS+ccmax+cc] = Gtmp[(height-rr-2)*width+(width-cc-2)];
 					}
 			}
 			if (rrmin>0 && ccmax<cc1) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rr)*TS+ccmax+cc][c] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
+						rgb[c][(rr)*TS+ccmax+cc] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
 						//rgb[(rr)*TS+ccmax+cc][c] = (image[(border2-rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
 
-						rgb[(rr)*TS+ccmax+cc][1] = Gtmp[(border2-rr)*width+(width-cc-2)];
+						rgb[1][(rr)*TS+ccmax+cc] = Gtmp[(border2-rr)*width+(width-cc-2)];
 					}
 			}
 			if (rrmax<rr1 && ccmin>0) {
 				for (rr=0; rr<border; rr++) 
 					for (cc=0; cc<border; cc++) {
 						c=FC(rr,cc);
-						rgb[(rrmax+rr)*TS+cc][c] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
+						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
 						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+(border2-cc)][c])/65535.0f;//for dcraw implementation
 
-						rgb[(rrmax+rr)*TS+cc][1] = Gtmp[(height-rr-2)*width+(border2-cc)];
+						rgb[1][(rrmax+rr)*TS+cc] = Gtmp[(height-rr-2)*width+(border2-cc)];
 					}
 			}
-			
+
 			//end of border fill
 			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	
-			
+
 			if (cared || cablue) {
 				//manual CA correction; use red/blue slider values to set CA shift parameters
 				for (rr=3; rr < rr1-3; rr++)
 					for (row=rr+top, cc=3, indx=rr*TS+cc; cc < cc1-3; cc++, indx++) {
 						col = cc+left;
 						c = FC(rr,cc);
-						
+
 						if (c!=1) {
 							//compute directional weights using image gradients
-							wtu=1/SQR(eps+fabs(rgb[(rr+1)*TS+cc][1]-rgb[(rr-1)*TS+cc][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr-2)*TS+cc][c])+fabs(rgb[(rr-1)*TS+cc][1]-rgb[(rr-3)*TS+cc][1]));
-							wtd=1/SQR(eps+fabs(rgb[(rr-1)*TS+cc][1]-rgb[(rr+1)*TS+cc][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr+2)*TS+cc][c])+fabs(rgb[(rr+1)*TS+cc][1]-rgb[(rr+3)*TS+cc][1]));
-							wtl=1/SQR(eps+fabs(rgb[(rr)*TS+cc+1][1]-rgb[(rr)*TS+cc-1][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr)*TS+cc-2][c])+fabs(rgb[(rr)*TS+cc-1][1]-rgb[(rr)*TS+cc-3][1]));
-							wtr=1/SQR(eps+fabs(rgb[(rr)*TS+cc-1][1]-rgb[(rr)*TS+cc+1][1])+fabs(rgb[(rr)*TS+cc][c]-rgb[(rr)*TS+cc+2][c])+fabs(rgb[(rr)*TS+cc+1][1]-rgb[(rr)*TS+cc+3][1]));
-							
+							wtu=1.0/SQR(eps+fabsf(rgb[1][(rr+1)*TS+cc]-rgb[1][(rr-1)*TS+cc])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr-2)*TS+cc])+fabsf(rgb[1][(rr-1)*TS+cc]-rgb[1][(rr-3)*TS+cc]));
+							wtd=1.0/SQR(eps+fabsf(rgb[1][(rr-1)*TS+cc]-rgb[1][(rr+1)*TS+cc])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr+2)*TS+cc])+fabsf(rgb[1][(rr+1)*TS+cc]-rgb[1][(rr+3)*TS+cc]));
+							wtl=1.0/SQR(eps+fabsf(rgb[1][(rr)*TS+cc+1]-rgb[1][(rr)*TS+cc-1])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr)*TS+cc-2])+fabsf(rgb[1][(rr)*TS+cc-1]-rgb[1][(rr)*TS+cc-3]));
+							wtr=1.0/SQR(eps+fabsf(rgb[1][(rr)*TS+cc-1]-rgb[1][(rr)*TS+cc+1])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr)*TS+cc+2])+fabsf(rgb[1][(rr)*TS+cc+1]-rgb[1][(rr)*TS+cc+3]));
+
 							//store in rgb array the interpolated G value at R/B grid points using directional weighted average
-							rgb[indx][1]=(wtu*rgb[indx-v1][1]+wtd*rgb[indx+v1][1]+wtl*rgb[indx-1][1]+wtr*rgb[indx+1][1])/(wtu+wtd+wtl+wtr);
+							rgb[1][indx]=(wtu*rgb[1][indx-v1]+wtd*rgb[1][indx+v1]+wtl*rgb[1][indx-1]+wtr*rgb[1][indx+1])/(wtu+wtd+wtl+wtr);
 						}
 						if (row>-1 && row<height && col>-1 && col<width)
-							Gtmp[row*width + col] = rgb[indx][1];
+							Gtmp[row*width + col] = rgb[1][indx];
 					}
 				float hfrac = -((float)(hblock-0.5)/(hblsz-2) - 0.5);
 				float vfrac = -((float)(vblock-0.5)/(vblsz-2) - 0.5)*height/width;
@@ -852,15 +889,15 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 				for (cc=4+(FC(rr,2)&1), c = FC(rr,cc); cc < cc1-4; cc+=2) {
 					//perform CA correction using color ratios or color differences
 					
-					Ginthfloor=(1-shifthfrac[c])*rgb[(rr+shiftvfloor[c])*TS+cc+shifthfloor[c]][1]+(shifthfrac[c])*rgb[(rr+shiftvfloor[c])*TS+cc+shifthceil[c]][1];
-					Ginthceil=(1-shifthfrac[c])*rgb[(rr+shiftvceil[c])*TS+cc+shifthfloor[c]][1]+(shifthfrac[c])*rgb[(rr+shiftvceil[c])*TS+cc+shifthceil[c]][1];
+					Ginthfloor=(1-shifthfrac[c])*rgb[1][(rr+shiftvfloor[c])*TS+cc+shifthfloor[c]]+(shifthfrac[c])*rgb[1][(rr+shiftvfloor[c])*TS+cc+shifthceil[c]];
+					Ginthceil=(1-shifthfrac[c])*rgb[1][(rr+shiftvceil[c])*TS+cc+shifthfloor[c]]+(shifthfrac[c])*rgb[1][(rr+shiftvceil[c])*TS+cc+shifthceil[c]];
 					//Gint is blinear interpolation of G at CA shift point
 					Gint=(1-shiftvfrac[c])*Ginthfloor+(shiftvfrac[c])*Ginthceil;
 					
 					//determine R/B at grid points using color differences at shift point plus interpolated G value at grid point
 					//but first we need to interpolate G-R/G-B to grid points...
-					grbdiff[(rr)*TS+cc]=Gint-rgb[(rr)*TS+cc][c];
-					gshift[(rr)*TS+cc]=Gint;
+					grbdiff[((rr)*TS+cc)>>1]=Gint-rgb[c][(rr)*TS+cc];
+					gshift[((rr)*TS+cc)>>1]=Gint;
 				}
 			
 			for (rr=8; rr < rr1-8; rr++)
@@ -868,72 +905,85 @@ void RawImageSource::CA_correct_RT(double cared, double cablue) {
 					
 					//if (rgb[indx][c]>clip_pt || Gtmp[indx]>clip_pt) continue;
 
-					grbdiffold = rgb[indx][1]-rgb[indx][c];
+					grbdiffold = rgb[1][indx]-rgb[c][indx];
 
 					//interpolate color difference from optical R/B locations to grid locations
-					grbdiffinthfloor=(1-shifthfrac[c]/2)*grbdiff[indx]+(shifthfrac[c]/2)*grbdiff[indx-2*GRBdir[1][c]];
-					grbdiffinthceil=(1-shifthfrac[c]/2)*grbdiff[(rr-2*GRBdir[0][c])*TS+cc]+(shifthfrac[c]/2)*grbdiff[(rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c]];
+					grbdiffinthfloor=(1.0f-shifthfrac[c]/2.0f)*grbdiff[indx>>1]+(shifthfrac[c]/2.0f)*grbdiff[(indx-2*GRBdir[1][c])>>1];
+					grbdiffinthceil=(1.0f-shifthfrac[c]/2.0f)*grbdiff[((rr-2*GRBdir[0][c])*TS+cc)>>1]+(shifthfrac[c]/2.0f)*grbdiff[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1];
 					//grbdiffint is bilinear interpolation of G-R/G-B at grid point
-					grbdiffint=(1-shiftvfrac[c]/2)*grbdiffinthfloor+(shiftvfrac[c]/2)*grbdiffinthceil;
+					grbdiffint=(1.0f-shiftvfrac[c]/2.0f)*grbdiffinthfloor+(shiftvfrac[c]/2.0f)*grbdiffinthceil;
 					
 					//now determine R/B at grid points using interpolated color differences and interpolated G value at grid point
-					RBint=rgb[indx][1]-grbdiffint;
+					RBint=rgb[1][indx]-grbdiffint;
 					
-					if (fabs(RBint-rgb[indx][c])<0.25*(RBint+rgb[indx][c])) {
-						if (fabs(grbdiffold)>fabs(grbdiffint) ) {
-							rgb[indx][c]=RBint;
+					if (fabsf(RBint-rgb[c][indx])<0.25f*(RBint+rgb[c][indx])) {
+						if (fabsf(grbdiffold)>fabsf(grbdiffint) ) {
+							rgb[c][indx]=RBint;
 						}
 					} else {
 						
 						//gradient weights using difference from G at CA shift points and G at grid points
-						p[0]=1/(eps+fabs(rgb[indx][1]-gshift[indx]));
-						p[1]=1/(eps+fabs(rgb[indx][1]-gshift[indx-2*GRBdir[1][c]]));
-						p[2]=1/(eps+fabs(rgb[indx][1]-gshift[(rr-2*GRBdir[0][c])*TS+cc]));
-						p[3]=1/(eps+fabs(rgb[indx][1]-gshift[(rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c]]));
+						p[0]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[indx>>1]));
+						p[1]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[(indx-2*GRBdir[1][c])>>1]));
+						p[2]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[((rr-2*GRBdir[0][c])*TS+cc)>>1]));
+						p[3]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1]));
 						
-						grbdiffint = (p[0]*grbdiff[indx]+p[1]*grbdiff[indx-2*GRBdir[1][c]]+
-									  p[2]*grbdiff[(rr-2*GRBdir[0][c])*TS+cc]+p[3]*grbdiff[(rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c]])/(p[0]+p[1]+p[2]+p[3]);
+						grbdiffint = (p[0]*grbdiff[indx>>1]+p[1]*grbdiff[(indx-2*GRBdir[1][c])>>1]+
+									  p[2]*grbdiff[((rr-2*GRBdir[0][c])*TS+cc)>>1]+p[3]*grbdiff[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1])/(p[0]+p[1]+p[2]+p[3]);
 						
 						//now determine R/B at grid points using interpolated color differences and interpolated G value at grid point
-						if (fabs(grbdiffold)>fabs(grbdiffint) ) {
-							rgb[indx][c]=rgb[indx][1]-grbdiffint;
+						if (fabsf(grbdiffold)>fabsf(grbdiffint) ) {
+							rgb[c][indx]=rgb[1][indx]-grbdiffint;
 						}
 					}
 					
 					//if color difference interpolation overshot the correction, just desaturate
 					if (grbdiffold*grbdiffint<0) {
-						rgb[indx][c]=rgb[indx][1]-0.5*(grbdiffold+grbdiffint);
+						rgb[c][indx]=rgb[1][indx]-0.5f*(grbdiffold+grbdiffint);
 					}
 				}
 			
-			// copy CA corrected results back to image matrix
-			for (rr=border; rr < rr1-border; rr++)
-				for (row=rr+top, cc=border+(FC(rr,2)&1); cc < cc1-border; cc+=2) {
+			// copy CA corrected results to temporary image matrix
+			for (rr=border; rr < rr1-border; rr++){
+				c = FC(rr+top, left + border+FC(rr+top,2)&1);
+				for (row=rr+top, cc=border+(FC(rr,2)&1),indx=(row*width+cc+left)>>1; cc < cc1-border; cc+=2,indx++) {
 					col = cc + left;
-					indx = row*width + col;
-					c = FC(row,col);
-					 
-					rawData[row][col] = 65535.0f*rgb[(rr)*TS+cc][c] + 0.5f;
+					RawDataTmp[indx] = 65535.0f*rgb[c][(rr)*TS+cc] + 0.5f;
 					//image[indx][c] = CLIP((int)(65535.0*rgb[(rr)*TS+cc][c] + 0.5));//for dcraw implementation
+				}
+			}
 
-				} 
-			
-			if(plistener) plistener->setProgress(0.5+0.5*fabs((float)top/height));
-
+			if(plistener) {
+				progress+=(double)((TS-border2)*(TS-border2))/(2*height*width);
+				if (progress>1.0)
+				{
+					progress=1.0;
+				}
+				plistener->setProgress(progress);
+			}
 		}
+	}
+	free(buffer);
+
+#pragma omp barrier
+
+// copy temporary image matrix back to image matrix
+#pragma omp for
+	for(row=0;row<height;row++)
+		for(col=0+(FC(row,0)&1),indx=(row*width+col)>>1;col<width;col+=2,indx++)
+			rawData[row][col] = RawDataTmp[indx];
+	
+	
 	
 	// clean up
-	free(buffer);
+
+}
+	
 	free(Gtmp);
 	free(buffer1);
-	
-
+	free(RawDataTmp);
 	
 #undef TS
-//#undef border
-//#undef border2
+#undef TSH
 #undef PIX_SORT
 }
-
-
-