rawTherapee/rtengine/CA_correct_RT.cc

////////////////////////////////////////////////////////////////
//
//		Chromatic Aberration Auto-correction
//
//		copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
//
//
// code dated: November 26, 2010
//
//	CA_correct_RT.cc is free software: you can redistribute it and/or modify
//	it under the terms of the GNU General Public License as published by
//	the Free Software Foundation, either version 3 of the License, or
//	(at your option) any later version.
//
//	This program is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
//	You should have received a copy of the GNU General Public License
//	along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#include "rtengine.h"
#include "rawimagesource.h"
#include "rt_math.h"

using namespace std;
using namespace rtengine;

int RawImageSource::LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
{
//==============================================================================
// Solves the dense linear system  pfMatr * x = pfVect  by Gaussian elimination
// with partial (row) pivoting, followed by back substitution.
//
// return 1 if the system could not be solved (a pivot is exactly zero),
//        0 if the system was solved
// nDim - system dimension
// pfMatr - nDim x nDim matrix of coefficients, row-major
//          (element (row, col) is stored at pfMatr[row*nDim + col])
// pfVect - vector of free members (right-hand side), nDim entries
// pfSolution - receives the solution vector, nDim entries
// pfMatr is reduced in place to upper-triangular form by this call
//        (entries below the diagonal are not read again afterwards)
// pfVect is modified by this call
//
// Developer: Henry Guennadi Levkin
//
//==============================================================================

	double fAcc;

	for (int k = 0; k < nDim - 1; k++) {// base row of matrix
		// search column k (rows k..nDim-1) for the entry of largest magnitude.
		// Magnitudes are taken with the double-precision fabs so that the
		// comparison is not done at float precision, and the running maximum
		// is always stored as an absolute value.
		double fMaxElem = fabs(pfMatr[k*nDim + k]);
		int m = k;
		for (int i = k + 1; i < nDim; i++) {
			const double fCandidate = fabs(pfMatr[i*nDim + k]);
			if (fMaxElem < fCandidate) {
				fMaxElem = fCandidate;
				m = i;
			}
		}

		// permutation of base line (index k) and max element line (index m);
		// columns left of k are no longer used, so only columns k.. are swapped
		if (m != k) {
			for (int i = k; i < nDim; i++) {
				fAcc               = pfMatr[k*nDim + i];
				pfMatr[k*nDim + i] = pfMatr[m*nDim + i];
				pfMatr[m*nDim + i] = fAcc;
			}
			fAcc      = pfVect[k];
			pfVect[k] = pfVect[m];
			pfVect[m] = fAcc;
		}

		if (pfMatr[k*nDim + k] == 0.) {
			// the whole remaining column is zero: linear system has no unique solution
			return 1; // needs improvement !!!
		}

		// triangulation of matrix with coefficients
		for (int j = k + 1; j < nDim; j++) {// current row of matrix
			fAcc = - pfMatr[j*nDim + k] / pfMatr[k*nDim + k];
			for (int i = k; i < nDim; i++) {
				pfMatr[j*nDim + i] = pfMatr[j*nDim + i] + fAcc*pfMatr[k*nDim + i];
			}
			pfVect[j] = pfVect[j] + fAcc*pfVect[k]; // free member recalculation
		}
	}

	// The elimination loop above stops at row nDim-2, so the last diagonal
	// element has not been tested yet; a zero here would make the back
	// substitution below divide by zero.
	if (nDim > 0 && pfMatr[(nDim-1)*nDim + (nDim-1)] == 0.) {
		return 1;
	}

	// back substitution, from the last unknown to the first
	for (int k = nDim - 1; k >= 0; k--) {
		pfSolution[k] = pfVect[k];
		for (int i = k + 1; i < nDim; i++) {
			pfSolution[k] -= (pfMatr[k*nDim + i]*pfSolution[i]);
		}
		pfSolution[k] = pfSolution[k] / pfMatr[k*nDim + k];
	}

	return 0;
}
//end of linear equation solver
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


void RawImageSource::CA_correct_RT(double cared, double cablue) {
// multithreaded by Ingo Weyrich
#define TS 128		// Tile size
#define TSH 64		// Half Tile size
#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }

	volatile double progress = 0.0;
	if(plistener) plistener->setProgress (progress);

	bool autoCA = (cared==0 && cablue==0);
	// local variables
	int width=W, height=H;
	//temporary array to store simple interpolation of G
	float (*Gtmp);
	Gtmp = (float (*)) calloc ((height)*(width), sizeof *Gtmp);

	// temporary array to avoid race conflicts, only every second pixel needs to be saved here
	float (*RawDataTmp);
	RawDataTmp = (float*) malloc( height * width * sizeof(float)/2);

	float	blockave[2][3]={{0,0,0},{0,0,0}}, blocksqave[2][3]={{0,0,0},{0,0,0}}, blockdenom[2][3]={{0,0,0},{0,0,0}}, blockvar[2][3];

	// Because we can't break parallel processing, we need a switch to handle the errors
	bool processpasstwo = true;

	//block CA shift values and weight assigned to block
	char		*buffer1;				// vblsz*hblsz*(3*2+1)
	float		(*blockwt);				// vblsz*hblsz
	float		(*blockshifts)[3][2];	// vblsz*hblsz*3*2


	const int border=8;
	const int border2=16;

	int vz1, hz1;
	if((height+border2)%(TS-border2)==0) vz1=1; else vz1=0;
    if((width+border2)%(TS-border2)==0) hz1=1; else hz1=0;

	int vblsz, hblsz;
    vblsz=ceil((float)(height+border2)/(TS-border2)+2+vz1);
    hblsz=ceil((float)(width+border2)/(TS-border2)+2+hz1);

	buffer1 = (char *) malloc(vblsz*hblsz*(3*2+1)*sizeof(float));
	//merror(buffer1,"CA_correct()");
	memset(buffer1,0,vblsz*hblsz*(3*2+1)*sizeof(float));
	// block CA shifts
	blockwt		= (float (*))			(buffer1);
	blockshifts	= (float (*)[3][2])		(buffer1+(vblsz*hblsz*sizeof(float)));

	double	polymat[3][2][256], shiftmat[3][2][16], fitparams[3][2][16];
	for (int i=0; i<256; i++) {polymat[0][0][i] = polymat[0][1][i] = polymat[2][0][i] = polymat[2][1][i] = 0;}
	for (int i=0; i<16; i++) {shiftmat[0][0][i] = shiftmat[0][1][i] = shiftmat[2][0][i] = shiftmat[2][1][i] = 0;}

	//order of 2d polynomial fit (polyord), and numpar=polyord^2
	int polyord=4, numpar=16;
	int numblox[3]={0,0,0};

#pragma omp parallel shared(Gtmp,width,height,blockave,blocksqave,blockdenom,blockvar,blockwt,blockshifts,polymat,shiftmat,fitparams,polyord,numpar)
{
	int progresscounter = 0;
	//number of blocks used in the fit
	int numbloxthr[3]={0,0,0};

	int rrmin, rrmax, ccmin, ccmax;
	int top, left, row, col;
	int rr, cc, c, indx, indx1, i, j, k, m, n, dir;
	//number of pixels in a tile contributing to the CA shift diagnostic
	int areawt[2][3];
	//direction of the CA shift in a tile
	int GRBdir[2][3];
	//offset data of the plaquette where the optical R/B data are sampled
	int offset[2][3];
	int	shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
	//number of tiles in the image
	int vblock, hblock;
	//int verbose=1;
	//flag indicating success or failure of polynomial fit
	int res;
	//shifts to location of vertical and diagonal neighbors
	const int v1=TS, v2=2*TS, v3=3*TS, v4=4*TS;//, p1=-TS+1, p2=-2*TS+2, p3=-3*TS+3, m1=TS+1, m2=2*TS+2, m3=3*TS+3;

	float eps=1e-5f, eps2=1e-10f;	//tolerance to avoid dividing by zero

	//adaptive weights for green interpolation
	float	wtu, wtd, wtl, wtr;
	//local quadratic fit to shift data within a tile
	float	coeff[2][3][3];
	//measured CA shift parameters for a tile
	float	CAshift[2][3];
	//polynomial fit coefficients
	//residual CA shift amount within a plaquette
	float	shifthfrac[3], shiftvfrac[3];
	//temporary storage for median filter
	float	temp, p[9];
	//temporary parameters for tile CA evaluation
	float	gdiff, deltgrb;
	//interpolated G at edge of plaquette
	float	Ginthfloor, Ginthceil, Gint, RBint, gradwt;
	//interpolated color difference at edge of plaquette
	float	grbdiffinthfloor, grbdiffinthceil, grbdiffint, grbdiffold;
	//per thread data for evaluation of block CA shift variance
	float	blockavethr[2][3]={{0,0,0},{0,0,0}}, blocksqavethr[2][3]={{0,0,0},{0,0,0}}, blockdenomthr[2][3]={{0,0,0},{0,0,0}};//, blockvarthr[2][3];

	//low and high pass 1D filters of G in vertical/horizontal directions
	float	glpfh, glpfv;

	//max allowed CA shift
	const float bslim = 3.99;
	//gaussians for low pass filtering of G and R/B
	//static const float gaussg[5] = {0.171582, 0.15839, 0.124594, 0.083518, 0.0477063};//sig=2.5
	//static const float gaussrb[3] = {0.332406, 0.241376, 0.0924212};//sig=1.25

	//block CA shift values and weight assigned to block

	char		*buffer;			// TS*TS*16
	//rgb data in a tile
	float* rgb[3];
	//color differences
	float         (*grbdiff);		// TS*TS*4
	//green interpolated to optical sample points for R/B
	float         (*gshift);		// TS*TS*4
	//high pass filter for R/B in horizontal direction
	float         (*rbhpfh);		// TS*TS*4
	//high pass filter for R/B in vertical direction
	float         (*rbhpfv);		// TS*TS*4
	//low pass filter for R/B in horizontal direction
	float         (*rblpfh);		// TS*TS*4
	//low pass filter for R/B in vertical direction
	float         (*rblpfv);		// TS*TS*4
	//low pass filter for color differences in horizontal direction
	float         (*grblpfh);		// TS*TS*4
	//low pass filter for color differences in vertical direction
	float         (*grblpfv);		// TS*TS*4


	/* assign working space; this would not be necessary
	 if the algorithm is part of the larger pre-interpolation processing */
	buffer = (char *) malloc(3*sizeof(float)*TS*TS + 8*sizeof(float)*TS*TSH + 10*64 + 64);
	//merror(buffer,"CA_correct()");
	memset(buffer,0,3*sizeof(float)*TS*TS + 8*sizeof(float)*TS*TSH + 10*64 + 64);

	char 	*data;
	data 	= buffer;

//  buffers aligned to size of cacheline
//	data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);


	// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
	rgb[0]      = (float (*))		data;
	rgb[1]		= (float (*))		(data + 1*sizeof(float)*TS*TS + 1*64);
	rgb[2]		= (float (*))		(data + 2*sizeof(float)*TS*TS + 2*64);
	grbdiff		= (float (*))		(data +	3*sizeof(float)*TS*TS + 3*64);
	gshift		= (float (*))		(data +	3*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 4*64);
	rbhpfh		= (float (*))		(data +	4*sizeof(float)*TS*TS + 5*64);
	rbhpfv		= (float (*))		(data +	4*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 6*64);
	rblpfh		= (float (*))		(data +	5*sizeof(float)*TS*TS + 7*64);
	rblpfv		= (float (*))		(data +	5*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 8*64);
	grblpfh		= (float (*))		(data +	6*sizeof(float)*TS*TS + 9*64);
	grblpfv		= (float (*))		(data +	6*sizeof(float)*TS*TS + sizeof(float)*TS*TSH + 10*64);


	if (autoCA) {
	// Main algorithm: Tile loop
#pragma omp for collapse(2) schedule(dynamic) nowait
	for (top=-border ; top < height; top += TS-border2)
		for (left=-border; left < width; left += TS-border2) {
			vblock = ((top+border)/(TS-border2))+1;
			hblock = ((left+border)/(TS-border2))+1;
			int bottom = min(top+TS,height+border);
			int right  = min(left+TS, width+border);
			int rr1 = bottom - top;
			int cc1 = right - left;
			//t1_init = clock();
			if (top<0) {rrmin=border;} else {rrmin=0;}
			if (left<0) {ccmin=border;} else {ccmin=0;}
			if (bottom>height) {rrmax=height-top;} else {rrmax=rr1;}
			if (right>width) {ccmax=width-left;} else {ccmax=cc1;}

			// rgb from input CFA data
			// rgb values should be floating point number between 0 and 1
			// after white balance multipliers are applied

			for (rr=rrmin; rr < rrmax; rr++)
				for (row=rr+top, cc=ccmin; cc < ccmax; cc++) {
					col = cc+left;
					c = FC(rr,cc);
					indx=row*width+col;
					indx1=rr*TS+cc;
					rgb[c][indx1] = (rawData[row][col])/65535.0f;
					//rgb[indx1][c] = image[indx][c]/65535.0f;//for dcraw implementation
				}

			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
			//fill borders
			if (rrmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=ccmin; cc<ccmax; cc++) {
						c = FC(rr,cc);
						rgb[c][rr*TS+cc] = rgb[c][(border2-rr)*TS+cc];
					}
			}
			if (rrmax<rr1) {
				for (rr=0; rr<border; rr++)
					for (cc=ccmin; cc<ccmax; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][left+cc])/65535.0f;
						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+left+cc][c])/65535.0f;//for dcraw implementation
					}
			}
			if (ccmin>0) {
				for (rr=rrmin; rr<rrmax; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][rr*TS+cc] = rgb[c][rr*TS+border2-cc];
					}
			}
			if (ccmax<cc1) {
				for (rr=rrmin; rr<rrmax; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][rr*TS+ccmax+cc] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
						//rgb[rr*TS+ccmax+cc][c] = (image[(top+rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
					}
			}

			//also, fill the image corners
			if (rrmin>0 && ccmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rr)*TS+cc] = (rawData[border2-rr][border2-cc])/65535.0f;
						//rgb[(rr)*TS+cc][c] = (rgb[(border2-rr)*TS+(border2-cc)][c]);//for dcraw implementation
					}
			}
			if (rrmax<rr1 && ccmax<cc1) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+ccmax+cc] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
						//rgb[(rrmax+rr)*TS+ccmax+cc][c] = (image[(height-rr-2)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
					}
			}
			if (rrmin>0 && ccmax<cc1) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rr)*TS+ccmax+cc] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
						//rgb[(rr)*TS+ccmax+cc][c] = (image[(border2-rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation
					}
			}
			if (rrmax<rr1 && ccmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+(border2-cc)][c])/65535.0f;//for dcraw implementation
					}
			}

			//end of border fill
			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


			for (j=0; j<2; j++)
				for (k=0; k<3; k++)
					for (c=0; c<3; c+=2) {
						coeff[j][k][c]=0;
					}
			//end of initialization


			for (rr=3; rr < rr1-3; rr++)
				for (row=rr+top, cc=3, indx=rr*TS+cc; cc < cc1-3; cc++, indx++) {
					col = cc+left;
					c = FC(rr,cc);

					if (c!=1) {
						//compute directional weights using image gradients
						wtu=1.0/SQR(eps+fabsf(rgb[1][indx+v1]-rgb[1][indx-v1])+fabsf(rgb[c][indx]-rgb[c][indx-v2])+fabsf(rgb[1][indx-v1]-rgb[1][indx-v3]));
						wtd=1.0/SQR(eps+fabsf(rgb[1][indx-v1]-rgb[1][indx+v1])+fabsf(rgb[c][indx]-rgb[c][indx+v2])+fabsf(rgb[1][indx+v1]-rgb[1][indx+v3]));
						wtl=1.0/SQR(eps+fabsf(rgb[1][indx+1]-rgb[1][indx-1])+fabsf(rgb[c][indx]-rgb[c][indx-2])+fabsf(rgb[1][indx-1]-rgb[1][indx-3]));
						wtr=1.0/SQR(eps+fabsf(rgb[1][indx-1]-rgb[1][indx+1])+fabsf(rgb[c][indx]-rgb[c][indx+2])+fabsf(rgb[1][indx+1]-rgb[1][indx+3]));

						//store in rgb array the interpolated G value at R/B grid points using directional weighted average
						rgb[1][indx]=(wtu*rgb[1][indx-v1]+wtd*rgb[1][indx+v1]+wtl*rgb[1][indx-1]+wtr*rgb[1][indx+1])/(wtu+wtd+wtl+wtr);
					}
					if (row>-1 && row<height && col>-1 && col<width)
						Gtmp[row*width + col] = rgb[1][indx];
				}

			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

			for (rr=4; rr < rr1-4; rr++)
				for (cc=4+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < cc1-4; cc+=2, indx+=2) {


					rbhpfv[indx>>1] = fabsf(fabsf((rgb[1][indx]-rgb[c][indx])-(rgb[1][indx+v4]-rgb[c][indx+v4])) +
										fabsf((rgb[1][indx-v4]-rgb[c][indx-v4])-(rgb[1][indx]-rgb[c][indx])) -
										fabsf((rgb[1][indx-v4]-rgb[c][indx-v4])-(rgb[1][indx+v4]-rgb[c][indx+v4])));
					rbhpfh[indx>>1] = fabsf(fabsf((rgb[1][indx]-rgb[c][indx])-(rgb[1][indx+4]-rgb[c][indx+4])) +
										fabsf((rgb[1][indx-4]-rgb[c][indx-4])-(rgb[1][indx]-rgb[c][indx])) -
										fabsf((rgb[1][indx-4]-rgb[c][indx-4])-(rgb[1][indx+4]-rgb[c][indx+4])));

					/*ghpfv = fabsf(fabsf(rgb[indx][1]-rgb[indx+v4][1])+fabsf(rgb[indx][1]-rgb[indx-v4][1]) -
					 fabsf(rgb[indx+v4][1]-rgb[indx-v4][1]));
					 ghpfh = fabsf(fabsf(rgb[indx][1]-rgb[indx+4][1])+fabsf(rgb[indx][1]-rgb[indx-4][1]) -
					 fabsf(rgb[indx+4][1]-rgb[indx-4][1]));
					 rbhpfv[indx] = fabsf(ghpfv - fabsf(fabsf(rgb[indx][c]-rgb[indx+v4][c])+fabsf(rgb[indx][c]-rgb[indx-v4][c]) -
					 fabsf(rgb[indx+v4][c]-rgb[indx-v4][c])));
					 rbhpfh[indx] = fabsf(ghpfh - fabsf(fabsf(rgb[indx][c]-rgb[indx+4][c])+fabsf(rgb[indx][c]-rgb[indx-4][c]) -
					 fabsf(rgb[indx+4][c]-rgb[indx-4][c])));*/

					glpfv = 0.25*(2.0*rgb[1][indx]+rgb[1][indx+v2]+rgb[1][indx-v2]);
					glpfh = 0.25*(2.0*rgb[1][indx]+rgb[1][indx+2]+rgb[1][indx-2]);
					rblpfv[indx>>1] = eps+fabsf(glpfv - 0.25*(2.0*rgb[c][indx]+rgb[c][indx+v2]+rgb[c][indx-v2]));
					rblpfh[indx>>1] = eps+fabsf(glpfh - 0.25*(2.0*rgb[c][indx]+rgb[c][indx+2]+rgb[c][indx-2]));
					grblpfv[indx>>1] = glpfv + 0.25*(2.0*rgb[c][indx]+rgb[c][indx+v2]+rgb[c][indx-v2]);
					grblpfh[indx>>1] = glpfh + 0.25*(2.0*rgb[c][indx]+rgb[c][indx+2]+rgb[c][indx-2]);
				}
			areawt[0][0]=areawt[1][0]=1;
			areawt[0][2]=areawt[1][2]=1;

			// along line segments, find the point along each segment that minimizes the color variance
			// averaged over the tile; evaluate for up/down and left/right away from R/B grid point
			for (rr=8; rr < rr1-8; rr++)
				for (cc=8+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < cc1-8; cc+=2, indx+=2) {

//					areawt[0][c]=areawt[1][c]=0;

					//in linear interpolation, color differences are a quadratic function of interpolation position;
					//solve for the interpolation position that minimizes color difference variance over the tile

					//vertical
					gdiff=0.3125*(rgb[1][indx+TS]-rgb[1][indx-TS])+0.09375*(rgb[1][indx+TS+1]-rgb[1][indx-TS+1]+rgb[1][indx+TS-1]-rgb[1][indx-TS-1]);
					deltgrb=(rgb[c][indx]-rgb[1][indx]);

					gradwt=fabsf(0.25*rbhpfv[indx>>1]+0.125*(rbhpfv[(indx>>1)+1]+rbhpfv[(indx>>1)-1]) )*(grblpfv[(indx>>1)-v1]+grblpfv[(indx>>1)+v1])/(eps+0.1*grblpfv[(indx>>1)-v1]+rblpfv[(indx>>1)-v1]+0.1*grblpfv[(indx>>1)+v1]+rblpfv[(indx>>1)+v1]);

					coeff[0][0][c] += gradwt*deltgrb*deltgrb;
					coeff[0][1][c] += gradwt*gdiff*deltgrb;
					coeff[0][2][c] += gradwt*gdiff*gdiff;
//					areawt[0][c]+=1;

					//horizontal
					gdiff=0.3125*(rgb[1][indx+1]-rgb[1][indx-1])+0.09375*(rgb[1][indx+1+TS]-rgb[1][indx-1+TS]+rgb[1][indx+1-TS]-rgb[1][indx-1-TS]);
					deltgrb=(rgb[c][indx]-rgb[1][indx]);

					gradwt=fabsf(0.25*rbhpfh[indx>>1]+0.125*(rbhpfh[(indx>>1)+v1]+rbhpfh[(indx>>1)-v1]) )*(grblpfh[(indx>>1)-1]+grblpfh[(indx>>1)+1])/(eps+0.1*grblpfh[(indx>>1)-1]+rblpfh[(indx>>1)-1]+0.1*grblpfh[(indx>>1)+1]+rblpfh[(indx>>1)+1]);

					coeff[1][0][c] += gradwt*deltgrb*deltgrb;
					coeff[1][1][c] += gradwt*gdiff*deltgrb;
					coeff[1][2][c] += gradwt*gdiff*gdiff;
//					areawt[1][c]+=1;

					//	In Mathematica,
					//  f[x_]=Expand[Total[Flatten[
					//  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
					//  extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
				}

			//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
			/*
			for (rr=4; rr < rr1-4; rr++)
				for (cc=4+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < cc1-4; cc+=2, indx+=2) {


					rbhpfv[indx] = SQR(fabs((rgb[indx][1]-rgb[indx][c])-(rgb[indx+v4][1]-rgb[indx+v4][c])) +
										fabs((rgb[indx-v4][1]-rgb[indx-v4][c])-(rgb[indx][1]-rgb[indx][c])) -
										fabs((rgb[indx-v4][1]-rgb[indx-v4][c])-(rgb[indx+v4][1]-rgb[indx+v4][c])));
					rbhpfh[indx] = SQR(fabs((rgb[indx][1]-rgb[indx][c])-(rgb[indx+4][1]-rgb[indx+4][c])) +
										fabs((rgb[indx-4][1]-rgb[indx-4][c])-(rgb[indx][1]-rgb[indx][c])) -
										fabs((rgb[indx-4][1]-rgb[indx-4][c])-(rgb[indx+4][1]-rgb[indx+4][c])));


					glpfv = 0.25*(2*rgb[indx][1]+rgb[indx+v2][1]+rgb[indx-v2][1]);
					glpfh = 0.25*(2*rgb[indx][1]+rgb[indx+2][1]+rgb[indx-2][1]);
					rblpfv[indx] = eps+fabs(glpfv - 0.25*(2*rgb[indx][c]+rgb[indx+v2][c]+rgb[indx-v2][c]));
					rblpfh[indx] = eps+fabs(glpfh - 0.25*(2*rgb[indx][c]+rgb[indx+2][c]+rgb[indx-2][c]));
					grblpfv[indx] = glpfv + 0.25*(2*rgb[indx][c]+rgb[indx+v2][c]+rgb[indx-v2][c]);
					grblpfh[indx] = glpfh + 0.25*(2*rgb[indx][c]+rgb[indx+2][c]+rgb[indx-2][c]);
				}

			for (c=0;c<3;c++) {areawt[0][c]=areawt[1][c]=0;}

			// along line segments, find the point along each segment that minimizes the color variance
			// averaged over the tile; evaluate for up/down and left/right away from R/B grid point
			for (rr=rrmin+8; rr < rrmax-8; rr++)
				for (cc=ccmin+8+(FC(rr,2)&1), indx=rr*TS+cc, c = FC(rr,cc); cc < ccmax-8; cc+=2, indx+=2) {

					if (rgb[indx][c]>0.8*clip_pt || Gtmp[indx]>0.8*clip_pt) continue;

					//in linear interpolation, color differences are a quadratic function of interpolation position;
					//solve for the interpolation position that minimizes color difference variance over the tile

					//vertical
					gdiff=0.3125*(rgb[indx+TS][1]-rgb[indx-TS][1])+0.09375*(rgb[indx+TS+1][1]-rgb[indx-TS+1][1]+rgb[indx+TS-1][1]-rgb[indx-TS-1][1]);
					deltgrb=(rgb[indx][c]-rgb[indx][1])-0.5*((rgb[indx-v4][c]-rgb[indx-v4][1])+(rgb[indx+v4][c]-rgb[indx+v4][1]));

					gradwt=fabs(0.25*rbhpfv[indx]+0.125*(rbhpfv[indx+2]+rbhpfv[indx-2]) );// *(grblpfv[indx-v2]+grblpfv[indx+v2])/(eps+0.1*grblpfv[indx-v2]+rblpfv[indx-v2]+0.1*grblpfv[indx+v2]+rblpfv[indx+v2]);
					if (gradwt>eps) {
					coeff[0][0][c] += gradwt*deltgrb*deltgrb;
					coeff[0][1][c] += gradwt*gdiff*deltgrb;
					coeff[0][2][c] += gradwt*gdiff*gdiff;
					areawt[0][c]++;
					}

					//horizontal
					gdiff=0.3125*(rgb[indx+1][1]-rgb[indx-1][1])+0.09375*(rgb[indx+1+TS][1]-rgb[indx-1+TS][1]+rgb[indx+1-TS][1]-rgb[indx-1-TS][1]);
					deltgrb=(rgb[indx][c]-rgb[indx][1])-0.5*((rgb[indx-4][c]-rgb[indx-4][1])+(rgb[indx+4][c]-rgb[indx+4][1]));

					gradwt=fabs(0.25*rbhpfh[indx]+0.125*(rbhpfh[indx+v2]+rbhpfh[indx-v2]) );// *(grblpfh[indx-2]+grblpfh[indx+2])/(eps+0.1*grblpfh[indx-2]+rblpfh[indx-2]+0.1*grblpfh[indx+2]+rblpfh[indx+2]);
					if (gradwt>eps) {
					coeff[1][0][c] += gradwt*deltgrb*deltgrb;
					coeff[1][1][c] += gradwt*gdiff*deltgrb;
					coeff[1][2][c] += gradwt*gdiff*gdiff;
					areawt[1][c]++;
					}

					//	In Mathematica,
					//  f[x_]=Expand[Total[Flatten[
					//  ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
					//  extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
				}*/
			for (c=0; c<3; c+=2){
				for (j=0; j<2; j++) {// vert/hor
					//printf("hblock %d vblock %d j %d c %d areawt %d \n",hblock,vblock,j,c,areawt[j][c]);
					//printf("hblock %d vblock %d j %d c %d areawt %d ",hblock,vblock,j,c,areawt[j][c]);

					if (areawt[j][c]>0 && coeff[j][2][c]>eps2) {
						CAshift[j][c]=coeff[j][1][c]/coeff[j][2][c];
						blockwt[vblock*hblsz+hblock]= areawt[j][c];//*coeff[j][2][c]/(eps+coeff[j][0][c]) ;
					} else {
						CAshift[j][c]=17.0;
						blockwt[vblock*hblsz+hblock]=0;
					}
					//if (c==0 && j==0) printf("vblock= %d hblock= %d denom= %f areawt= %d \n",vblock,hblock,coeff[j][2][c],areawt[j][c]);

					//printf("%f  \n",CAshift[j][c]);

					//data structure = CAshift[vert/hor][color]
					//j=0=vert, 1=hor

					//offset gives NW corner of square containing the min; j=0=vert, 1=hor

					if (fabsf(CAshift[j][c])<2.0f) {
						blockavethr[j][c] += CAshift[j][c];
						blocksqavethr[j][c] += SQR(CAshift[j][c]);
						blockdenomthr[j][c] += 1;
					}
				}//vert/hor
			}//color

			/* CAshift[j][c] are the locations
			 that minimize color difference variances;
			 This is the approximate _optical_ location of the R/B pixels */

			for (c=0; c<3; c+=2) {
				//evaluate the shifts to the location that minimizes CA within the tile
				blockshifts[(vblock)*hblsz+hblock][c][0]=(CAshift[0][c]); //vert CA shift for R/B
				blockshifts[(vblock)*hblsz+hblock][c][1]=(CAshift[1][c]); //hor CA shift for R/B
				//data structure: blockshifts[blocknum][R/B][v/h]
				//if (c==0) printf("vblock= %d hblock= %d blockshiftsmedian= %f \n",vblock,hblock,blockshifts[(vblock)*hblsz+hblock][c][0]);
			}
			if(plistener) {
				progresscounter++;
				if(progresscounter % 8 == 0)
#pragma omp critical
				 {
					progress+=(double)(8.0*(TS-border2)*(TS-border2))/(2*height*width);
					if (progress>1.0)
					{
						progress=1.0;
					}
					plistener->setProgress(progress);
				}
			}

		}

	//end of diagnostic pass
#pragma omp critical
{
	for (j=0; j<2; j++)
		for (c=0; c<3; c+=2) {
			blockdenom[j][c] += blockdenomthr[j][c];
			blocksqave[j][c] += blocksqavethr[j][c];
			blockave[j][c]   += blockavethr[j][c];
		}
}
#pragma omp barrier

#pragma omp single
{
	for (j=0; j<2; j++)
		for (c=0; c<3; c+=2) {
			if (blockdenom[j][c]) {
				blockvar[j][c] = blocksqave[j][c]/blockdenom[j][c]-SQR(blockave[j][c]/blockdenom[j][c]);
			} else {
				processpasstwo = false;
				printf ("blockdenom vanishes \n");
				break;
			}
		}
}
	//printf ("tile variances %f %f %f %f \n",blockvar[0][0],blockvar[1][0],blockvar[0][2],blockvar[1][2] );


	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	//now prepare for CA correction pass
	//first, fill border blocks of blockshift array
	if(processpasstwo) {
#pragma omp sections
{
#pragma omp section
		for (vblock=1; vblock<vblsz-1; vblock++) {//left and right sides
			for (c=0; c<3; c+=2) {
				for (i=0; i<2; i++) {
					blockshifts[vblock*hblsz][c][i]=blockshifts[(vblock)*hblsz+2][c][i];
					blockshifts[vblock*hblsz+hblsz-1][c][i]=blockshifts[(vblock)*hblsz+hblsz-3][c][i];
				}
			}
		}
#pragma omp section
		for (hblock=0; hblock<hblsz; hblock++) {//top and bottom sides
			for (c=0; c<3; c+=2) {
				for (i=0; i<2; i++) {
					blockshifts[hblock][c][i]=blockshifts[2*hblsz+hblock][c][i];
					blockshifts[(vblsz-1)*hblsz+hblock][c][i]=blockshifts[(vblsz-3)*hblsz+hblock][c][i];
				}
			}
		}
}
		//end of filling border pixels of blockshift array
#pragma omp barrier

		//initialize fit arrays
		double	polymatthr[3][2][256], shiftmatthr[3][2][16];
		float bstemp[3][2];
		//initialize fit arrays
		for (i=0; i<256; i++) {polymatthr[0][0][i] = polymatthr[0][1][i] = polymatthr[2][0][i] = polymatthr[2][1][i] = 0;}
		for (i=0; i<16; i++) {shiftmatthr[0][0][i] = shiftmatthr[0][1][i] = shiftmatthr[2][0][i] = shiftmatthr[2][1][i] = 0;}
#pragma omp for nowait	// nowait to allow the first ready thread to start the critical section as soon as possible
		for (vblock=1; vblock<vblsz-1; vblock++)
			for (hblock=1; hblock<hblsz-1; hblock++) {
				// block 3x3 median of blockshifts for robustness
				for (c=0; c<3; c+=2) {
					for (dir=0; dir<2; dir++) {
						p[0] = blockshifts[(vblock-1)*hblsz+hblock-1][c][dir];
						p[1] = blockshifts[(vblock-1)*hblsz+hblock][c][dir];
						p[2] = blockshifts[(vblock-1)*hblsz+hblock+1][c][dir];
						p[3] = blockshifts[(vblock)*hblsz+hblock-1][c][dir];
						p[4] = blockshifts[(vblock)*hblsz+hblock][c][dir];
						p[5] = blockshifts[(vblock)*hblsz+hblock+1][c][dir];
						p[6] = blockshifts[(vblock+1)*hblsz+hblock-1][c][dir];
						p[7] = blockshifts[(vblock+1)*hblsz+hblock][c][dir];
						p[8] = blockshifts[(vblock+1)*hblsz+hblock+1][c][dir];
						PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
						PIX_SORT(p[0],p[1]); PIX_SORT(p[3],p[4]); PIX_SORT(p[6],p[7]);
						PIX_SORT(p[1],p[2]); PIX_SORT(p[4],p[5]); PIX_SORT(p[7],p[8]);
						PIX_SORT(p[0],p[3]); PIX_SORT(p[5],p[8]); PIX_SORT(p[4],p[7]);
						PIX_SORT(p[3],p[6]); PIX_SORT(p[1],p[4]); PIX_SORT(p[2],p[5]);
						PIX_SORT(p[4],p[7]); PIX_SORT(p[4],p[2]); PIX_SORT(p[6],p[4]);
						PIX_SORT(p[4],p[2]);
						bstemp[c][dir] = p[4];
						//if (c==0 && dir==0) printf("vblock= %d hblock= %d blockshiftsmedian= %f \n",vblock,hblock,p[4]);
					}

					//if (verbose) fprintf (stderr,_("tile vshift hshift (%d %d %4f %4f)...\n"),vblock, hblock, blockshifts[(vblock)*hblsz+hblock][c][0], blockshifts[(vblock)*hblsz+hblock][c][1]);

					//now prepare coefficient matrix; use only data points within two std devs of zero
					if (SQR(bstemp[c][0])>4.0*blockvar[0][c] || SQR(bstemp[c][1])>4.0*blockvar[1][c])
						continue;
					numbloxthr[c]++;
					for (dir=0; dir<2; dir++) {
						for (i=0; i<polyord; i++) {
							for (j=0; j<polyord; j++) {
								for (m=0; m<polyord; m++)
									for (n=0; n<polyord; n++) {
										polymatthr[c][dir][numpar*(polyord*i+j)+(polyord*m+n)] += (float)pow((double)vblock,i+m)*pow((double)hblock,j+n)*blockwt[vblock*hblsz+hblock];
									}
								shiftmatthr[c][dir][(polyord*i+j)] += (float)pow((double)vblock,i)*pow((double)hblock,j)*bstemp[c][dir]*blockwt[vblock*hblsz+hblock];
							}
							//if (c==0 && dir==0) {printf("i= %d j= %d shiftmat= %f \n",i,j,shiftmat[c][dir][(polyord*i+j)]);}
						}//monomials
					}//dir

				}//c
			}//blocks
#pragma omp critical
{
	// now sum up the per thread vars
	for (i=0; i<256; i++) {
		polymat[0][0][i] += polymatthr[0][0][i];
		polymat[0][1][i] += polymatthr[0][1][i];
		polymat[2][0][i] += polymatthr[2][0][i];
		polymat[2][1][i] += polymatthr[2][1][i];
	}
	for (i=0; i<16; i++) {
		shiftmat[0][0][i] += shiftmatthr[0][0][i];
		shiftmat[0][1][i] += shiftmatthr[0][1][i];
		shiftmat[2][0][i] += shiftmatthr[2][0][i];
		shiftmat[2][1][i] += shiftmatthr[2][1][i];
	}
	numblox[0] += numbloxthr[0];
	numblox[2] += numbloxthr[2];

}
#pragma omp barrier

#pragma omp single
{

		numblox[1]=min(numblox[0],numblox[2]);
		//if too few data points, restrict the order of the fit to linear
		if (numblox[1]<32) {
			polyord=2; numpar=4;
			if (numblox[1]< 10) {

				printf ("numblox = %d \n",numblox[1]);
				processpasstwo = false;
			}
		}
		if(processpasstwo)
			//fit parameters to blockshifts
			for (c=0; c<3; c+=2)
				for (dir=0; dir<2; dir++) {
					res = LinEqSolve(numpar, polymat[c][dir], shiftmat[c][dir], fitparams[c][dir]);
					if (res) {
						printf("CA correction pass failed -- can't solve linear equations for color %d direction %d...\n",c,dir);
						processpasstwo = false;
					}
				}
		}
	//fitparams[polyord*i+j] gives the coefficients of (vblock^i hblock^j) in a polynomial fit for i,j<=4
	}
	//end of initialization for CA correction pass
	//only executed if cared and cablue are zero
}
	// Main algorithm: Tile loop
if(processpasstwo) {
#pragma omp for schedule(dynamic) collapse(2) nowait
	for (top=-border; top < height; top += TS-border2)
		for (left=-border; left < width; left += TS-border2) {
			vblock = ((top+border)/(TS-border2))+1;
			hblock = ((left+border)/(TS-border2))+1;
			int bottom = min(top+TS,height+border);
			int right  = min(left+TS, width+border);
			int rr1 = bottom - top;
			int cc1 = right - left;
			//t1_init = clock();
			if (top<0) {rrmin=border;} else {rrmin=0;}
			if (left<0) {ccmin=border;} else {ccmin=0;}
			if (bottom>height) {rrmax=height-top;} else {rrmax=rr1;}
			if (right>width) {ccmax=width-left;} else {ccmax=cc1;}

			// rgb from input CFA data
			// rgb values should be floating point number between 0 and 1
			// after white balance multipliers are applied

			for (rr=rrmin; rr < rrmax; rr++)
				for (row=rr+top, cc=ccmin; cc < ccmax; cc++) {
					col = cc+left;
					c = FC(rr,cc);
					indx=row*width+col;
					indx1=rr*TS+cc;
					//rgb[indx1][c] = image[indx][c]/65535.0f;
					rgb[c][indx1] = (rawData[row][col])/65535.0f;
					//rgb[indx1][c] = image[indx][c]/65535.0f;//for dcraw implementation

					if ((c&1)==0) rgb[1][indx1] = Gtmp[indx];
				}
			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
			//fill borders
			if (rrmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=ccmin; cc<ccmax; cc++) {
						c = FC(rr,cc);
						rgb[c][rr*TS+cc] = rgb[c][(border2-rr)*TS+cc];
						rgb[1][rr*TS+cc] = rgb[1][(border2-rr)*TS+cc];
					}
			}
			if (rrmax<rr1) {
				for (rr=0; rr<border; rr++)
					for (cc=ccmin; cc<ccmax; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][left+cc])/65535.0f;
						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+left+cc][c])/65535.0f;//for dcraw implementation

						rgb[1][(rrmax+rr)*TS+cc] = Gtmp[(height-rr-2)*width+left+cc];
					}
			}
			if (ccmin>0) {
				for (rr=rrmin; rr<rrmax; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][rr*TS+cc] = rgb[c][rr*TS+border2-cc];
						rgb[1][rr*TS+cc] = rgb[1][rr*TS+border2-cc];
					}
			}
			if (ccmax<cc1) {
				for (rr=rrmin; rr<rrmax; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][rr*TS+ccmax+cc] = (rawData[(top+rr)][(width-cc-2)])/65535.0f;
						//rgb[rr*TS+ccmax+cc][c] = (image[(top+rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation

						rgb[1][rr*TS+ccmax+cc] = Gtmp[(top+rr)*width+(width-cc-2)];
					}
			}

			//also, fill the image corners
			if (rrmin>0 && ccmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rr)*TS+cc] = (rawData[border2-rr][border2-cc])/65535.0f;
						//rgb[(rr)*TS+cc][c] = (rgb[(border2-rr)*TS+(border2-cc)][c]);//for dcraw implementation

						rgb[1][(rr)*TS+cc] = Gtmp[(border2-rr)*width+border2-cc];
					}
			}
			if (rrmax<rr1 && ccmax<cc1) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+ccmax+cc] = (rawData[(height-rr-2)][(width-cc-2)])/65535.0f;
						//rgb[(rrmax+rr)*TS+ccmax+cc][c] = (image[(height-rr-2)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation

						rgb[1][(rrmax+rr)*TS+ccmax+cc] = Gtmp[(height-rr-2)*width+(width-cc-2)];
					}
			}
			if (rrmin>0 && ccmax<cc1) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rr)*TS+ccmax+cc] = (rawData[(border2-rr)][(width-cc-2)])/65535.0f;
						//rgb[(rr)*TS+ccmax+cc][c] = (image[(border2-rr)*width+(width-cc-2)][c])/65535.0f;//for dcraw implementation

						rgb[1][(rr)*TS+ccmax+cc] = Gtmp[(border2-rr)*width+(width-cc-2)];
					}
			}
			if (rrmax<rr1 && ccmin>0) {
				for (rr=0; rr<border; rr++)
					for (cc=0; cc<border; cc++) {
						c=FC(rr,cc);
						rgb[c][(rrmax+rr)*TS+cc] = (rawData[(height-rr-2)][(border2-cc)])/65535.0f;
						//rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+(border2-cc)][c])/65535.0f;//for dcraw implementation

						rgb[1][(rrmax+rr)*TS+cc] = Gtmp[(height-rr-2)*width+(border2-cc)];
					}
			}

			//end of border fill
			// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

			if (!autoCA) {
				//manual CA correction; use red/blue slider values to set CA shift parameters
				for (rr=3; rr < rr1-3; rr++)
					for (row=rr+top, cc=3, indx=rr*TS+cc; cc < cc1-3; cc++, indx++) {
						col = cc+left;
						c = FC(rr,cc);

						if (c!=1) {
							//compute directional weights using image gradients
							wtu=1.0/SQR(eps+fabsf(rgb[1][(rr+1)*TS+cc]-rgb[1][(rr-1)*TS+cc])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr-2)*TS+cc])+fabsf(rgb[1][(rr-1)*TS+cc]-rgb[1][(rr-3)*TS+cc]));
							wtd=1.0/SQR(eps+fabsf(rgb[1][(rr-1)*TS+cc]-rgb[1][(rr+1)*TS+cc])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr+2)*TS+cc])+fabsf(rgb[1][(rr+1)*TS+cc]-rgb[1][(rr+3)*TS+cc]));
							wtl=1.0/SQR(eps+fabsf(rgb[1][(rr)*TS+cc+1]-rgb[1][(rr)*TS+cc-1])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr)*TS+cc-2])+fabsf(rgb[1][(rr)*TS+cc-1]-rgb[1][(rr)*TS+cc-3]));
							wtr=1.0/SQR(eps+fabsf(rgb[1][(rr)*TS+cc-1]-rgb[1][(rr)*TS+cc+1])+fabsf(rgb[c][(rr)*TS+cc]-rgb[c][(rr)*TS+cc+2])+fabsf(rgb[1][(rr)*TS+cc+1]-rgb[1][(rr)*TS+cc+3]));

							//store in rgb array the interpolated G value at R/B grid points using directional weighted average
							rgb[1][indx]=(wtu*rgb[1][indx-v1]+wtd*rgb[1][indx+v1]+wtl*rgb[1][indx-1]+wtr*rgb[1][indx+1])/(wtu+wtd+wtl+wtr);
						}
						if (row>-1 && row<height && col>-1 && col<width)
							Gtmp[row*width + col] = rgb[1][indx];
					}
				float hfrac = -((float)(hblock-0.5)/(hblsz-2) - 0.5);
				float vfrac = -((float)(vblock-0.5)/(vblsz-2) - 0.5)*height/width;
				blockshifts[(vblock)*hblsz+hblock][0][0] = 2*vfrac*cared;
				blockshifts[(vblock)*hblsz+hblock][0][1] = 2*hfrac*cared;
				blockshifts[(vblock)*hblsz+hblock][2][0] = 2*vfrac*cablue;
				blockshifts[(vblock)*hblsz+hblock][2][1] = 2*hfrac*cablue;
			} else {
				//CA auto correction; use CA diagnostic pass to set shift parameters
				blockshifts[(vblock)*hblsz+hblock][0][0] = blockshifts[(vblock)*hblsz+hblock][0][1] = 0;
				blockshifts[(vblock)*hblsz+hblock][2][0] = blockshifts[(vblock)*hblsz+hblock][2][1] = 0;
				for (i=0; i<polyord; i++)
					for (j=0; j<polyord; j++) {
						//printf("i= %d j= %d polycoeff= %f \n",i,j,fitparams[0][0][polyord*i+j]);
						blockshifts[(vblock)*hblsz+hblock][0][0] += (float)pow((float)vblock,i)*pow((float)hblock,j)*fitparams[0][0][polyord*i+j];
						blockshifts[(vblock)*hblsz+hblock][0][1] += (float)pow((float)vblock,i)*pow((float)hblock,j)*fitparams[0][1][polyord*i+j];
						blockshifts[(vblock)*hblsz+hblock][2][0] += (float)pow((float)vblock,i)*pow((float)hblock,j)*fitparams[2][0][polyord*i+j];
						blockshifts[(vblock)*hblsz+hblock][2][1] += (float)pow((float)vblock,i)*pow((float)hblock,j)*fitparams[2][1][polyord*i+j];
					}
				blockshifts[(vblock)*hblsz+hblock][0][0] = LIM(blockshifts[(vblock)*hblsz+hblock][0][0], -bslim, bslim);
				blockshifts[(vblock)*hblsz+hblock][0][1] = LIM(blockshifts[(vblock)*hblsz+hblock][0][1], -bslim, bslim);
				blockshifts[(vblock)*hblsz+hblock][2][0] = LIM(blockshifts[(vblock)*hblsz+hblock][2][0], -bslim, bslim);
				blockshifts[(vblock)*hblsz+hblock][2][1] = LIM(blockshifts[(vblock)*hblsz+hblock][2][1], -bslim, bslim);
			}//end of setting CA shift parameters

			//printf("vblock= %d hblock= %d vshift= %f hshift= %f \n",vblock,hblock,blockshifts[(vblock)*hblsz+hblock][0][0],blockshifts[(vblock)*hblsz+hblock][0][1]);

			for (c=0; c<3; c+=2) {

				//some parameters for the bilinear interpolation
				shiftvfloor[c]=floor((float)blockshifts[(vblock)*hblsz+hblock][c][0]);
				shiftvceil[c]=ceil((float)blockshifts[(vblock)*hblsz+hblock][c][0]);
				shiftvfrac[c]=blockshifts[(vblock)*hblsz+hblock][c][0]-shiftvfloor[c];

				shifthfloor[c]=floor((float)blockshifts[(vblock)*hblsz+hblock][c][1]);
				shifthceil[c]=ceil((float)blockshifts[(vblock)*hblsz+hblock][c][1]);
				shifthfrac[c]=blockshifts[(vblock)*hblsz+hblock][c][1]-shifthfloor[c];


				if (blockshifts[(vblock)*hblsz+hblock][c][0]>0) {
					GRBdir[0][c] = 1;
				} else {
					GRBdir[0][c] = -1;
				}
				if (blockshifts[(vblock)*hblsz+hblock][c][1]>0) {
					GRBdir[1][c] = 1;
				} else {
					GRBdir[1][c] = -1;
				}

			}


			for (rr=4; rr < rr1-4; rr++)
				for (cc=4+(FC(rr,2)&1), c = FC(rr,cc); cc < cc1-4; cc+=2) {
					//perform CA correction using color ratios or color differences

					Ginthfloor=(1-shifthfrac[c])*rgb[1][(rr+shiftvfloor[c])*TS+cc+shifthfloor[c]]+(shifthfrac[c])*rgb[1][(rr+shiftvfloor[c])*TS+cc+shifthceil[c]];
					Ginthceil=(1-shifthfrac[c])*rgb[1][(rr+shiftvceil[c])*TS+cc+shifthfloor[c]]+(shifthfrac[c])*rgb[1][(rr+shiftvceil[c])*TS+cc+shifthceil[c]];
					//Gint is bilinear interpolation of G at CA shift point
					Gint=(1-shiftvfrac[c])*Ginthfloor+(shiftvfrac[c])*Ginthceil;

					//determine R/B at grid points using color differences at shift point plus interpolated G value at grid point
					//but first we need to interpolate G-R/G-B to grid points...
					grbdiff[((rr)*TS+cc)>>1]=Gint-rgb[c][(rr)*TS+cc];
					gshift[((rr)*TS+cc)>>1]=Gint;
				}

			for (rr=8; rr < rr1-8; rr++)
				for (cc=8+(FC(rr,2)&1), c = FC(rr,cc), indx=rr*TS+cc; cc < cc1-8; cc+=2, indx+=2) {

					//if (rgb[indx][c]>clip_pt || Gtmp[indx]>clip_pt) continue;

					grbdiffold = rgb[1][indx]-rgb[c][indx];

					//interpolate color difference from optical R/B locations to grid locations
					grbdiffinthfloor=(1.0f-shifthfrac[c]/2.0f)*grbdiff[indx>>1]+(shifthfrac[c]/2.0f)*grbdiff[(indx-2*GRBdir[1][c])>>1];
					grbdiffinthceil=(1.0f-shifthfrac[c]/2.0f)*grbdiff[((rr-2*GRBdir[0][c])*TS+cc)>>1]+(shifthfrac[c]/2.0f)*grbdiff[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1];
					//grbdiffint is bilinear interpolation of G-R/G-B at grid point
					grbdiffint=(1.0f-shiftvfrac[c]/2.0f)*grbdiffinthfloor+(shiftvfrac[c]/2.0f)*grbdiffinthceil;

					//now determine R/B at grid points using interpolated color differences and interpolated G value at grid point
					RBint=rgb[1][indx]-grbdiffint;

					if (fabsf(RBint-rgb[c][indx])<0.25f*(RBint+rgb[c][indx])) {
						if (fabsf(grbdiffold)>fabsf(grbdiffint) ) {
							rgb[c][indx]=RBint;
						}
					} else {

						//gradient weights using difference from G at CA shift points and G at grid points
						p[0]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[indx>>1]));
						p[1]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[(indx-2*GRBdir[1][c])>>1]));
						p[2]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[((rr-2*GRBdir[0][c])*TS+cc)>>1]));
						p[3]=1.0f/(eps+fabsf(rgb[1][indx]-gshift[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1]));

						grbdiffint = (p[0]*grbdiff[indx>>1]+p[1]*grbdiff[(indx-2*GRBdir[1][c])>>1]+
									  p[2]*grbdiff[((rr-2*GRBdir[0][c])*TS+cc)>>1]+p[3]*grbdiff[((rr-2*GRBdir[0][c])*TS+cc-2*GRBdir[1][c])>>1])/(p[0]+p[1]+p[2]+p[3]);

						//now determine R/B at grid points using interpolated color differences and interpolated G value at grid point
						if (fabsf(grbdiffold)>fabsf(grbdiffint) ) {
							rgb[c][indx]=rgb[1][indx]-grbdiffint;
						}
					}

					//if color difference interpolation overshot the correction, just desaturate
					if (grbdiffold*grbdiffint<0) {
						rgb[c][indx]=rgb[1][indx]-0.5f*(grbdiffold+grbdiffint);
					}
				}

			// copy CA corrected results to temporary image matrix
			for (rr=border; rr < rr1-border; rr++){
				c = FC(rr+top, left + border+FC(rr+top,2)&1);
				for (row=rr+top, cc=border+(FC(rr,2)&1),indx=(row*width+cc+left)>>1; cc < cc1-border; cc+=2,indx++) {
					col = cc + left;
					RawDataTmp[indx] = 65535.0f*rgb[c][(rr)*TS+cc] + 0.5f;
					//image[indx][c] = CLIP((int)(65535.0*rgb[(rr)*TS+cc][c] + 0.5));//for dcraw implementation
				}
			}

			if(plistener) {
				progresscounter++;
				if(progresscounter % 8 == 0)
#pragma omp critical
				 {
					progress+=(double)(8.0*(TS-border2)*(TS-border2))/(2*height*width);
					if (progress>1.0)
					{
						progress=1.0;
					}
					plistener->setProgress(progress);
				}
			}

		}

#pragma omp barrier
// copy temporary image matrix back to image matrix
#pragma omp for
	for(row=0;row<height;row++)
		for(col=0+(FC(row,0)&1),indx=(row*width+col)>>1;col<width;col+=2,indx++)
			rawData[row][col] = RawDataTmp[indx];

	}
	// clean up
	free(buffer);


}

	free(Gtmp);
	free(buffer1);
	free(RawDataTmp);
	if(plistener)
		plistener->setProgress(1.0);

#undef TS
#undef TSH
#undef PIX_SORT
}