Futher speedup for auto ca correction and cleaned code

This commit is contained in:
heckflosse
2016-03-01 00:51:19 +01:00
parent cbc88a5804
commit 777b08f7f6
2 changed files with 184 additions and 90 deletions

View File

@@ -108,6 +108,14 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
//end of linear equation solver
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
inline void pixSort(float &a, float &b) {
if (a > b) {
float temp = a;
a = b;
b = temp;
}
}
}
using namespace std;
@@ -122,8 +130,6 @@ BENCHFUN
//shifts to location of vertical and diagonal neighbors
constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, v4 = 4 * ts; //, p1=-ts+1, p2=-2*ts+2, p3=-3*ts+3, m1=ts+1, m2=2*ts+2, m3=3*ts+3;
#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
// Test for RGB cfa
for(int i = 0; i < 2; i++)
for(int j = 0; j < 2; j++)
@@ -152,10 +158,6 @@ BENCHFUN
// Because we can't break parallel processing, we need a switch do handle the errors
bool processpasstwo = true;
//block CA shift values and weight assigned to block
float (*blockwt); // vblsz*hblsz
float (*blockshifts)[2][2]; // vblsz*hblsz*3*2
constexpr int border = 8;
constexpr int border2 = 16;
@@ -164,11 +166,11 @@ BENCHFUN
const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1);
const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1);
char *buffer1 = (char *) malloc(vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
memset(buffer1, 0, vblsz * hblsz * (2 * 2 + 1)*sizeof(float));
// block CA shifts
blockwt = (float (*)) (buffer1);
blockshifts = (float (*)[2][2]) (buffer1 + (vblsz * hblsz * sizeof(float)));
char *buffer1 = (char *) calloc(vblsz * hblsz * (2 * 2 + 1), sizeof(float));
//block CA shift values and weight assigned to block
float *blockwt = (float*)buffer1;
float (*blockshifts)[2][2] = (float (*)[2][2])(buffer1 + (vblsz * hblsz * sizeof(float)));
double fitparams[2][2][16];
@@ -177,8 +179,7 @@ BENCHFUN
constexpr float eps = 1e-5f, eps2 = 1e-10f; //tolerance to avoid dividing by zero
#pragma omp parallel shared(Gtmp,blockave,blocksqave,blockdenom,blockvar,blockwt,blockshifts,fitparams,polyord,numpar)
#pragma omp parallel
{
int progresscounter = 0;
@@ -197,12 +198,10 @@ BENCHFUN
//per thread data for evaluation of block CA shift variance
float blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
/* assign working space; this would not be necessary
if the algorithm is part of the larger pre-interpolation processing */
char *buffer = (char *) malloc(3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 64);
memset(buffer, 0, 3 * sizeof(float)*ts * ts + 6 * sizeof(float)*ts * tsh + 8 * 64 + 64);
char *data = buffer;
// assign working space
constexpr int buffersize = 3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 63;
char *buffer = (char *) calloc(buffersize, 1);
char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
@@ -231,7 +230,7 @@ BENCHFUN
if (autoCA) {
// Main algorithm: Tile loop
// Main algorithm: Tile loop calculating correction parameters per tile
#pragma omp for collapse(2) schedule(dynamic) nowait
for (int top = -border ; top < height; top += ts - border2)
for (int left = -border; left < width; left += ts - border2) {
@@ -247,7 +246,7 @@ BENCHFUN
const int ccmax = right > width ? width - left : cc1;
// rgb from input CFA data
// rgb values should be floating point number between 0 and 1
// rgb values should be floating point numbers between 0 and 1
// after white balance multipliers are applied
for (int rr = rrmin; rr < rrmax; rr++)
@@ -328,41 +327,51 @@ BENCHFUN
//end of border fill
// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for (int dir = 0; dir < 2; dir++)
for (int k = 0; k < 3; k++)
for (int c = 0; c < 2; c++) {
coeff[dir][k][c] = 0;
}
//end of initialization
for (int rr = 3; rr < rr1 - 3; rr++)
for (int row = rr + top, cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
int col = cc + left;
int c = FC(rr, cc);
#ifdef __SSE2__
vfloat onev = F2V(1.f);
vfloat epsv = F2V(eps);
#endif
for (int rr = 3; rr < rr1 - 3; rr++) {
int row = rr + top;
int cc = 3 + (FC(rr,3) & 1);
int indx = rr * ts + cc;
int c = FC(rr,cc);
#ifdef __SSE2__
for (; cc < cc1 - 9; cc+=8, indx+=8) {
//compute directional weights using image gradients
vfloat wtuv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx - v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - v2])) + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx - v3])));
vfloat wtdv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx + v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + v2])) + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx + v3])));
vfloat wtlv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - 2])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
vfloat wtrv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx + 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + 2])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
if (c != 1) {
//compute directional weights using image gradients
float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
}
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
}
#endif
for (; cc < cc1 - 3; cc+=2, indx+=2) {
//compute directional weights using image gradients
float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3]));
float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3]));
float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3]));
float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3]));
if (row > -1 && row < height && col > -1 && col < width) {
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
}
if (row > -1 && row < height) {
for(int col = max(left + 3, 0), indx = rr * ts + 3 - (left < 0 ? (left+3) : 0); col < min(cc1 + left - 3, width); col++, indx++) {
Gtmp[row * width + col] = rgb[1][indx];
}
}
}
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#ifdef __SSE2__
vfloat epsv = F2V(eps);
vfloat zd25v = F2V(0.25f);
#endif
for (int rr = 4; rr < rr1 - 4; rr++) {
@@ -381,18 +390,18 @@ BENCHFUN
STVFU(rbhpfh[indx >> 1], temp2v);
//low and high pass 1D filters of G in vertical/horizontal directions
vfloat glpfv = zd25v * (vmul2f(rgb1v) + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
vfloat glpfh = zd25v * (vmul2f(rgb1v) + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfv - zd25v * (vmul2f(rgbcv) + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))));
STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfh - zd25v * (vmul2f(rgbcv) + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))));
STVFU(grblpfv[indx >> 1], glpfv + zd25v * (vmul2f(LC2VFU(rgb[c][indx])) + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])));
STVFU(grblpfh[indx >> 1], glpfh + zd25v * (vmul2f(LC2VFU(rgb[c][indx])) + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])));
rgb1v = vmul2f(rgb1v);
vfloat glpfvv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2]));
vfloat glpfhv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2]));
rgbcv = vmul2f(rgbcv);
STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfvv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))));
STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfhv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))));
STVFU(grblpfv[indx >> 1], glpfvv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])));
STVFU(grblpfh[indx >> 1], glpfhv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])));
}
#endif
for (; cc < cc1 - 4; cc += 2, indx += 2) {
rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + v4] - rgb[c][indx + v4])) +
fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx] - rgb[c][indx])) -
fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx + v4] - rgb[c][indx + v4])));
@@ -410,10 +419,72 @@ BENCHFUN
}
}
for (int dir = 0; dir < 2; dir++) {
for (int k = 0; k < 3; k++) {
for (int c = 0; c < 2; c++) {
coeff[dir][k][c] = 0;
}
}
}
#ifdef __SSE2__
vfloat zd3125v = F2V(0.3125f);
vfloat zd09375v = F2V(0.09375f);
vfloat zd1v = F2V(0.1f);
vfloat zd125v = F2V(0.125f);
#endif
// along line segments, find the point along each segment that minimizes the colour variance
// averaged over the tile; evaluate for up/down and left/right away from R/B grid point
for (int rr = 8; rr < rr1 - 8; rr++)
for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = FC(rr, cc); cc < cc1 - 8; cc += 2, indx += 2) {
for (int rr = 8; rr < rr1 - 8; rr++) {
int cc = 8 + (FC(rr, 2) & 1);
int indx = rr * ts + cc;
int c = FC(rr, cc);
#ifdef __SSE2__
vfloat coeff00v = ZEROV;
vfloat coeff01v = ZEROV;
vfloat coeff02v = ZEROV;
vfloat coeff10v = ZEROV;
vfloat coeff11v = ZEROV;
vfloat coeff12v = ZEROV;
for (; cc < cc1 - 14; cc += 8, indx += 8) {
//in linear interpolation, colour differences are a quadratic function of interpolation position;
//solve for the interpolation position that minimizes colour difference variance over the tile
//vertical
vfloat gdiffv = zd3125v * (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + zd09375v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts + 1]) + LC2VFU(rgb[1][indx + ts - 1]) - LC2VFU(rgb[1][indx - ts - 1]));
vfloat deltgrbv = LC2VFU(rgb[c][indx]) - LC2VFU(rgb[1][indx]);
vfloat gradwtv = vabsf(zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) - v1]) + zd1v * LVFU(grblpfv[(indx >> 1) + v1]) + LVFU(rblpfv[(indx >> 1) + v1]));
coeff00v += gradwtv * deltgrbv * deltgrbv;
coeff01v += gradwtv * gdiffv * deltgrbv;
coeff02v += gradwtv * gdiffv * gdiffv;
//horizontal
gdiffv = zd3125v * (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + zd09375v * (LC2VFU(rgb[1][indx + 1 + ts]) - LC2VFU(rgb[1][indx - 1 + ts]) + LC2VFU(rgb[1][indx + 1 - ts]) - LC2VFU(rgb[1][indx - 1 - ts]));
gradwtv = vabsf(zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) - 1]) + zd1v * LVFU(grblpfh[(indx >> 1) + 1]) + LVFU(rblpfh[(indx >> 1) + 1]));
coeff10v += gradwtv * deltgrbv * deltgrbv;
coeff11v += gradwtv * gdiffv * deltgrbv;
coeff12v += gradwtv * gdiffv * gdiffv;
// In Mathematica,
// f[x_]=Expand[Total[Flatten[
// ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
// extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
}
coeff[0][0][c>>1] += vhadd(coeff00v);
coeff[0][1][c>>1] += vhadd(coeff01v);
coeff[0][2][c>>1] += vhadd(coeff02v);
coeff[1][0][c>>1] += vhadd(coeff10v);
coeff[1][1][c>>1] += vhadd(coeff11v);
coeff[1][2][c>>1] += vhadd(coeff12v);
#endif
for (; cc < cc1 - 8; cc += 2, indx += 2) {
//in linear interpolation, colour differences are a quadratic function of interpolation position;
//solve for the interpolation position that minimizes colour difference variance over the tile
@@ -442,7 +513,7 @@ BENCHFUN
// ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]];
// extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2]
}
}
for (int c = 0; c < 2; c++) {
for (int dir = 0; dir < 2; dir++) { // vert/hor
@@ -462,7 +533,6 @@ BENCHFUN
//dir : 0=vert, 1=hor
//offset gives NW corner of square containing the min; dir : 0=vert, 1=hor
if (fabsf(CAshift[dir][c]) < 2.0f) {
blockavethr[dir][c] += CAshift[dir][c];
blocksqavethr[dir][c] += SQR(CAshift[dir][c]);
@@ -543,7 +613,7 @@ BENCHFUN
//end of filling border pixels of blockshift array
//initialize fit arrays
double polymat[2][2][256], shiftmat[2][2][16];
double polymat[2][2][256], shiftmat[2][2][16];
for (int i = 0; i < 256; i++) {
polymat[0][0][i] = polymat[0][1][i] = polymat[1][0][i] = polymat[1][1][i] = 0;
@@ -562,7 +632,7 @@ BENCHFUN
float bstemp[2];
for (int dir = 0; dir < 2; dir++) {
//temporary storage for median filter
float temp, p[9];
float p[9];
p[0] = blockshifts[(vblock - 1) * hblsz + hblock - 1][c][dir];
p[1] = blockshifts[(vblock - 1) * hblsz + hblock][c][dir];
p[2] = blockshifts[(vblock - 1) * hblsz + hblock + 1][c][dir];
@@ -572,29 +642,29 @@ BENCHFUN
p[6] = blockshifts[(vblock + 1) * hblsz + hblock - 1][c][dir];
p[7] = blockshifts[(vblock + 1) * hblsz + hblock][c][dir];
p[8] = blockshifts[(vblock + 1) * hblsz + hblock + 1][c][dir];
PIX_SORT(p[1], p[2]);
PIX_SORT(p[4], p[5]);
PIX_SORT(p[7], p[8]);
PIX_SORT(p[0], p[1]);
PIX_SORT(p[3], p[4]);
PIX_SORT(p[6], p[7]);
PIX_SORT(p[1], p[2]);
PIX_SORT(p[4], p[5]);
PIX_SORT(p[7], p[8]);
PIX_SORT(p[0], p[3]);
PIX_SORT(p[5], p[8]);
PIX_SORT(p[4], p[7]);
PIX_SORT(p[3], p[6]);
PIX_SORT(p[1], p[4]);
PIX_SORT(p[2], p[5]);
PIX_SORT(p[4], p[7]);
PIX_SORT(p[4], p[2]);
PIX_SORT(p[6], p[4]);
PIX_SORT(p[4], p[2]);
pixSort(p[1], p[2]);
pixSort(p[4], p[5]);
pixSort(p[7], p[8]);
pixSort(p[0], p[1]);
pixSort(p[3], p[4]);
pixSort(p[6], p[7]);
pixSort(p[1], p[2]);
pixSort(p[4], p[5]);
pixSort(p[7], p[8]);
pixSort(p[0], p[3]);
pixSort(p[5], p[8]);
pixSort(p[4], p[7]);
pixSort(p[3], p[6]);
pixSort(p[1], p[4]);
pixSort(p[2], p[5]);
pixSort(p[4], p[7]);
pixSort(p[4], p[2]);
pixSort(p[6], p[4]);
pixSort(p[4], p[2]);
bstemp[dir] = p[4];
}
//now prepare coefficient matrix; use only data points within two std devs of zero
//now prepare coefficient matrix; use only data points within caautostrength/2 std devs of zero
if (SQR(bstemp[0]) > caautostrength * blockvar[0][c] || SQR(bstemp[1]) > caautostrength * blockvar[1][c]) {
continue;
}
@@ -623,6 +693,7 @@ BENCHFUN
}//dir
}//c
}//blocks
numblox[1] = min(numblox[0], numblox[1]);
//if too few data points, restrict the order of the fit to linear
@@ -841,31 +912,56 @@ BENCHFUN
}
for (int rr = 4; rr < rr1 - 4; rr++)
for (int cc = 4 + (FC(rr, 2) & 1), c = FC(rr, cc); cc < cc1 - 4; cc += 2) {
for (int rr = 4; rr < rr1 - 4; rr++) {
int cc = 4 + (FC(rr, 2) & 1);
int c = FC(rr, cc);
#ifdef __SSE2__
vfloat shifthfracv = F2V(shifthfrac[c]);
vfloat shiftvfracv = F2V(shiftvfrac[c]);
for (; cc < cc1 - 10; cc += 8) {
//perform CA correction using colour ratios or colour differences
float Ginthfloor = (1 - shifthfrac[c]) * rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]] + (shifthfrac[c]) * rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]];
float Ginthceil = (1 - shifthfrac[c]) * rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]] + (shifthfrac[c]) * rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]];
vfloat Ginthfloorv = vintpf(shifthfracv, LC2VFU(rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]]), LC2VFU(rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]]));
vfloat Ginthceilv = vintpf(shifthfracv, LC2VFU(rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]]), LC2VFU(rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]]));
//Gint is bilinear interpolation of G at CA shift point
float Gint = (1 - shiftvfrac[c]) * Ginthfloor + (shiftvfrac[c]) * Ginthceil;
vfloat Gintv = vintpf(shiftvfracv, Ginthceilv, Ginthfloorv);
//determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
//but first we need to interpolate G-R/G-B to grid points...
STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LC2VFU(rgb[c][(rr) * ts + cc]));
STVFU(gshift[((rr)*ts + cc) >> 1], Gintv);
}
#endif
for (; cc < cc1 - 4; cc += 2) {
//perform CA correction using colour ratios or colour differences
float Ginthfloor = intp(shifthfrac[c], rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]], rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]]);
float Ginthceil = intp(shifthfrac[c], rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]], rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]]);
//Gint is bilinear interpolation of G at CA shift point
float Gint = intp(shiftvfrac[c], Ginthceil, Ginthfloor);
//determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point
//but first we need to interpolate G-R/G-B to grid points...
grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][(rr) * ts + cc];
gshift[((rr)*ts + cc) >> 1] = Gint;
}
}
shifthfrac[0] /= 2.f;
shifthfrac[2] /= 2.f;
shiftvfrac[0] /= 2.f;
shiftvfrac[2] /= 2.f;
// this loop does not deserve vectorization in mainly because the most expensive part with the divisions does not happen often (less than 1/10 in my tests)
for (int rr = 8; rr < rr1 - 8; rr++)
for (int cc = 8 + (FC(rr, 2) & 1), c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) {
float grbdiffold = rgb[1][indx] - rgb[c][indx];
//interpolate colour difference from optical R/B locations to grid locations
float grbdiffinthfloor = (1.0f - shifthfrac[c] / 2.0f) * grbdiff[indx >> 1] + (shifthfrac[c] / 2.0f) * grbdiff[(indx - GRBdir[1][c]) >> 1];
float grbdiffinthceil = (1.0f - shifthfrac[c] / 2.0f) * grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1] + (shifthfrac[c] / 2.0f) * grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1];
float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir[1][c]) >> 1], grbdiff[indx >> 1]);
float grbdiffinthceil = intp(shifthfrac[c], grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1], grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]);
//grbdiffint is bilinear interpolation of G-R/G-B at grid point
float grbdiffint = (1.0f - shiftvfrac[c] / 2.0f) * grbdiffinthfloor + (shiftvfrac[c] / 2.0f) * grbdiffinthceil;
float grbdiffint = intp(shiftvfrac[c], grbdiffinthceil, grbdiffinthfloor);
//now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point
float RBint = rgb[1][indx] - grbdiffint;
@@ -949,6 +1045,4 @@ BENCHFUN
if(plistener) {
plistener->setProgress(1.0);
}
#undef PIX_SORT
}