diff --git a/rtengine/CA_correct_RT.cc b/rtengine/CA_correct_RT.cc index c496e83cd..5cd3ab997 100644 --- a/rtengine/CA_correct_RT.cc +++ b/rtengine/CA_correct_RT.cc @@ -108,6 +108,14 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution) //end of linear equation solver //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +inline void pixSort(float &a, float &b) { + if (a > b) { + float temp = a; + a = b; + b = temp; + } +} + } using namespace std; @@ -122,8 +130,6 @@ BENCHFUN //shifts to location of vertical and diagonal neighbors constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, v4 = 4 * ts; //, p1=-ts+1, p2=-2*ts+2, p3=-3*ts+3, m1=ts+1, m2=2*ts+2, m3=3*ts+3; -#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} } - // Test for RGB cfa for(int i = 0; i < 2; i++) for(int j = 0; j < 2; j++) @@ -152,10 +158,6 @@ BENCHFUN // Because we can't break parallel processing, we need a switch do handle the errors bool processpasstwo = true; - //block CA shift values and weight assigned to block - float (*blockwt); // vblsz*hblsz - float (*blockshifts)[2][2]; // vblsz*hblsz*3*2 - constexpr int border = 8; constexpr int border2 = 16; @@ -164,11 +166,11 @@ BENCHFUN const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1); const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1); - char *buffer1 = (char *) malloc(vblsz * hblsz * (2 * 2 + 1) * sizeof(float)); - memset(buffer1, 0, vblsz * hblsz * (2 * 2 + 1)*sizeof(float)); - // block CA shifts - blockwt = (float (*)) (buffer1); - blockshifts = (float (*)[2][2]) (buffer1 + (vblsz * hblsz * sizeof(float))); + char *buffer1 = (char *) calloc(vblsz * hblsz * (2 * 2 + 1), sizeof(float)); + + //block CA shift values and weight assigned to block + float *blockwt = (float*)buffer1; + float (*blockshifts)[2][2] = (float (*)[2][2])(buffer1 + (vblsz * hblsz * sizeof(float))); double fitparams[2][2][16]; @@ -177,8 +179,7 @@ BENCHFUN constexpr float eps = 1e-5f, eps2 = 1e-10f; //tolerance to avoid dividing by zero - - #pragma omp parallel shared(Gtmp,blockave,blocksqave,blockdenom,blockvar,blockwt,blockshifts,fitparams,polyord,numpar) + #pragma omp parallel { int progresscounter = 0; @@ -197,12 +198,10 @@ BENCHFUN //per thread data for evaluation of block CA shift variance float blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}}; - /* assign working space; this would not be necessary - if the algorithm is part of the larger pre-interpolation processing */ - char *buffer = (char *) malloc(3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 64); - memset(buffer, 0, 3 * sizeof(float)*ts * ts + 6 * sizeof(float)*ts * tsh + 8 * 64 + 64); - - char *data = buffer; + // assign working space + constexpr int buffersize = 3 * sizeof(float) * ts * ts + 6 * sizeof(float) * ts * tsh + 8 * 64 + 63; + char *buffer = (char *) calloc(buffersize, 1); + char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64); // shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache @@ -231,7 +230,7 @@ BENCHFUN if (autoCA) { - // Main algorithm: Tile loop + // Main algorithm: Tile loop calculating correction parameters per tile #pragma omp for collapse(2) schedule(dynamic) nowait for (int top = -border ; top < height; top += ts - border2) for (int left = -border; left < width; left += ts - border2) { @@ -247,7 +246,7 @@ BENCHFUN const int ccmax = right > width ? width - left : cc1; // rgb from input CFA data - // rgb values should be floating point number between 0 and 1 + // rgb values should be floating point numbers between 0 and 1 // after white balance multipliers are applied for (int rr = rrmin; rr < rrmax; rr++) @@ -328,41 +327,51 @@ BENCHFUN //end of border fill // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - for (int dir = 0; dir < 2; dir++) - for (int k = 0; k < 3; k++) - for (int c = 0; c < 2; c++) { - coeff[dir][k][c] = 0; - } - //end of initialization - for (int rr = 3; rr < rr1 - 3; rr++) - for (int row = rr + top, cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) { - int col = cc + left; - int c = FC(rr, cc); +#ifdef __SSE2__ + vfloat onev = F2V(1.f); + vfloat epsv = F2V(eps); +#endif + for (int rr = 3; rr < rr1 - 3; rr++) { + int row = rr + top; + int cc = 3 + (FC(rr,3) & 1); + int indx = rr * ts + cc; + int c = FC(rr,cc); +#ifdef __SSE2__ + for (; cc < cc1 - 9; cc+=8, indx+=8) { + //compute directional weights using image gradients + vfloat wtuv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx - v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - v2])) + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx - v3]))); + vfloat wtdv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - v1]) - LC2VFU(rgb[1][indx + v1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + v2])) + vabsf(LC2VFU(rgb[1][indx + v1]) - LC2VFU(rgb[1][indx + v3]))); + vfloat wtlv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx - 2])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3]))); + vfloat wtrv = onev / SQRV(epsv + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx + 1])) + vabsf(LC2VFU(rgb[c][indx]) - LC2VFU(rgb[c][indx + 2])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3]))); - if (c != 1) { - //compute directional weights using image gradients - float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3])); - float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3])); - float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3])); - float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3])); + //store in rgb array the interpolated G value at R/B grid points using directional weighted average + STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv)); + } - //store in rgb array the interpolated G value at R/B grid points using directional weighted average - rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr); - } +#endif + for (; cc < cc1 - 3; cc+=2, indx+=2) { + //compute directional weights using image gradients + float wtu = 1.f / SQR(eps + fabsf(rgb[1][indx + v1] - rgb[1][indx - v1]) + fabsf(rgb[c][indx] - rgb[c][indx - v2]) + fabsf(rgb[1][indx - v1] - rgb[1][indx - v3])); + float wtd = 1.f / SQR(eps + fabsf(rgb[1][indx - v1] - rgb[1][indx + v1]) + fabsf(rgb[c][indx] - rgb[c][indx + v2]) + fabsf(rgb[1][indx + v1] - rgb[1][indx + v3])); + float wtl = 1.f / SQR(eps + fabsf(rgb[1][indx + 1] - rgb[1][indx - 1]) + fabsf(rgb[c][indx] - rgb[c][indx - 2]) + fabsf(rgb[1][indx - 1] - rgb[1][indx - 3])); + float wtr = 1.f / SQR(eps + fabsf(rgb[1][indx - 1] - rgb[1][indx + 1]) + fabsf(rgb[c][indx] - rgb[c][indx + 2]) + fabsf(rgb[1][indx + 1] - rgb[1][indx + 3])); - if (row > -1 && row < height && col > -1 && col < width) { + //store in rgb array the interpolated G value at R/B grid points using directional weighted average + rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr); + } + + if (row > -1 && row < height) { + for(int col = max(left + 3, 0), indx = rr * ts + 3 - (left < 0 ? (left+3) : 0); col < min(cc1 + left - 3, width); col++, indx++) { Gtmp[row * width + col] = rgb[1][indx]; } } + } //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #ifdef __SSE2__ - vfloat epsv = F2V(eps); vfloat zd25v = F2V(0.25f); #endif for (int rr = 4; rr < rr1 - 4; rr++) { @@ -381,18 +390,18 @@ BENCHFUN STVFU(rbhpfh[indx >> 1], temp2v); //low and high pass 1D filters of G in vertical/horizontal directions - vfloat glpfv = zd25v * (vmul2f(rgb1v) + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2])); - vfloat glpfh = zd25v * (vmul2f(rgb1v) + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2])); - STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfv - zd25v * (vmul2f(rgbcv) + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])))); - STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfh - zd25v * (vmul2f(rgbcv) + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])))); - STVFU(grblpfv[indx >> 1], glpfv + zd25v * (vmul2f(LC2VFU(rgb[c][indx])) + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))); - STVFU(grblpfh[indx >> 1], glpfh + zd25v * (vmul2f(LC2VFU(rgb[c][indx])) + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))); + rgb1v = vmul2f(rgb1v); + vfloat glpfvv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + v2]) + LC2VFU(rgb[1][indx - v2])); + vfloat glpfhv = zd25v * (rgb1v + LC2VFU(rgb[1][indx + 2]) + LC2VFU(rgb[1][indx - 2])); + rgbcv = vmul2f(rgbcv); + STVFU(rblpfv[indx >> 1], epsv + vabsf(glpfvv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2])))); + STVFU(rblpfh[indx >> 1], epsv + vabsf(glpfhv - zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2])))); + STVFU(grblpfv[indx >> 1], glpfvv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + v2]) + LC2VFU(rgb[c][indx - v2]))); + STVFU(grblpfh[indx >> 1], glpfhv + zd25v * (rgbcv + LC2VFU(rgb[c][indx + 2]) + LC2VFU(rgb[c][indx - 2]))); } #endif for (; cc < cc1 - 4; cc += 2, indx += 2) { - - rbhpfv[indx >> 1] = fabsf(fabsf((rgb[1][indx] - rgb[c][indx]) - (rgb[1][indx + v4] - rgb[c][indx + v4])) + fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx] - rgb[c][indx])) - fabsf((rgb[1][indx - v4] - rgb[c][indx - v4]) - (rgb[1][indx + v4] - rgb[c][indx + v4]))); @@ -410,10 +419,72 @@ BENCHFUN } } + for (int dir = 0; dir < 2; dir++) { + for (int k = 0; k < 3; k++) { + for (int c = 0; c < 2; c++) { + coeff[dir][k][c] = 0; + } + } + } + +#ifdef __SSE2__ + vfloat zd3125v = F2V(0.3125f); + vfloat zd09375v = F2V(0.09375f); + vfloat zd1v = F2V(0.1f); + vfloat zd125v = F2V(0.125f); +#endif + // along line segments, find the point along each segment that minimizes the colour variance // averaged over the tile; evaluate for up/down and left/right away from R/B grid point - for (int rr = 8; rr < rr1 - 8; rr++) - for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = FC(rr, cc); cc < cc1 - 8; cc += 2, indx += 2) { + for (int rr = 8; rr < rr1 - 8; rr++) { + int cc = 8 + (FC(rr, 2) & 1); + int indx = rr * ts + cc; + int c = FC(rr, cc); +#ifdef __SSE2__ + vfloat coeff00v = ZEROV; + vfloat coeff01v = ZEROV; + vfloat coeff02v = ZEROV; + vfloat coeff10v = ZEROV; + vfloat coeff11v = ZEROV; + vfloat coeff12v = ZEROV; + for (; cc < cc1 - 14; cc += 8, indx += 8) { + + //in linear interpolation, colour differences are a quadratic function of interpolation position; + //solve for the interpolation position that minimizes colour difference variance over the tile + + //vertical + vfloat gdiffv = zd3125v * (LC2VFU(rgb[1][indx + ts]) - LC2VFU(rgb[1][indx - ts])) + zd09375v * (LC2VFU(rgb[1][indx + ts + 1]) - LC2VFU(rgb[1][indx - ts + 1]) + LC2VFU(rgb[1][indx + ts - 1]) - LC2VFU(rgb[1][indx - ts - 1])); + vfloat deltgrbv = LC2VFU(rgb[c][indx]) - LC2VFU(rgb[1][indx]); + + vfloat gradwtv = vabsf(zd25v * LVFU(rbhpfv[indx >> 1]) + zd125v * (LVFU(rbhpfv[(indx >> 1) + 1]) + LVFU(rbhpfv[(indx >> 1) - 1])) ) * (LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(grblpfv[(indx >> 1) + v1])) / (epsv + zd1v * LVFU(grblpfv[(indx >> 1) - v1]) + LVFU(rblpfv[(indx >> 1) - v1]) + zd1v * LVFU(grblpfv[(indx >> 1) + v1]) + LVFU(rblpfv[(indx >> 1) + v1])); + + coeff00v += gradwtv * deltgrbv * deltgrbv; + coeff01v += gradwtv * gdiffv * deltgrbv; + coeff02v += gradwtv * gdiffv * gdiffv; + + //horizontal + gdiffv = zd3125v * (LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1])) + zd09375v * (LC2VFU(rgb[1][indx + 1 + ts]) - LC2VFU(rgb[1][indx - 1 + ts]) + LC2VFU(rgb[1][indx + 1 - ts]) - LC2VFU(rgb[1][indx - 1 - ts])); + + gradwtv = vabsf(zd25v * LVFU(rbhpfh[indx >> 1]) + zd125v * (LVFU(rbhpfh[(indx >> 1) + v1]) + LVFU(rbhpfh[(indx >> 1) - v1])) ) * (LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(grblpfh[(indx >> 1) + 1])) / (epsv + zd1v * LVFU(grblpfh[(indx >> 1) - 1]) + LVFU(rblpfh[(indx >> 1) - 1]) + zd1v * LVFU(grblpfh[(indx >> 1) + 1]) + LVFU(rblpfh[(indx >> 1) + 1])); + + coeff10v += gradwtv * deltgrbv * deltgrbv; + coeff11v += gradwtv * gdiffv * deltgrbv; + coeff12v += gradwtv * gdiffv * gdiffv; + + // In Mathematica, + // f[x_]=Expand[Total[Flatten[ + // ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]]; + // extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2] + } + coeff[0][0][c>>1] += vhadd(coeff00v); + coeff[0][1][c>>1] += vhadd(coeff01v); + coeff[0][2][c>>1] += vhadd(coeff02v); + coeff[1][0][c>>1] += vhadd(coeff10v); + coeff[1][1][c>>1] += vhadd(coeff11v); + coeff[1][2][c>>1] += vhadd(coeff12v); + +#endif + for (; cc < cc1 - 8; cc += 2, indx += 2) { //in linear interpolation, colour differences are a quadratic function of interpolation position; //solve for the interpolation position that minimizes colour difference variance over the tile @@ -442,7 +513,7 @@ BENCHFUN // ((1-x) RotateLeft[Gint,shift1]+x RotateLeft[Gint,shift2]-cfapad)^2[[dv;;-1;;2,dh;;-1;;2]]]]]; // extremum = -.5Coefficient[f[x],x]/Coefficient[f[x],x^2] } - + } for (int c = 0; c < 2; c++) { for (int dir = 0; dir < 2; dir++) { // vert/hor @@ -462,7 +533,6 @@ BENCHFUN //dir : 0=vert, 1=hor //offset gives NW corner of square containing the min; dir : 0=vert, 1=hor - if (fabsf(CAshift[dir][c]) < 2.0f) { blockavethr[dir][c] += CAshift[dir][c]; blocksqavethr[dir][c] += SQR(CAshift[dir][c]); @@ -543,7 +613,7 @@ BENCHFUN //end of filling border pixels of blockshift array //initialize fit arrays - double polymat[2][2][256], shiftmat[2][2][16]; + double polymat[2][2][256], shiftmat[2][2][16]; for (int i = 0; i < 256; i++) { polymat[0][0][i] = polymat[0][1][i] = polymat[1][0][i] = polymat[1][1][i] = 0; @@ -562,7 +632,7 @@ BENCHFUN float bstemp[2]; for (int dir = 0; dir < 2; dir++) { //temporary storage for median filter - float temp, p[9]; + float p[9]; p[0] = blockshifts[(vblock - 1) * hblsz + hblock - 1][c][dir]; p[1] = blockshifts[(vblock - 1) * hblsz + hblock][c][dir]; p[2] = blockshifts[(vblock - 1) * hblsz + hblock + 1][c][dir]; @@ -572,29 +642,29 @@ BENCHFUN p[6] = blockshifts[(vblock + 1) * hblsz + hblock - 1][c][dir]; p[7] = blockshifts[(vblock + 1) * hblsz + hblock][c][dir]; p[8] = blockshifts[(vblock + 1) * hblsz + hblock + 1][c][dir]; - PIX_SORT(p[1], p[2]); - PIX_SORT(p[4], p[5]); - PIX_SORT(p[7], p[8]); - PIX_SORT(p[0], p[1]); - PIX_SORT(p[3], p[4]); - PIX_SORT(p[6], p[7]); - PIX_SORT(p[1], p[2]); - PIX_SORT(p[4], p[5]); - PIX_SORT(p[7], p[8]); - PIX_SORT(p[0], p[3]); - PIX_SORT(p[5], p[8]); - PIX_SORT(p[4], p[7]); - PIX_SORT(p[3], p[6]); - PIX_SORT(p[1], p[4]); - PIX_SORT(p[2], p[5]); - PIX_SORT(p[4], p[7]); - PIX_SORT(p[4], p[2]); - PIX_SORT(p[6], p[4]); - PIX_SORT(p[4], p[2]); + pixSort(p[1], p[2]); + pixSort(p[4], p[5]); + pixSort(p[7], p[8]); + pixSort(p[0], p[1]); + pixSort(p[3], p[4]); + pixSort(p[6], p[7]); + pixSort(p[1], p[2]); + pixSort(p[4], p[5]); + pixSort(p[7], p[8]); + pixSort(p[0], p[3]); + pixSort(p[5], p[8]); + pixSort(p[4], p[7]); + pixSort(p[3], p[6]); + pixSort(p[1], p[4]); + pixSort(p[2], p[5]); + pixSort(p[4], p[7]); + pixSort(p[4], p[2]); + pixSort(p[6], p[4]); + pixSort(p[4], p[2]); bstemp[dir] = p[4]; } - //now prepare coefficient matrix; use only data points within two std devs of zero + //now prepare coefficient matrix; use only data points within caautostrength/2 std devs of zero if (SQR(bstemp[0]) > caautostrength * blockvar[0][c] || SQR(bstemp[1]) > caautostrength * blockvar[1][c]) { continue; } @@ -623,6 +693,7 @@ BENCHFUN }//dir }//c }//blocks + numblox[1] = min(numblox[0], numblox[1]); //if too few data points, restrict the order of the fit to linear @@ -841,31 +912,56 @@ BENCHFUN } - for (int rr = 4; rr < rr1 - 4; rr++) - for (int cc = 4 + (FC(rr, 2) & 1), c = FC(rr, cc); cc < cc1 - 4; cc += 2) { + for (int rr = 4; rr < rr1 - 4; rr++) { + int cc = 4 + (FC(rr, 2) & 1); + int c = FC(rr, cc); +#ifdef __SSE2__ + vfloat shifthfracv = F2V(shifthfrac[c]); + vfloat shiftvfracv = F2V(shiftvfrac[c]); + for (; cc < cc1 - 10; cc += 8) { //perform CA correction using colour ratios or colour differences - - float Ginthfloor = (1 - shifthfrac[c]) * rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]] + (shifthfrac[c]) * rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]]; - float Ginthceil = (1 - shifthfrac[c]) * rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]] + (shifthfrac[c]) * rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]]; + vfloat Ginthfloorv = vintpf(shifthfracv, LC2VFU(rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]]), LC2VFU(rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]])); + vfloat Ginthceilv = vintpf(shifthfracv, LC2VFU(rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]]), LC2VFU(rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]])); //Gint is bilinear interpolation of G at CA shift point - float Gint = (1 - shiftvfrac[c]) * Ginthfloor + (shiftvfrac[c]) * Ginthceil; + vfloat Gintv = vintpf(shiftvfracv, Ginthceilv, Ginthfloorv); + + //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point + //but first we need to interpolate G-R/G-B to grid points... + STVFU(grbdiff[((rr)*ts + cc) >> 1], Gintv - LC2VFU(rgb[c][(rr) * ts + cc])); + STVFU(gshift[((rr)*ts + cc) >> 1], Gintv); + } + +#endif + for (; cc < cc1 - 4; cc += 2) { + //perform CA correction using colour ratios or colour differences + float Ginthfloor = intp(shifthfrac[c], rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthceil[c]], rgb[1][(rr + shiftvfloor[c]) * ts + cc + shifthfloor[c]]); + float Ginthceil = intp(shifthfrac[c], rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthceil[c]], rgb[1][(rr + shiftvceil[c]) * ts + cc + shifthfloor[c]]); + //Gint is bilinear interpolation of G at CA shift point + float Gint = intp(shiftvfrac[c], Ginthceil, Ginthfloor); //determine R/B at grid points using colour differences at shift point plus interpolated G value at grid point //but first we need to interpolate G-R/G-B to grid points... grbdiff[((rr)*ts + cc) >> 1] = Gint - rgb[c][(rr) * ts + cc]; gshift[((rr)*ts + cc) >> 1] = Gint; } + } + shifthfrac[0] /= 2.f; + shifthfrac[2] /= 2.f; + shiftvfrac[0] /= 2.f; + shiftvfrac[2] /= 2.f; + + // this loop does not deserve vectorization in mainly because the most expensive part with the divisions does not happen often (less than 1/10 in my tests) for (int rr = 8; rr < rr1 - 8; rr++) for (int cc = 8 + (FC(rr, 2) & 1), c = FC(rr, cc), indx = rr * ts + cc; cc < cc1 - 8; cc += 2, indx += 2) { float grbdiffold = rgb[1][indx] - rgb[c][indx]; //interpolate colour difference from optical R/B locations to grid locations - float grbdiffinthfloor = (1.0f - shifthfrac[c] / 2.0f) * grbdiff[indx >> 1] + (shifthfrac[c] / 2.0f) * grbdiff[(indx - GRBdir[1][c]) >> 1]; - float grbdiffinthceil = (1.0f - shifthfrac[c] / 2.0f) * grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1] + (shifthfrac[c] / 2.0f) * grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1]; + float grbdiffinthfloor = intp(shifthfrac[c], grbdiff[(indx - GRBdir[1][c]) >> 1], grbdiff[indx >> 1]); + float grbdiffinthceil = intp(shifthfrac[c], grbdiff[((rr - GRBdir[0][c]) * ts + cc - GRBdir[1][c]) >> 1], grbdiff[((rr - GRBdir[0][c]) * ts + cc) >> 1]); //grbdiffint is bilinear interpolation of G-R/G-B at grid point - float grbdiffint = (1.0f - shiftvfrac[c] / 2.0f) * grbdiffinthfloor + (shiftvfrac[c] / 2.0f) * grbdiffinthceil; + float grbdiffint = intp(shiftvfrac[c], grbdiffinthceil, grbdiffinthfloor); //now determine R/B at grid points using interpolated colour differences and interpolated G value at grid point float RBint = rgb[1][indx] - grbdiffint; @@ -949,6 +1045,4 @@ BENCHFUN if(plistener) { plistener->setProgress(1.0); } - -#undef PIX_SORT } diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc index e6d207abc..55405e420 100644 --- a/rtengine/procparams.cc +++ b/rtengine/procparams.cc @@ -888,7 +888,7 @@ void RAWParams::setDefaults() ff_clipControl = 0; cared = 0; cablue = 0; - caautostrength = 4; + caautostrength = 6; ca_autocorrect = false; hotPixelFilter = false; deadPixelFilter = false;