Speedup for ca-correction of pixelshift files, also fixed a memory leak

This commit is contained in:
heckflosse 2018-05-20 13:59:50 +02:00
parent 68fabf0be5
commit ac78dd311e
3 changed files with 83 additions and 83 deletions

View File

@ -111,7 +111,7 @@ bool LinEqSolve(int nDim, double* pfMatr, double* pfVect, double* pfSolution)
using namespace std;
using namespace rtengine;
void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
float* RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer, bool fitParamsIn, bool fitParamsOut, float *buffer, bool freeBuffer)
{
// multithreaded and vectorized by Ingo Weyrich
constexpr int ts = 128;
@ -124,7 +124,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
for(int j = 0; j < 2; j++)
if(FC(i, j) == 3) {
printf("CA correction supports only RGB Colour filter arrays\n");
return;
return buffer;
}
volatile double progress = 0.0;
@ -135,19 +135,6 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
// local variables
const int width = W + (W & 1), height = H;
//temporary array to store simple interpolation of G
if (!buffer) {
buffer = (float (*)) malloc ((height * width) / 2 * sizeof (float) + (height * width) * sizeof(float) / 2);
}
float *Gtmp = buffer;
float *RawDataTmp = buffer + (height * width) / 2;
float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
// Because we can't break parallel processing, we need a switch do handle the errors
bool processpasstwo = true;
constexpr int border = 8;
constexpr int border2 = 16;
@ -156,13 +143,27 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
const int vblsz = ceil((float)(height + border2) / (ts - border2) + 2 + vz1);
const int hblsz = ceil((float)(width + border2) / (ts - border2) + 2 + hz1);
//temporary array to store simple interpolation of G
if (!buffer) {
buffer = static_cast<float*>(malloc ((height * width + vblsz * hblsz * (2 * 2 + 1)) * sizeof(float)));
}
float *Gtmp = buffer;
float *RawDataTmp = buffer + (height * width) / 2;
//block CA shift values and weight assigned to block
float* const blockwt = static_cast<float*>(calloc(vblsz * hblsz * (2 * 2 + 1), sizeof(float)));
float *const blockwt = buffer + (height * width);
memset(blockwt, 0, vblsz * hblsz * (2 * 2 + 1) * sizeof(float));
float (*blockshifts)[2][2] = (float (*)[2][2])(blockwt + vblsz * hblsz);
float blockave[2][2] = {{0, 0}, {0, 0}}, blocksqave[2][2] = {{0, 0}, {0, 0}}, blockdenom[2][2] = {{0, 0}, {0, 0}}, blockvar[2][2];
// Because we can't break parallel processing, we need a switch do handle the errors
bool processpasstwo = true;
double fitparams[2][2][16];
const bool fitParamsSet = fitParamsTransfer && fitParamsIn;
if(autoCA && fitParamsSet) {
// use stored parameters
int index = 0;
for(int c = 0; c < 2; ++c) {
for(int d = 0; d < 2; ++d) {
@ -186,22 +187,18 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
int shifthfloor[3], shiftvfloor[3], shifthceil[3], shiftvceil[3];
//local quadratic fit to shift data within a tile
float coeff[2][3][2];
//measured CA shift parameters for a tile
float CAshift[2][2];
//polynomial fit coefficients
//residual CA shift amount within a plaquette
float shifthfrac[3], shiftvfrac[3];
//per thread data for evaluation of block CA shift variance
float blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
// assign working space
constexpr int buffersize = sizeof(float) * ts * ts + 8 * sizeof(float) * ts * tsh + 8 * 64 + 63;
char *bufferThr = (char *) malloc(buffersize);
char *data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
constexpr int buffersizePassTwo = sizeof(float) * ts * ts + 4 * sizeof(float) * ts * tsh + 4 * 64 + 63;
char * const bufferThr = (char *) malloc((autoCA && !fitParamsSet) ? buffersize : buffersizePassTwo);
// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <=4-way associative L1-Cache
char * const data = (char*)( ( uintptr_t(bufferThr) + uintptr_t(63)) / 64 * 64);
// shift the beginning of all arrays but the first by 64 bytes to avoid cache miss conflicts on CPUs which have <= 4-way associative L1-Cache
//rgb data in a tile
float* rgb[3];
@ -209,25 +206,29 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
rgb[1] = (float (*)) (data + sizeof(float) * ts * tsh + 1 * 64);
rgb[2] = (float (*)) (data + sizeof(float) * (ts * ts + ts * tsh) + 2 * 64);
//high pass filter for R/B in vertical direction
float *rbhpfh = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
//high pass filter for R/B in horizontal direction
float *rbhpfv = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
//low pass filter for R/B in horizontal direction
float *rblpfh = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
//low pass filter for R/B in vertical direction
float *rblpfv = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
//low pass filter for colour differences in horizontal direction
float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
//low pass filter for colour differences in vertical direction
float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
float *grbdiff = rbhpfh; // there is no overlap in buffer usage => share
//green interpolated to optical sample points for R/B
float *gshift = rbhpfv; // there is no overlap in buffer usage => share
if (autoCA && !fitParamsSet) {
//high pass filter for R/B in vertical direction
float *rbhpfh = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64);
//high pass filter for R/B in horizontal direction
float *rbhpfv = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64);
//low pass filter for R/B in horizontal direction
float *rblpfh = (float (*)) (data + 3 * sizeof(float) * ts * ts + 5 * 64);
//low pass filter for R/B in vertical direction
float *rblpfv = (float (*)) (data + 3 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 6 * 64);
//low pass filter for colour differences in horizontal direction
float *grblpfh = (float (*)) (data + 4 * sizeof(float) * ts * ts + 7 * 64);
//low pass filter for colour differences in vertical direction
float *grblpfv = (float (*)) (data + 4 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 8 * 64);
// Main algorithm: Tile loop calculating correction parameters per tile
//local quadratic fit to shift data within a tile
float coeff[2][3][2];
//measured CA shift parameters for a tile
float CAshift[2][2];
//per thread data for evaluation of block CA shift variance
float blockavethr[2][2] = {{0, 0}, {0, 0}}, blocksqavethr[2][2] = {{0, 0}, {0, 0}}, blockdenomthr[2][2] = {{0, 0}, {0, 0}};
#pragma omp for collapse(2) schedule(dynamic) nowait
for (int top = -border ; top < height; top += ts - border2)
for (int left = -border; left < width - (W & 1); left += ts - border2) {
@ -763,11 +764,14 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
// Main algorithm: Tile loop
if(processpasstwo) {
float *grbdiff = (float (*)) (data + 2 * sizeof(float) * ts * ts + 3 * 64); // there is no overlap in buffer usage => share
//green interpolated to optical sample points for R/B
float *gshift = (float (*)) (data + 2 * sizeof(float) * ts * ts + sizeof(float) * ts * tsh + 4 * 64); // there is no overlap in buffer usage => share
#pragma omp for schedule(dynamic) collapse(2) nowait
for (int top = -border; top < height; top += ts - border2)
for (int left = -border; left < width - (W & 1); left += ts - border2) {
memset(bufferThr, 0, buffersize);
memset(bufferThr, 0, buffersizePassTwo);
float lblockshifts[2][2];
const int vblock = ((top + border) / (ts - border2)) + 1;
const int hblock = ((left + border) / (ts - border2)) + 1;
@ -913,25 +917,42 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
//end of border fill
// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if (!autoCA) {
if (!autoCA || fitParamsIn) {
#ifdef __SSE2__
const vfloat onev = F2V(1.f);
const vfloat epsv = F2V(eps);
#endif
//manual CA correction; use red/blue slider values to set CA shift parameters
for (int rr = 3; rr < rr1 - 3; rr++)
for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
int c = FC(rr, cc);
if (c != 1) {
//compute directional weights using image gradients
float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
}
for (int rr = 3; rr < rr1 - 3; rr++) {
int cc = 3 + FC(rr, 1), c = FC(rr,cc), indx = rr * ts + cc;
#ifdef __SSE2__
for (; cc < cc1 - 10; cc += 8, indx += 8) {
//compute directional weights using image gradients
vfloat val1v = epsv + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 1) * ts + cc]));
vfloat val2v = epsv + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx - 1]));
vfloat wtuv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr - 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr - 1) * ts + cc]) - LC2VFU(rgb[1][(rr - 3) * ts + cc])));
vfloat wtdv = onev / SQRV(val1v + vabsf(LVFU(rgb[c][(rr * ts + cc) >> 1]) - LVFU(rgb[c][((rr + 2) * ts + cc) >> 1])) + vabsf(LC2VFU(rgb[1][(rr + 1) * ts + cc]) - LC2VFU(rgb[1][(rr + 3) * ts + cc])));
vfloat wtlv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx - 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx - 1]) - LC2VFU(rgb[1][indx - 3])));
vfloat wtrv = onev / SQRV(val2v + vabsf(LVFU(rgb[c][indx >> 1]) - LVFU(rgb[c][(indx + 2) >> 1])) + vabsf(LC2VFU(rgb[1][indx + 1]) - LC2VFU(rgb[1][indx + 3])));
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
STC2VFU(rgb[1][indx], (wtuv * LC2VFU(rgb[1][indx - v1]) + wtdv * LC2VFU(rgb[1][indx + v1]) + wtlv * LC2VFU(rgb[1][indx - 1]) + wtrv * LC2VFU(rgb[1][indx + 1])) / (wtuv + wtdv + wtlv + wtrv));
}
#endif
for (; cc < cc1 - 3; cc += 2, indx += 2) {
//compute directional weights using image gradients
float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
}
}
}
if (!autoCA) {
float hfrac = -((float)(hblock - 0.5) / (hblsz - 2) - 0.5);
float vfrac = -((float)(vblock - 0.5) / (vblsz - 2) - 0.5) * height / width;
lblockshifts[0][0] = 2 * vfrac * cared;
@ -940,32 +961,12 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
lblockshifts[1][1] = 2 * hfrac * cablue;
} else {
//CA auto correction; use CA diagnostic pass to set shift parameters
if (fitParamsIn) {
for (int rr = 3; rr < rr1 - 3; rr++) {
for (int cc = 3, indx = rr * ts + cc; cc < cc1 - 3; cc++, indx++) {
int c = FC(rr, cc);
if (c != 1) {
//compute directional weights using image gradients
float wtu = 1.f / SQR(eps + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr - 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr - 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr - 3) * ts + cc]));
float wtd = 1.f / SQR(eps + fabsf(rgb[1][(rr - 1) * ts + cc] - rgb[1][(rr + 1) * ts + cc]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][((rr + 2) * ts + cc) >> 1]) + fabsf(rgb[1][(rr + 1) * ts + cc] - rgb[1][(rr + 3) * ts + cc]));
float wtl = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc - 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc - 2) >> 1]) + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc - 3]));
float wtr = 1.f / SQR(eps + fabsf(rgb[1][rr * ts + cc - 1] - rgb[1][rr * ts + cc + 1]) + fabsf(rgb[c][(rr * ts + cc) >> 1] - rgb[c][(rr * ts + cc + 2) >> 1]) + fabsf(rgb[1][rr * ts + cc + 1] - rgb[1][rr * ts + cc + 3]));
//store in rgb array the interpolated G value at R/B grid points using directional weighted average
rgb[1][indx] = (wtu * rgb[1][indx - v1] + wtd * rgb[1][indx + v1] + wtl * rgb[1][indx - 1] + wtr * rgb[1][indx + 1]) / (wtu + wtd + wtl + wtr);
}
}
}
}
lblockshifts[0][0] = lblockshifts[0][1] = 0;
lblockshifts[1][0] = lblockshifts[1][1] = 0;
double powVblock = 1.0;
for (int i = 0; i < polyord; i++) {
double powHblock = powVblock;
for (int j = 0; j < polyord; j++) {
//printf("i= %d j= %d polycoeff= %f \n",i,j,fitparams[0][0][polyord*i+j]);
lblockshifts[0][0] += powHblock * fitparams[0][0][polyord * i + j];
lblockshifts[0][1] += powHblock * fitparams[0][1][polyord * i + j];
lblockshifts[1][0] += powHblock * fitparams[1][0][polyord * i + j];
@ -1187,6 +1188,7 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
}
if(autoCA && fitParamsTransfer && fitParamsOut) {
// store calculated parameters
int index = 0;
for(int c = 0; c < 2; ++c) {
for(int d = 0; d < 2; ++d) {
@ -1201,9 +1203,8 @@ void RawImageSource::CA_correct_RT(const bool autoCA, const double cared, const
free(buffer);
}
free(blockwt);
if(plistener) {
plistener->setProgress(1.0);
}
return buffer;
}

View File

@ -2017,8 +2017,7 @@ void RawImageSource::preprocess (const RAWParams &raw, const LensProfParams &le
if(numFrames == 4) {
StopWatch Stop1("ps ca correction");
double fitParams[64];
float *buffer = nullptr;
CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, buffer, false);
float *buffer = CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[0], fitParams, false, true, nullptr, false);
for(int i = 1; i < 3; ++i) {
CA_correct_RT(raw.ca_autocorrect, raw.cared, raw.cablue, 8.0, *rawDataFrames[i], fitParams, true, false, buffer, false);
}

View File

@ -245,7 +245,7 @@ protected:
inline void interpolate_row_rb (float* ar, float* ab, float* pg, float* cg, float* ng, int i);
inline void interpolate_row_rb_mul_pp (float* ar, float* ab, float* pg, float* cg, float* ng, int i, float r_mul, float g_mul, float b_mul, int x1, int width, int skip);
void CA_correct_RT (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float *buffer = nullptr, bool freeBuffer = true);
float* CA_correct_RT (const bool autoCA, const double cared, const double cablue, const double caautostrength, array2D<float> &rawData, double *fitParamsTransfer = nullptr, bool fitParamsIn = false, bool fitParamsOut = false, float * buffer = nullptr, bool freeBuffer = true);
void ddct8x8s(int isgn, float a[8][8]);
void processRawWhitepoint (float expos, float preser, array2D<float> &rawData); // exposure before interpolation