Speedup and reduced memory usage for lmmse demosaic, Issue 2665

This commit is contained in:
Ingo
2015-02-22 22:00:06 +01:00
parent 9dd4da2547
commit 4e8a326645
4 changed files with 382 additions and 213 deletions

View File

@@ -58,6 +58,7 @@ namespace rtengine {
#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
#define PIX_SORTV(av,bv) tempv = _mm_min_ps(av,bv); bv = _mm_max_ps(av,bv); av = tempv;
extern const Settings* settings;
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -1185,13 +1186,10 @@ void RawImageSource::jdl_interpolate_omp() // from "Lassus"
// IEEE Trans. on Image Processing, vol. 14, pp. 2167-2178,
// Dec. 2005.
// Adapted to RT by Jacques Desmis 3/2013
// Improved speed and reduced memory consumption by Ingo Weyrich 2/2015
//TODO Tiles to reduce memory consumption
void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
SSEFUNCTION void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
{
int c, ii;
float h0, h1, h2, h3, h4, hs;
const int width=winw, height=winh;
const int ba = 10;
const int rr1 = height + 2*ba;
@@ -1200,7 +1198,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
const int w2 = 2*w1;
const int w3 = 3*w1;
const int w4 = 4*w1;
int iter;
float h0, h1, h2, h3, h4, hs;
h0 = 1.0f;
h1 = exp( -1.0f/8.0f);
h2 = exp( -4.0f/8.0f);
@@ -1213,77 +1211,73 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
h3 /= hs;
h4 /= hs;
int passref;
if(iterations <=4) {iter = iterations-1;passref=0;}
else if (iterations <=6){iter=3;passref=iterations-4;}
else if (iterations <=8){iter=3;passref=iterations-6;}
int iter;
if(iterations <=4) {
iter = iterations-1;
passref=0;
} else if (iterations <=6) {
iter=3;
passref=iterations-4;
} else if (iterations <=8) {
iter=3;
passref=iterations-6;
}
bool applyGamma=true;
if(iterations==0) {applyGamma=false;iter=0;} else applyGamma=true;
if(iterations==0) {
applyGamma=false;
iter=0;
} else
applyGamma=true;
LUTf *gamtab;
if(applyGamma)
gamtab = &(Color::gammatab_24_17a);
else {
gamtab = new LUTf(65536, LUT_CLIP_ABOVE | LUT_CLIP_BELOW);
for(int i=0;i<65536;i++)
(*gamtab)[i] = (float)i / 65535.f;
}
if (plistener) {
plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::lmmse]));
plistener->setProgress (0.0);
}
float *imageBuffer = (float*)malloc (((((width+1)/2)*height) + (((height+1)/2)*(width+1))) * sizeof *imageBuffer);
float *image[3];
image[0] = imageBuffer;
image[1] = imageBuffer + ((height+1)/2)*((width+1)/2);
image[2] = imageBuffer + ((height+1)/2)*((width+1)/2) + (((width+1)/2)*height);
unsigned int a=0;
float *rix[6];
float *qix[6];
float *buffer = (float *)calloc(rr1*cc1*6*sizeof(float),1);
float *rix[5];
float *qix[5];
float *buffer = (float *)calloc(rr1*cc1*5*sizeof(float),1);
if(buffer == NULL) { // allocation of big block of memory failed, try to get 6 smaller ones
printf("lmmse_interpolate_omp: allocation of big memory block failed, try to get 6 smaller ones now...\n");
for(int i=0;i<6;i++)
printf("lmmse_interpolate_omp: allocation of big memory block failed, try to get 5 smaller ones now...\n");
for(int i=0;i<5;i++)
qix[i] = (float *)calloc(rr1*cc1*sizeof(float),1);
} else {
qix[0] = buffer;
for(int i=1;i<6;i++)
for(int i=1;i<5;i++)
qix[i] = qix[i-1] + rr1*cc1;
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<height; row++)
for (int col=0; col<width; col++) {
int c = fc(row,col);
image[c][(((row*width)>>(1-(c&1)))+col)>>1] = CLIP(rawData[row][col]);
}
{
if (plistener) plistener->setProgress (0.1);
}
#ifdef _OPENMP
#pragma omp parallel firstprivate (image,rix,qix,h0,h1,h2,h3,h4)
#pragma omp parallel private(rix)
#endif
{
#ifdef _OPENMP
#pragma omp for
#endif
for (int rrr=ba; rrr < rr1-ba; rrr++)
for (int rrr=ba; rrr < rr1-ba; rrr++) {
for (int ccc=ba, row=rrr-ba; ccc < cc1-ba; ccc++) {
int col = ccc - ba;
rix[4] = qix[4] + rrr*cc1 + ccc;
int c = fc(row,col);
if (applyGamma)
rix[4][0] = Color::gammatab_24_17a[image[c][(((row*width)>>(1-(c&1)))+col)>>1]];
else
rix[4][0] = (float)image[c][(((row*width)>>(1-(c&1)))+col)>>1]/65535.0f;
float *rix = qix[4] + rrr*cc1 + ccc;
rix[0] = (*gamtab)[rawData[row][col]];
}
}
#ifdef _OPENMP
#pragma omp single
#endif
{
if (plistener) plistener->setProgress (0.2);
if (plistener) plistener->setProgress (0.1);
}
// G-R(B)
#ifdef _OPENMP
#pragma omp for schedule(dynamic,16)
@@ -1327,7 +1321,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
#pragma omp single
#endif
{
if (plistener) plistener->setProgress (0.25);
if (plistener) plistener->setProgress (0.2);
}
@@ -1351,69 +1345,136 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
if (plistener) plistener->setProgress (0.3);
}
// interpolate G-R(B) at R(B)
#ifdef _OPENMP
#pragma omp for
#pragma omp for
#endif
for (int rr=4; rr < rr1-4; rr++)
for (int cc=4+(FC(rr,4)&1); cc < cc1-4; cc+=2) {
for (int rr=4; rr < rr1-4; rr++) {
int cc=4+(FC(rr,4)&1);
#ifdef __SSE2__
__m128 p1v, p2v, p3v, p4v, p5v, p6v, p7v, p8v, p9v, muv, vxv, vnv, xhv, vhv, xvv, vvv;
__m128 epsv = _mm_set1_ps(1e-7);
__m128 ninev = _mm_set1_ps(9.f);
for (; cc < cc1-10; cc+=8) {
rix[0] = qix[0] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
rix[2] = qix[2] + rr*cc1 + cc;
rix[3] = qix[3] + rr*cc1 + cc;
rix[4] = qix[4] + rr*cc1 + cc;
// horizontal
float mu = (rix[2][-4] + rix[2][-3] + rix[2][-2] + rix[2][-1] + rix[2][0]+ rix[2][ 1] + rix[2][ 2] + rix[2][ 3] + rix[2][ 4]) / 9.0f;
float p1 = rix[2][-4] - mu;
float p2 = rix[2][-3] - mu;
float p3 = rix[2][-2] - mu;
float p4 = rix[2][-1] - mu;
float p5 = rix[2][ 0] - mu;
float p6 = rix[2][ 1] - mu;
float p7 = rix[2][ 2] - mu;
float p8 = rix[2][ 3] - mu;
float p9 = rix[2][ 4] - mu;
float vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
p1 = rix[0][-4] - rix[2][-4];
p2 = rix[0][-3] - rix[2][-3];
p3 = rix[0][-2] - rix[2][-2];
p4 = rix[0][-1] - rix[2][-1];
p5 = rix[0][ 0] - rix[2][ 0];
p6 = rix[0][ 1] - rix[2][ 1];
p7 = rix[0][ 2] - rix[2][ 2];
p8 = rix[0][ 3] - rix[2][ 3];
p9 = rix[0][ 4] - rix[2][ 4];
float vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
p1v = LC2VFU(rix[2][-4]);
p2v = LC2VFU(rix[2][-3]);
p3v = LC2VFU(rix[2][-2]);
p4v = LC2VFU(rix[2][-1]);
p5v = LC2VFU(rix[2][ 0]);
p6v = LC2VFU(rix[2][ 1]);
p7v = LC2VFU(rix[2][ 2]);
p8v = LC2VFU(rix[2][ 3]);
p9v = LC2VFU(rix[2][ 4]);
muv = (p1v + p2v + p3v + p4v + p5v + p6v + p7v + p8v + p9v) / ninev;
vxv = epsv+SQRV(p1v-muv)+SQRV(p2v-muv)+SQRV(p3v-muv)+SQRV(p4v-muv)+SQRV(p5v-muv)+SQRV(p6v-muv)+SQRV(p7v-muv)+SQRV(p8v-muv)+SQRV(p9v-muv);
p1v -= LC2VFU(rix[0][-4]);
p2v -= LC2VFU(rix[0][-3]);
p3v -= LC2VFU(rix[0][-2]);
p4v -= LC2VFU(rix[0][-1]);
p5v -= LC2VFU(rix[0][ 0]);
p6v -= LC2VFU(rix[0][ 1]);
p7v -= LC2VFU(rix[0][ 2]);
p8v -= LC2VFU(rix[0][ 3]);
p9v -= LC2VFU(rix[0][ 4]);
vnv = epsv+SQRV(p1v)+SQRV(p2v)+SQRV(p3v)+SQRV(p4v)+SQRV(p5v)+SQRV(p6v)+SQRV(p7v)+SQRV(p8v)+SQRV(p9v);
xhv = (LC2VFU(rix[0][0])*vxv + LC2VFU(rix[2][0])*vnv)/(vxv + vnv);
vhv = vxv*vnv/(vxv + vnv);
// vertical
p1v = LC2VFU(rix[3][-w4]);
p2v = LC2VFU(rix[3][-w3]);
p3v = LC2VFU(rix[3][-w2]);
p4v = LC2VFU(rix[3][-w1]);
p5v = LC2VFU(rix[3][ 0]);
p6v = LC2VFU(rix[3][ w1]);
p7v = LC2VFU(rix[3][ w2]);
p8v = LC2VFU(rix[3][ w3]);
p9v = LC2VFU(rix[3][ w4]);
muv = (p1v + p2v + p3v + p4v + p5v + p6v + p7v + p8v + p9v) / ninev;
vxv = epsv+SQRV(p1v-muv)+SQRV(p2v-muv)+SQRV(p3v-muv)+SQRV(p4v-muv)+SQRV(p5v-muv)+SQRV(p6v-muv)+SQRV(p7v-muv)+SQRV(p8v-muv)+SQRV(p9v-muv);
p1v -= LC2VFU(rix[1][-w4]);
p2v -= LC2VFU(rix[1][-w3]);
p3v -= LC2VFU(rix[1][-w2]);
p4v -= LC2VFU(rix[1][-w1]);
p5v -= LC2VFU(rix[1][ 0]);
p6v -= LC2VFU(rix[1][ w1]);
p7v -= LC2VFU(rix[1][ w2]);
p8v -= LC2VFU(rix[1][ w3]);
p9v -= LC2VFU(rix[1][ w4]);
vnv = epsv+SQRV(p1v)+SQRV(p2v)+SQRV(p3v)+SQRV(p4v)+SQRV(p5v)+SQRV(p6v)+SQRV(p7v)+SQRV(p8v)+SQRV(p9v);
xvv = (LC2VFU(rix[1][0])*vxv + LC2VFU(rix[3][0])*vnv)/(vxv + vnv);
vvv = vxv*vnv/(vxv + vnv);
// interpolated G-R(B)
muv = (xhv*vvv + xvv*vhv)/(vhv + vvv);
STC2VFU(rix[4][0], muv);
}
#endif
for (; cc < cc1-4; cc+=2) {
rix[0] = qix[0] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
rix[2] = qix[2] + rr*cc1 + cc;
rix[3] = qix[3] + rr*cc1 + cc;
rix[4] = qix[4] + rr*cc1 + cc;
// horizontal
float p1 = rix[2][-4];
float p2 = rix[2][-3];
float p3 = rix[2][-2];
float p4 = rix[2][-1];
float p5 = rix[2][ 0];
float p6 = rix[2][ 1];
float p7 = rix[2][ 2];
float p8 = rix[2][ 3];
float p9 = rix[2][ 4];
float mu = (p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9) / 9.f;
float vx = 1e-7+SQR(p1-mu)+SQR(p2-mu)+SQR(p3-mu)+SQR(p4-mu)+SQR(p5-mu)+SQR(p6-mu)+SQR(p7-mu)+SQR(p8-mu)+SQR(p9-mu);
p1 -= rix[0][-4];
p2 -= rix[0][-3];
p3 -= rix[0][-2];
p4 -= rix[0][-1];
p5 -= rix[0][ 0];
p6 -= rix[0][ 1];
p7 -= rix[0][ 2];
p8 -= rix[0][ 3];
p9 -= rix[0][ 4];
float vn = 1e-7+SQR(p1)+SQR(p2)+SQR(p3)+SQR(p4)+SQR(p5)+SQR(p6)+SQR(p7)+SQR(p8)+SQR(p9);
float xh = (rix[0][0]*vx + rix[2][0]*vn)/(vx + vn);
float vh = vx*vn/(vx + vn);
// vertical
mu = (rix[3][-w4] + rix[3][-w3] + rix[3][-w2] + rix[3][-w1] + rix[3][0]+rix[3][ w1] + rix[3][ w2] + rix[3][ w3] + rix[3][ w4]) / 9.0f;
p1 = rix[3][-w4] - mu;
p2 = rix[3][-w3] - mu;
p3 = rix[3][-w2] - mu;
p4 = rix[3][-w1] - mu;
p5 = rix[3][ 0] - mu;
p6 = rix[3][ w1] - mu;
p7 = rix[3][ w2] - mu;
p8 = rix[3][ w3] - mu;
p9 = rix[3][ w4] - mu;
vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
p1 = rix[1][-w4] - rix[3][-w4];
p2 = rix[1][-w3] - rix[3][-w3];
p3 = rix[1][-w2] - rix[3][-w2];
p4 = rix[1][-w1] - rix[3][-w1];
p5 = rix[1][ 0] - rix[3][ 0];
p6 = rix[1][ w1] - rix[3][ w1];
p7 = rix[1][ w2] - rix[3][ w2];
p8 = rix[1][ w3] - rix[3][ w3];
p9 = rix[1][ w4] - rix[3][ w4];
vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
p1 = rix[3][-w4];
p2 = rix[3][-w3];
p3 = rix[3][-w2];
p4 = rix[3][-w1];
p5 = rix[3][ 0];
p6 = rix[3][ w1];
p7 = rix[3][ w2];
p8 = rix[3][ w3];
p9 = rix[3][ w4];
mu = (p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9) / 9.f;
vx = 1e-7+SQR(p1-mu)+SQR(p2-mu)+SQR(p3-mu)+SQR(p4-mu)+SQR(p5-mu)+SQR(p6-mu)+SQR(p7-mu)+SQR(p8-mu)+SQR(p9-mu);
p1 -= rix[1][-w4];
p2 -= rix[1][-w3];
p3 -= rix[1][-w2];
p4 -= rix[1][-w1];
p5 -= rix[1][ 0];
p6 -= rix[1][ w1];
p7 -= rix[1][ w2];
p8 -= rix[1][ w3];
p9 -= rix[1][ w4];
vn = 1e-7+SQR(p1)+SQR(p2)+SQR(p3)+SQR(p4)+SQR(p5)+SQR(p6)+SQR(p7)+SQR(p8)+SQR(p9);
float xv = (rix[1][0]*vx + rix[3][0]*vn)/(vx + vn);
float vv = vx*vn/(vx + vn);
// interpolated G-R(B)
rix[4][0] = (xh*vv + xv*vh)/(vh + vv);
}
}
#ifdef _OPENMP
#pragma omp single
#endif
@@ -1421,7 +1482,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
if (plistener) plistener->setProgress (0.4);
}
// copy CFA values
#ifdef _OPENMP
#pragma omp for
@@ -1432,11 +1492,8 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
int c = FC(rr,cc);
rix[c] = qix[c] + rr*cc1 + cc;
if ((row >= 0) & (row < height) & (col >= 0) & (col < width)) {
if (applyGamma)
rix[c][0] = Color::gammatab_24_17a[image[c][(((row*width)>>(1-(c&1)))+col)>>1]];
else
rix[c][0] = (float)image[c][(((row*width)>>(1-(c&1)))+col)>>1]/65535.0f;
}
rix[c][0] = (*gamtab)[rawData[row][col]];
}
else
rix[c][0] = 0.f;
if (c != 1) {
@@ -1445,6 +1502,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
rix[1][0] = rix[c][0] + rix[4][0];
}
}
#ifdef _OPENMP
#pragma omp single
#endif
@@ -1452,7 +1510,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
if (plistener) plistener->setProgress (0.5);
}
// bilinear interpolation for R/B
// interpolate R/B at G location
#ifdef _OPENMP
@@ -1468,6 +1525,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
rix[c][0] = rix[1][0]+ xdiv2f(rix[c][-w1] - rix[1][-w1] + rix[c][w1] - rix[1][w1]);
c = 2 - c;
}
#ifdef _OPENMP
#pragma omp single
#endif
@@ -1475,7 +1533,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
if (plistener) plistener->setProgress (0.6);
}
// interpolate R/B at B/R location
#ifdef _OPENMP
#pragma omp for
@@ -1486,6 +1543,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
rix[1] = qix[1] + rr*cc1 + cc;
rix[c][0] = rix[1][0]+ x0250(rix[c][-w1] - rix[1][-w1] + rix[c][ -1] - rix[1][ -1]+ rix[c][ 1] - rix[1][ 1] + rix[c][ w1] - rix[1][ w1]);
}
#ifdef _OPENMP
#pragma omp single
#endif
@@ -1499,26 +1557,44 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
for (int pass=0; pass < iter; pass++) {
for (int c=0; c < 3; c+=2) {
// Compute median(R-G) and median(B-G)
int d = c + 3;
#ifdef _OPENMP
#pragma omp parallel for firstprivate (qix)
#endif
for (int ii=0; ii < rr1*cc1; ii++)
qix[4][ii] = qix[c][ii] - qix[1][ii];
int d = c + 3 - (c == 0 ? 0 : 1);
// Apply 3x3 median filter
#ifdef _OPENMP
#pragma omp parallel for firstprivate (rix,qix)
#pragma omp parallel for private(rix)
#endif
for (int rr=1; rr < rr1-1; rr++)
for (int cc=1; cc < cc1-1; cc++) {
for (int rr=1; rr < rr1-1; rr++) {
int cc=1;
#ifdef __SSE2__
__m128 p1v,p2v,p3v,p4v,p5v,p6v,p7v,p8v,p9v,tempv;
for (; cc < cc1-4; cc+=4) {
rix[d] = qix[d] + rr*cc1 + cc;
rix[c] = qix[c] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
// Assign 3x3 differential color values
p1v = LVFU(rix[c][-w1-1])-LVFU(rix[1][-w1-1]); p2v = LVFU(rix[c][-w1])-LVFU(rix[1][-w1]); p3v = LVFU(rix[c][-w1+1])-LVFU(rix[1][-w1+1]);
p4v = LVFU(rix[c][ -1])-LVFU(rix[1][ -1]); p5v = LVFU(rix[c][ 0])-LVFU(rix[1][ 0]); p6v = LVFU(rix[c][ 1])-LVFU(rix[1][ 1]);
p7v = LVFU(rix[c][ w1-1])-LVFU(rix[1][ w1-1]); p8v = LVFU(rix[c][ w1])-LVFU(rix[1][ w1]); p9v = LVFU(rix[c][ w1+1])-LVFU(rix[1][ w1+1]);
// Sort for median of 9 values
PIX_SORTV(p2v,p3v); PIX_SORTV(p5v,p6v); PIX_SORTV(p8v,p9v);
PIX_SORTV(p1v,p2v); PIX_SORTV(p4v,p5v); PIX_SORTV(p7v,p8v);
PIX_SORTV(p2v,p3v); PIX_SORTV(p5v,p6v); PIX_SORTV(p8v,p9v);
p4v = _mm_max_ps(p1v,p4v); p6v = _mm_min_ps(p6v,p9v); PIX_SORTV(p5v,p8v);
p7v = _mm_max_ps(p4v,p7v); p5v = _mm_max_ps(p5v,p2v); p3v = _mm_min_ps(p3v,p6v);
p5v = _mm_min_ps(p5v,p8v); PIX_SORTV(p5v,p3v); p5v = _mm_max_ps(p7v,p5v);
p5v = _mm_min_ps(p3v,p5v);
_mm_storeu_ps(&rix[d][0], p5v);
}
#endif
for (; cc < cc1-1; cc++) {
float temp;
rix[d] = qix[d] + rr*cc1 + cc;
rix[4] = qix[4] + rr*cc1 + cc;
rix[c] = qix[c] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
// Assign 3x3 differential color values
float p1 = rix[4][-w1-1]; float p2 = rix[4][-w1]; float p3 = rix[4][-w1+1];
float p4 = rix[4][ -1]; float p5 = rix[4][ 0]; float p6 = rix[4][ 1];
float p7 = rix[4][ w1-1]; float p8 = rix[4][ w1]; float p9 = rix[4][ w1+1];
float p1 = rix[c][-w1-1]-rix[1][-w1-1]; float p2 = rix[c][-w1]-rix[1][-w1]; float p3 = rix[c][-w1+1]-rix[1][-w1+1];
float p4 = rix[c][ -1]-rix[1][ -1]; float p5 = rix[c][ 0]-rix[1][ 0]; float p6 = rix[c][ 1]-rix[1][ 1];
float p7 = rix[c][ w1-1]-rix[1][ w1-1]; float p8 = rix[c][ w1]-rix[1][ w1]; float p9 = rix[c][ w1+1]-rix[1][ w1+1];
// Sort for median of 9 values
PIX_SORT(p2,p3); PIX_SORT(p5,p6); PIX_SORT(p8,p9);
PIX_SORT(p1,p2); PIX_SORT(p4,p5); PIX_SORT(p7,p8);
@@ -1529,104 +1605,99 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations)
PIX_SORT(p5,p3);
rix[d][0] = p5;
}
}
}
// red/blue at GREEN pixel locations
// red/blue at GREEN pixel locations & red/blue and green at BLUE/RED pixel locations
#ifdef _OPENMP
#pragma omp parallel for firstprivate (rix,qix)
#pragma omp parallel for private (rix)
#endif
for (int rr=0; rr < rr1; rr++)
for (int cc=(FC(rr,1)&1) /*, c=FC(rr,cc+1)*/; cc < cc1; cc+=2) {
rix[0] = qix[0] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
rix[2] = qix[2] + rr*cc1 + cc;
rix[3] = qix[3] + rr*cc1 + cc;
rix[5] = qix[5] + rr*cc1 + cc;
rix[0][0] = rix[1][0] + rix[3][0];
rix[2][0] = rix[1][0] + rix[5][0];
}
// red/blue and green at BLUE/RED pixel locations
#ifdef _OPENMP
#pragma omp parallel for firstprivate (rix,qix)
#endif
for (int rr=0; rr < rr1; rr++)
for (int cc=(FC(rr,0)&1), c=2-FC(rr,cc),d=c+3; cc < cc1; cc+=2) {
rix[0] = qix[0] + rr*cc1 + cc;
rix[1] = qix[1] + rr*cc1 + cc;
rix[2] = qix[2] + rr*cc1 + cc;
rix[3] = qix[3] + rr*cc1 + cc;
rix[5] = qix[5] + rr*cc1 + cc;
rix[c][0] = rix[1][0] + rix[d][0];
rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[5][0]);
for (int rr=0; rr < rr1; rr++) {
rix[0] = qix[0] + rr*cc1;
rix[1] = qix[1] + rr*cc1;
rix[2] = qix[2] + rr*cc1;
rix[3] = qix[3] + rr*cc1;
rix[4] = qix[4] + rr*cc1;
int c0 = FC(rr,0);
int c1 = FC(rr,1);
if(c0 == 1){
c1 = 2 - c1;
int d = c1 + 3 - (c1 == 0 ? 0 : 1);
int cc;
for (cc=0; cc < cc1-1; cc+=2) {
rix[0][0] = rix[1][0] + rix[3][0];
rix[2][0] = rix[1][0] + rix[4][0];
rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++;
rix[c1][0] = rix[1][0] + rix[d][0];
rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]);
rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++;
}
if(cc < cc1) { // remaining pixel, only if width is odd
rix[0][0] = rix[1][0] + rix[3][0];
rix[2][0] = rix[1][0] + rix[4][0];
}
} else {
c0 = 2 - c0;
int d = c0 + 3 - (c0 == 0 ? 0 : 1);
int cc;
for (cc=0; cc < cc1-1; cc+=2) {
rix[c0][0] = rix[1][0] + rix[d][0];
rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]);
rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++;
rix[0][0] = rix[1][0] + rix[3][0];
rix[2][0] = rix[1][0] + rix[4][0];
rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++;
}
if(cc < cc1) { // remaining pixel, only if width is odd
rix[c0][0] = rix[1][0] + rix[d][0];
rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]);
}
}
}
}
if (plistener) plistener->setProgress (0.8);
#ifdef _OPENMP
#pragma omp parallel firstprivate (image,rix,qix)
#endif
{
if(applyGamma)
gamtab = &(Color::igammatab_24_17);
else {
for(int i=0;i<65536;i++)
(*gamtab)[i] = (float)i + 0.5f;
}
array2D<float> (*rgb[3]);
rgb[0] = &red;
rgb[1] = &green;
rgb[2] = &blue;
// copy result back to image matrix
#ifdef _OPENMP
#pragma omp for
#pragma omp parallel for
#endif
for (int row=0; row < height; row++) {
for (int col=0, rr=row+ba; col < width; col++) {
int cc = col+ba;
int c = FC(row,col);
array2D<float> *nonInterpolated;
if(c==0)
nonInterpolated = &red;
else if(c==1)
nonInterpolated = &green;
else
nonInterpolated = &blue;
if (applyGamma) {
for (int ii=0; ii < 3; ii++)
if (ii != c) {
array2D<float> *interpolated;
if(ii==0)
interpolated = &red;
else if(ii==1)
interpolated = &green;
else
interpolated = &blue;
rix[ii] = qix[ii] + rr*cc1 + cc;
float v0 = 65535.f*rix[ii][0];
(*interpolated)[row][col] = Color::igammatab_24_17[v0];
} else {
(*nonInterpolated)[row][col] = image[ii][(((row*width)>>(1-(ii&1)))+col)>>1];
}
}
else
for (int ii=0; ii < 3; ii++)
if (ii != c) {
array2D<float> *interpolated;
if(ii==0)
interpolated = &red;
else if(ii==1)
interpolated = &green;
else
interpolated = &blue;
rix[ii] = qix[ii] + rr*cc1 + cc;
(*interpolated)[row][col] = ((65535.0f*rix[ii][0] + 0.5f));
} else {
(*nonInterpolated)[row][col] = image[ii][(((row*width)>>(1-(ii&1)))+col)>>1];
}
for (int ii=0; ii < 3; ii++)
if (ii != c) {
float *rix = qix[ii] + rr*cc1 + cc;
(*(rgb[ii]))[row][col] = (*gamtab)[65535.f*rix[0]];
} else {
(*(rgb[ii]))[row][col] = CLIP(rawData[row][col]);
}
}
}
}
// End of parallelization 2
if (plistener) plistener->setProgress (1.0);
if(buffer)
free(buffer);
else
for(int i=0;i<6;i++)
for(int i=0;i<5;i++)
free(qix[i]);
free(imageBuffer);
//if(iterations > 4) refinement_lassus(passref);
if(!applyGamma)
delete gamtab;
if(iterations > 4 && iterations <=6) refinement(passref);
else if(iterations > 6) refinement_lassus(passref);
@@ -2451,7 +2522,10 @@ void RawImageSource::nodemosaic(bool bw)
Adapted for Rawtherapee - Jacques Desmis 04/2013
*/
void RawImageSource::refinement(int PassCount)
#ifdef __SSE2__
#define CLIPV(a) LIMV(a,ZEROV,c65535v)
#endif
SSEFUNCTION void RawImageSource::refinement(int PassCount)
{
MyTime t1e,t2e;
t1e.set();
@@ -2485,8 +2559,27 @@ void RawImageSource::refinement(int PassCount)
#ifdef _OPENMP
#pragma omp for
#endif
for (int row=2; row < height-2; row++)
for (int col=2+(FC(row,2) & 1), c=FC(row,col); col < width-2; col+=2) {
for (int row=2; row < height-2; row++) {
int col = 2+(FC(row,2) & 1);
int c = FC(row,col);
#ifdef __SSE2__
__m128 dLv, dRv, dUv, dDv, v0v;
__m128 onev = _mm_set1_ps(1.f);
__m128 zd5v = _mm_set1_ps(0.5f);
__m128 c65535v = _mm_set1_ps(65535.f);
for (; col < width-8; col+=8) {
int indx = row*width+col;
pix[c] = (float*)(*rgb[c]) + indx;
pix[1] = (float*)(*rgb[1]) + indx;
dLv = onev/(onev+vabsf(LC2VFU(pix[c][ -2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1])));
dRv = onev/(onev+vabsf(LC2VFU(pix[c][ 2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1])));
dUv = onev/(onev+vabsf(LC2VFU(pix[c][-w2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1])));
dDv = onev/(onev+vabsf(LC2VFU(pix[c][ w2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1])));
v0v = CLIPV(LC2VFU(pix[c][0]) + zd5v +((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv +(LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv));
STC2VFU(pix[1][0],v0v);
}
#endif
for (; col < width-2; col+=2) {
int indx = row*width+col;
pix[c] = (float*)(*rgb[c]) + indx;
pix[1] = (float*)(*rgb[1]) + indx;
@@ -2497,13 +2590,34 @@ void RawImageSource::refinement(int PassCount)
float v0 = (pix[c][0] + 0.5f +((pix[1][ -1]-pix[c][ -1])*dL +(pix[1][ 1]-pix[c][ 1])*dR +(pix[1][-w1]-pix[c][-w1])*dU + (pix[1][ w1]-pix[c][ w1])*dD ) / (dL+dR+dU+dD));
pix[1][0] = CLIP(v0);
}
}
/* Reinforce interpolated red/blue pixels on GREEN pixel locations */
#ifdef _OPENMP
#pragma omp for
#endif
for (int row=2; row < height-2; row++)
for (int col=2+(FC(row,3) & 1), c=FC(row,col+1); col < width-2; col+=2) {
for (int row=2; row < height-2; row++) {
int col = 2+(FC(row,3) & 1);
int c = FC(row,col+1);
#ifdef __SSE2__
__m128 dLv, dRv, dUv, dDv, v0v;
__m128 onev = _mm_set1_ps(1.f);
__m128 zd5v = _mm_set1_ps(0.5f);
__m128 c65535v = _mm_set1_ps(65535.f);
for (; col < width-8; col+=8) {
int indx = row*width+col;
pix[1] = (float*)(*rgb[1]) + indx;
for (int i=0; i < 2; c=2-c, i++) {
pix[c] = (float*)(*rgb[c]) + indx;
dLv = onev/(onev+vabsf(LC2VFU(pix[1][ -2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][ 1])-LC2VFU(pix[c][ -1])));
dRv = onev/(onev+vabsf(LC2VFU(pix[1][ 2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][ 1])-LC2VFU(pix[c][ -1])));
dUv = onev/(onev+vabsf(LC2VFU(pix[1][-w2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][w1])-LC2VFU(pix[c][-w1])));
dDv = onev/(onev+vabsf(LC2VFU(pix[1][ w2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][w1])-LC2VFU(pix[c][-w1])));
v0v = CLIPV(LC2VFU(pix[1][0]) + zd5v -((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv + (LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv));
STC2VFU(pix[c][0],v0v);
}
}
#endif
for (; col < width-2; col+=2) {
int indx = row*width+col;
pix[1] = (float*)(*rgb[1]) + indx;
for (int i=0; i < 2; c=2-c, i++) {
@@ -2516,13 +2630,34 @@ void RawImageSource::refinement(int PassCount)
pix[c][0] = CLIP(v0);
}
}
}
/* Reinforce integrated red/blue pixels on BLUE/RED pixel locations */
#ifdef _OPENMP
#pragma omp for
#endif
for (int row=2; row < height-2; row++)
for (int col=2+(FC(row,2) & 1), c=2-FC(row,col); col < width-2; col+=2) {
for (int row=2; row < height-2; row++) {
int col = 2+(FC(row,2) & 1);
int c = 2-FC(row,col);
#ifdef __SSE2__
__m128 dLv, dRv, dUv, dDv, v0v;
__m128 onev = _mm_set1_ps(1.f);
__m128 zd5v = _mm_set1_ps(0.5f);
__m128 c65535v = _mm_set1_ps(65535.f);
for (; col < width-8; col+=8) {
int indx = row*width+col;
pix[0] = (float*)(*rgb[0]) + indx;
pix[1] = (float*)(*rgb[1]) + indx;
pix[2] = (float*)(*rgb[2]) + indx;
int d = 2 - c;
dLv = onev/(onev+vabsf(LC2VFU(pix[d][ -2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1])));
dRv = onev/(onev+vabsf(LC2VFU(pix[d][ 2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1])));
dUv = onev/(onev+vabsf(LC2VFU(pix[d][-w2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1])));
dDv = onev/(onev+vabsf(LC2VFU(pix[d][ w2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1])));
v0v = CLIPV(LC2VFU(pix[1][0]) + zd5v -((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv +(LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv));
STC2VFU(pix[c][0],v0v);
}
#endif
for (; col < width-2; col+=2) {
int indx = row*width+col;
pix[0] = (float*)(*rgb[0]) + indx;
pix[1] = (float*)(*rgb[1]) + indx;
@@ -2535,11 +2670,15 @@ void RawImageSource::refinement(int PassCount)
float v0 = (pix[1][0] + 0.5f -((pix[1][ -1]-pix[c][ -1])*dL +(pix[1][ 1]-pix[c][ 1])*dR +(pix[1][-w1]-pix[c][-w1])*dU + (pix[1][ w1]-pix[c][ w1])*dD ) / (dL+dR+dU+dD));
pix[c][0] = CLIP(v0);
}
}
} // end parallel
}
t2e.set();
if (settings->verbose) printf("Refinement Lee %d usec\n", t2e.etime(t1e));
}
#ifdef __SSE2__
#undef CLIPV
#endif
// Refinement based on EECI demozaicing algorithm by L. Chang and Y.P. Tan