diff --git a/rtengine/color.cc b/rtengine/color.cc index c0e6bdb0f..7612c471a 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -156,7 +156,7 @@ namespace rtengine { igammatab_26_11(65536,0); gammatab_26_11(65536,0); igammatab_24_17(65536,0); - gammatab_24_17a(65536,0); + gammatab_24_17a(65536,LUT_CLIP_ABOVE | LUT_CLIP_BELOW); gammatab_13_2(65536,0); for (int i=0; i<65536; i++) diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index 17a0b56b1..f218f700a 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -58,6 +58,7 @@ namespace rtengine { #define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} } +#define PIX_SORTV(av,bv) tempv = _mm_min_ps(av,bv); bv = _mm_max_ps(av,bv); av = tempv; extern const Settings* settings; //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1185,13 +1186,10 @@ void RawImageSource::jdl_interpolate_omp() // from "Lassus" // IEEE Trans. on Image Processing, vol. 14, pp. 2167-2178, // Dec. 2005. // Adapted to RT by Jacques Desmis 3/2013 +// Improved speed and reduced memory consumption by Ingo Weyrich 2/2015 //TODO Tiles to reduce memory consumption -void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) +SSEFUNCTION void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) { - int c, ii; - float h0, h1, h2, h3, h4, hs; - - const int width=winw, height=winh; const int ba = 10; const int rr1 = height + 2*ba; @@ -1200,7 +1198,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) const int w2 = 2*w1; const int w3 = 3*w1; const int w4 = 4*w1; - int iter; + float h0, h1, h2, h3, h4, hs; h0 = 1.0f; h1 = exp( -1.0f/8.0f); h2 = exp( -4.0f/8.0f); @@ -1213,77 +1211,73 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) h3 /= hs; h4 /= hs; int passref; - if(iterations <=4) {iter = iterations-1;passref=0;} - else if (iterations <=6){iter=3;passref=iterations-4;} - else if (iterations <=8){iter=3;passref=iterations-6;} + int iter; + if(iterations <=4) { + iter = iterations-1; + passref=0; + } else if (iterations <=6) { + iter=3; + passref=iterations-4; + } else if (iterations <=8) { + iter=3; + passref=iterations-6; + } bool applyGamma=true; - if(iterations==0) {applyGamma=false;iter=0;} else applyGamma=true; + if(iterations==0) { + applyGamma=false; + iter=0; + } else + applyGamma=true; + + LUTf *gamtab; + if(applyGamma) + gamtab = &(Color::gammatab_24_17a); + else { + gamtab = new LUTf(65536, LUT_CLIP_ABOVE | LUT_CLIP_BELOW); + for(int i=0;i<65536;i++) + (*gamtab)[i] = (float)i / 65535.f; + } if (plistener) { plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::lmmse])); plistener->setProgress (0.0); } - float *imageBuffer = (float*)malloc (((((width+1)/2)*height) + (((height+1)/2)*(width+1))) * sizeof *imageBuffer); - float *image[3]; - image[0] = imageBuffer; - image[1] = imageBuffer + ((height+1)/2)*((width+1)/2); - image[2] = imageBuffer + ((height+1)/2)*((width+1)/2) + (((width+1)/2)*height); - - - unsigned int a=0; - - float *rix[6]; - float *qix[6]; - float *buffer = (float *)calloc(rr1*cc1*6*sizeof(float),1); + float *rix[5]; + float *qix[5]; + float *buffer = (float *)calloc(rr1*cc1*5*sizeof(float),1); if(buffer == NULL) { // allocation of big block of memory failed, try to get 6 smaller ones - printf("lmmse_interpolate_omp: allocation of big memory block failed, try to get 6 smaller ones now...\n"); - for(int i=0;i<6;i++) + printf("lmmse_interpolate_omp: allocation of big memory block failed, try to get 5 smaller ones now...\n"); + for(int i=0;i<5;i++) qix[i] = (float *)calloc(rr1*cc1*sizeof(float),1); } else { qix[0] = buffer; - for(int i=1;i<6;i++) + for(int i=1;i<5;i++) qix[i] = qix[i-1] + rr1*cc1; } #ifdef _OPENMP -#pragma omp parallel for -#endif - for (int row=0; row>(1-(c&1)))+col)>>1] = CLIP(rawData[row][col]); - } - -{ - if (plistener) plistener->setProgress (0.1); -} - -#ifdef _OPENMP -#pragma omp parallel firstprivate (image,rix,qix,h0,h1,h2,h3,h4) +#pragma omp parallel private(rix) #endif { #ifdef _OPENMP #pragma omp for #endif - for (int rrr=ba; rrr < rr1-ba; rrr++) + for (int rrr=ba; rrr < rr1-ba; rrr++) { for (int ccc=ba, row=rrr-ba; ccc < cc1-ba; ccc++) { int col = ccc - ba; - rix[4] = qix[4] + rrr*cc1 + ccc; - int c = fc(row,col); - if (applyGamma) - rix[4][0] = Color::gammatab_24_17a[image[c][(((row*width)>>(1-(c&1)))+col)>>1]]; - else - rix[4][0] = (float)image[c][(((row*width)>>(1-(c&1)))+col)>>1]/65535.0f; + float *rix = qix[4] + rrr*cc1 + ccc; + rix[0] = (*gamtab)[rawData[row][col]]; } + } + #ifdef _OPENMP #pragma omp single #endif { - if (plistener) plistener->setProgress (0.2); + if (plistener) plistener->setProgress (0.1); } - // G-R(B) #ifdef _OPENMP #pragma omp for schedule(dynamic,16) @@ -1327,7 +1321,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) #pragma omp single #endif { - if (plistener) plistener->setProgress (0.25); + if (plistener) plistener->setProgress (0.2); } @@ -1351,69 +1345,136 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) if (plistener) plistener->setProgress (0.3); } - // interpolate G-R(B) at R(B) #ifdef _OPENMP -#pragma omp for +#pragma omp for #endif - for (int rr=4; rr < rr1-4; rr++) - for (int cc=4+(FC(rr,4)&1); cc < cc1-4; cc+=2) { + for (int rr=4; rr < rr1-4; rr++) { + int cc=4+(FC(rr,4)&1); +#ifdef __SSE2__ + __m128 p1v, p2v, p3v, p4v, p5v, p6v, p7v, p8v, p9v, muv, vxv, vnv, xhv, vhv, xvv, vvv; + __m128 epsv = _mm_set1_ps(1e-7); + __m128 ninev = _mm_set1_ps(9.f); + for (; cc < cc1-10; cc+=8) { rix[0] = qix[0] + rr*cc1 + cc; rix[1] = qix[1] + rr*cc1 + cc; rix[2] = qix[2] + rr*cc1 + cc; rix[3] = qix[3] + rr*cc1 + cc; rix[4] = qix[4] + rr*cc1 + cc; // horizontal - float mu = (rix[2][-4] + rix[2][-3] + rix[2][-2] + rix[2][-1] + rix[2][0]+ rix[2][ 1] + rix[2][ 2] + rix[2][ 3] + rix[2][ 4]) / 9.0f; - float p1 = rix[2][-4] - mu; - float p2 = rix[2][-3] - mu; - float p3 = rix[2][-2] - mu; - float p4 = rix[2][-1] - mu; - float p5 = rix[2][ 0] - mu; - float p6 = rix[2][ 1] - mu; - float p7 = rix[2][ 2] - mu; - float p8 = rix[2][ 3] - mu; - float p9 = rix[2][ 4] - mu; - float vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9; - p1 = rix[0][-4] - rix[2][-4]; - p2 = rix[0][-3] - rix[2][-3]; - p3 = rix[0][-2] - rix[2][-2]; - p4 = rix[0][-1] - rix[2][-1]; - p5 = rix[0][ 0] - rix[2][ 0]; - p6 = rix[0][ 1] - rix[2][ 1]; - p7 = rix[0][ 2] - rix[2][ 2]; - p8 = rix[0][ 3] - rix[2][ 3]; - p9 = rix[0][ 4] - rix[2][ 4]; - float vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9; + p1v = LC2VFU(rix[2][-4]); + p2v = LC2VFU(rix[2][-3]); + p3v = LC2VFU(rix[2][-2]); + p4v = LC2VFU(rix[2][-1]); + p5v = LC2VFU(rix[2][ 0]); + p6v = LC2VFU(rix[2][ 1]); + p7v = LC2VFU(rix[2][ 2]); + p8v = LC2VFU(rix[2][ 3]); + p9v = LC2VFU(rix[2][ 4]); + muv = (p1v + p2v + p3v + p4v + p5v + p6v + p7v + p8v + p9v) / ninev; + vxv = epsv+SQRV(p1v-muv)+SQRV(p2v-muv)+SQRV(p3v-muv)+SQRV(p4v-muv)+SQRV(p5v-muv)+SQRV(p6v-muv)+SQRV(p7v-muv)+SQRV(p8v-muv)+SQRV(p9v-muv); + p1v -= LC2VFU(rix[0][-4]); + p2v -= LC2VFU(rix[0][-3]); + p3v -= LC2VFU(rix[0][-2]); + p4v -= LC2VFU(rix[0][-1]); + p5v -= LC2VFU(rix[0][ 0]); + p6v -= LC2VFU(rix[0][ 1]); + p7v -= LC2VFU(rix[0][ 2]); + p8v -= LC2VFU(rix[0][ 3]); + p9v -= LC2VFU(rix[0][ 4]); + vnv = epsv+SQRV(p1v)+SQRV(p2v)+SQRV(p3v)+SQRV(p4v)+SQRV(p5v)+SQRV(p6v)+SQRV(p7v)+SQRV(p8v)+SQRV(p9v); + xhv = (LC2VFU(rix[0][0])*vxv + LC2VFU(rix[2][0])*vnv)/(vxv + vnv); + vhv = vxv*vnv/(vxv + vnv); + + // vertical + p1v = LC2VFU(rix[3][-w4]); + p2v = LC2VFU(rix[3][-w3]); + p3v = LC2VFU(rix[3][-w2]); + p4v = LC2VFU(rix[3][-w1]); + p5v = LC2VFU(rix[3][ 0]); + p6v = LC2VFU(rix[3][ w1]); + p7v = LC2VFU(rix[3][ w2]); + p8v = LC2VFU(rix[3][ w3]); + p9v = LC2VFU(rix[3][ w4]); + muv = (p1v + p2v + p3v + p4v + p5v + p6v + p7v + p8v + p9v) / ninev; + vxv = epsv+SQRV(p1v-muv)+SQRV(p2v-muv)+SQRV(p3v-muv)+SQRV(p4v-muv)+SQRV(p5v-muv)+SQRV(p6v-muv)+SQRV(p7v-muv)+SQRV(p8v-muv)+SQRV(p9v-muv); + p1v -= LC2VFU(rix[1][-w4]); + p2v -= LC2VFU(rix[1][-w3]); + p3v -= LC2VFU(rix[1][-w2]); + p4v -= LC2VFU(rix[1][-w1]); + p5v -= LC2VFU(rix[1][ 0]); + p6v -= LC2VFU(rix[1][ w1]); + p7v -= LC2VFU(rix[1][ w2]); + p8v -= LC2VFU(rix[1][ w3]); + p9v -= LC2VFU(rix[1][ w4]); + vnv = epsv+SQRV(p1v)+SQRV(p2v)+SQRV(p3v)+SQRV(p4v)+SQRV(p5v)+SQRV(p6v)+SQRV(p7v)+SQRV(p8v)+SQRV(p9v); + xvv = (LC2VFU(rix[1][0])*vxv + LC2VFU(rix[3][0])*vnv)/(vxv + vnv); + vvv = vxv*vnv/(vxv + vnv); + // interpolated G-R(B) + muv = (xhv*vvv + xvv*vhv)/(vhv + vvv); + STC2VFU(rix[4][0], muv); + } +#endif + for (; cc < cc1-4; cc+=2) { + rix[0] = qix[0] + rr*cc1 + cc; + rix[1] = qix[1] + rr*cc1 + cc; + rix[2] = qix[2] + rr*cc1 + cc; + rix[3] = qix[3] + rr*cc1 + cc; + rix[4] = qix[4] + rr*cc1 + cc; + // horizontal + float p1 = rix[2][-4]; + float p2 = rix[2][-3]; + float p3 = rix[2][-2]; + float p4 = rix[2][-1]; + float p5 = rix[2][ 0]; + float p6 = rix[2][ 1]; + float p7 = rix[2][ 2]; + float p8 = rix[2][ 3]; + float p9 = rix[2][ 4]; + float mu = (p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9) / 9.f; + float vx = 1e-7+SQR(p1-mu)+SQR(p2-mu)+SQR(p3-mu)+SQR(p4-mu)+SQR(p5-mu)+SQR(p6-mu)+SQR(p7-mu)+SQR(p8-mu)+SQR(p9-mu); + p1 -= rix[0][-4]; + p2 -= rix[0][-3]; + p3 -= rix[0][-2]; + p4 -= rix[0][-1]; + p5 -= rix[0][ 0]; + p6 -= rix[0][ 1]; + p7 -= rix[0][ 2]; + p8 -= rix[0][ 3]; + p9 -= rix[0][ 4]; + float vn = 1e-7+SQR(p1)+SQR(p2)+SQR(p3)+SQR(p4)+SQR(p5)+SQR(p6)+SQR(p7)+SQR(p8)+SQR(p9); float xh = (rix[0][0]*vx + rix[2][0]*vn)/(vx + vn); float vh = vx*vn/(vx + vn); + // vertical - mu = (rix[3][-w4] + rix[3][-w3] + rix[3][-w2] + rix[3][-w1] + rix[3][0]+rix[3][ w1] + rix[3][ w2] + rix[3][ w3] + rix[3][ w4]) / 9.0f; - p1 = rix[3][-w4] - mu; - p2 = rix[3][-w3] - mu; - p3 = rix[3][-w2] - mu; - p4 = rix[3][-w1] - mu; - p5 = rix[3][ 0] - mu; - p6 = rix[3][ w1] - mu; - p7 = rix[3][ w2] - mu; - p8 = rix[3][ w3] - mu; - p9 = rix[3][ w4] - mu; - vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9; - p1 = rix[1][-w4] - rix[3][-w4]; - p2 = rix[1][-w3] - rix[3][-w3]; - p3 = rix[1][-w2] - rix[3][-w2]; - p4 = rix[1][-w1] - rix[3][-w1]; - p5 = rix[1][ 0] - rix[3][ 0]; - p6 = rix[1][ w1] - rix[3][ w1]; - p7 = rix[1][ w2] - rix[3][ w2]; - p8 = rix[1][ w3] - rix[3][ w3]; - p9 = rix[1][ w4] - rix[3][ w4]; - vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9; + p1 = rix[3][-w4]; + p2 = rix[3][-w3]; + p3 = rix[3][-w2]; + p4 = rix[3][-w1]; + p5 = rix[3][ 0]; + p6 = rix[3][ w1]; + p7 = rix[3][ w2]; + p8 = rix[3][ w3]; + p9 = rix[3][ w4]; + mu = (p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9) / 9.f; + vx = 1e-7+SQR(p1-mu)+SQR(p2-mu)+SQR(p3-mu)+SQR(p4-mu)+SQR(p5-mu)+SQR(p6-mu)+SQR(p7-mu)+SQR(p8-mu)+SQR(p9-mu); + p1 -= rix[1][-w4]; + p2 -= rix[1][-w3]; + p3 -= rix[1][-w2]; + p4 -= rix[1][-w1]; + p5 -= rix[1][ 0]; + p6 -= rix[1][ w1]; + p7 -= rix[1][ w2]; + p8 -= rix[1][ w3]; + p9 -= rix[1][ w4]; + vn = 1e-7+SQR(p1)+SQR(p2)+SQR(p3)+SQR(p4)+SQR(p5)+SQR(p6)+SQR(p7)+SQR(p8)+SQR(p9); float xv = (rix[1][0]*vx + rix[3][0]*vn)/(vx + vn); float vv = vx*vn/(vx + vn); // interpolated G-R(B) rix[4][0] = (xh*vv + xv*vh)/(vh + vv); } + } + #ifdef _OPENMP #pragma omp single #endif @@ -1421,7 +1482,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) if (plistener) plistener->setProgress (0.4); } - // copy CFA values #ifdef _OPENMP #pragma omp for @@ -1432,11 +1492,8 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) int c = FC(rr,cc); rix[c] = qix[c] + rr*cc1 + cc; if ((row >= 0) & (row < height) & (col >= 0) & (col < width)) { - if (applyGamma) - rix[c][0] = Color::gammatab_24_17a[image[c][(((row*width)>>(1-(c&1)))+col)>>1]]; - else - rix[c][0] = (float)image[c][(((row*width)>>(1-(c&1)))+col)>>1]/65535.0f; - } + rix[c][0] = (*gamtab)[rawData[row][col]]; + } else rix[c][0] = 0.f; if (c != 1) { @@ -1445,6 +1502,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) rix[1][0] = rix[c][0] + rix[4][0]; } } + #ifdef _OPENMP #pragma omp single #endif @@ -1452,7 +1510,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) if (plistener) plistener->setProgress (0.5); } - // bilinear interpolation for R/B // interpolate R/B at G location #ifdef _OPENMP @@ -1468,6 +1525,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) rix[c][0] = rix[1][0]+ xdiv2f(rix[c][-w1] - rix[1][-w1] + rix[c][w1] - rix[1][w1]); c = 2 - c; } + #ifdef _OPENMP #pragma omp single #endif @@ -1475,7 +1533,6 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) if (plistener) plistener->setProgress (0.6); } - // interpolate R/B at B/R location #ifdef _OPENMP #pragma omp for @@ -1486,6 +1543,7 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) rix[1] = qix[1] + rr*cc1 + cc; rix[c][0] = rix[1][0]+ x0250(rix[c][-w1] - rix[1][-w1] + rix[c][ -1] - rix[1][ -1]+ rix[c][ 1] - rix[1][ 1] + rix[c][ w1] - rix[1][ w1]); } + #ifdef _OPENMP #pragma omp single #endif @@ -1499,26 +1557,44 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) for (int pass=0; pass < iter; pass++) { for (int c=0; c < 3; c+=2) { // Compute median(R-G) and median(B-G) - int d = c + 3; - -#ifdef _OPENMP -#pragma omp parallel for firstprivate (qix) -#endif - for (int ii=0; ii < rr1*cc1; ii++) - qix[4][ii] = qix[c][ii] - qix[1][ii]; + int d = c + 3 - (c == 0 ? 0 : 1); // Apply 3x3 median filter + #ifdef _OPENMP -#pragma omp parallel for firstprivate (rix,qix) +#pragma omp parallel for private(rix) #endif - for (int rr=1; rr < rr1-1; rr++) - for (int cc=1; cc < cc1-1; cc++) { + for (int rr=1; rr < rr1-1; rr++) { + int cc=1; +#ifdef __SSE2__ + __m128 p1v,p2v,p3v,p4v,p5v,p6v,p7v,p8v,p9v,tempv; + for (; cc < cc1-4; cc+=4) { + rix[d] = qix[d] + rr*cc1 + cc; + rix[c] = qix[c] + rr*cc1 + cc; + rix[1] = qix[1] + rr*cc1 + cc; + // Assign 3x3 differential color values + p1v = LVFU(rix[c][-w1-1])-LVFU(rix[1][-w1-1]); p2v = LVFU(rix[c][-w1])-LVFU(rix[1][-w1]); p3v = LVFU(rix[c][-w1+1])-LVFU(rix[1][-w1+1]); + p4v = LVFU(rix[c][ -1])-LVFU(rix[1][ -1]); p5v = LVFU(rix[c][ 0])-LVFU(rix[1][ 0]); p6v = LVFU(rix[c][ 1])-LVFU(rix[1][ 1]); + p7v = LVFU(rix[c][ w1-1])-LVFU(rix[1][ w1-1]); p8v = LVFU(rix[c][ w1])-LVFU(rix[1][ w1]); p9v = LVFU(rix[c][ w1+1])-LVFU(rix[1][ w1+1]); + // Sort for median of 9 values + PIX_SORTV(p2v,p3v); PIX_SORTV(p5v,p6v); PIX_SORTV(p8v,p9v); + PIX_SORTV(p1v,p2v); PIX_SORTV(p4v,p5v); PIX_SORTV(p7v,p8v); + PIX_SORTV(p2v,p3v); PIX_SORTV(p5v,p6v); PIX_SORTV(p8v,p9v); + p4v = _mm_max_ps(p1v,p4v); p6v = _mm_min_ps(p6v,p9v); PIX_SORTV(p5v,p8v); + p7v = _mm_max_ps(p4v,p7v); p5v = _mm_max_ps(p5v,p2v); p3v = _mm_min_ps(p3v,p6v); + p5v = _mm_min_ps(p5v,p8v); PIX_SORTV(p5v,p3v); p5v = _mm_max_ps(p7v,p5v); + p5v = _mm_min_ps(p3v,p5v); + _mm_storeu_ps(&rix[d][0], p5v); + } +#endif + for (; cc < cc1-1; cc++) { float temp; rix[d] = qix[d] + rr*cc1 + cc; - rix[4] = qix[4] + rr*cc1 + cc; + rix[c] = qix[c] + rr*cc1 + cc; + rix[1] = qix[1] + rr*cc1 + cc; // Assign 3x3 differential color values - float p1 = rix[4][-w1-1]; float p2 = rix[4][-w1]; float p3 = rix[4][-w1+1]; - float p4 = rix[4][ -1]; float p5 = rix[4][ 0]; float p6 = rix[4][ 1]; - float p7 = rix[4][ w1-1]; float p8 = rix[4][ w1]; float p9 = rix[4][ w1+1]; + float p1 = rix[c][-w1-1]-rix[1][-w1-1]; float p2 = rix[c][-w1]-rix[1][-w1]; float p3 = rix[c][-w1+1]-rix[1][-w1+1]; + float p4 = rix[c][ -1]-rix[1][ -1]; float p5 = rix[c][ 0]-rix[1][ 0]; float p6 = rix[c][ 1]-rix[1][ 1]; + float p7 = rix[c][ w1-1]-rix[1][ w1-1]; float p8 = rix[c][ w1]-rix[1][ w1]; float p9 = rix[c][ w1+1]-rix[1][ w1+1]; // Sort for median of 9 values PIX_SORT(p2,p3); PIX_SORT(p5,p6); PIX_SORT(p8,p9); PIX_SORT(p1,p2); PIX_SORT(p4,p5); PIX_SORT(p7,p8); @@ -1529,104 +1605,99 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) PIX_SORT(p5,p3); rix[d][0] = p5; } + } } - // red/blue at GREEN pixel locations + + // red/blue at GREEN pixel locations & red/blue and green at BLUE/RED pixel locations #ifdef _OPENMP -#pragma omp parallel for firstprivate (rix,qix) +#pragma omp parallel for private (rix) #endif - for (int rr=0; rr < rr1; rr++) - for (int cc=(FC(rr,1)&1) /*, c=FC(rr,cc+1)*/; cc < cc1; cc+=2) { - rix[0] = qix[0] + rr*cc1 + cc; - rix[1] = qix[1] + rr*cc1 + cc; - rix[2] = qix[2] + rr*cc1 + cc; - rix[3] = qix[3] + rr*cc1 + cc; - rix[5] = qix[5] + rr*cc1 + cc; - rix[0][0] = rix[1][0] + rix[3][0]; - rix[2][0] = rix[1][0] + rix[5][0]; - } - // red/blue and green at BLUE/RED pixel locations -#ifdef _OPENMP -#pragma omp parallel for firstprivate (rix,qix) -#endif - for (int rr=0; rr < rr1; rr++) - for (int cc=(FC(rr,0)&1), c=2-FC(rr,cc),d=c+3; cc < cc1; cc+=2) { - rix[0] = qix[0] + rr*cc1 + cc; - rix[1] = qix[1] + rr*cc1 + cc; - rix[2] = qix[2] + rr*cc1 + cc; - rix[3] = qix[3] + rr*cc1 + cc; - rix[5] = qix[5] + rr*cc1 + cc; - rix[c][0] = rix[1][0] + rix[d][0]; - rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[5][0]); + for (int rr=0; rr < rr1; rr++) { + rix[0] = qix[0] + rr*cc1; + rix[1] = qix[1] + rr*cc1; + rix[2] = qix[2] + rr*cc1; + rix[3] = qix[3] + rr*cc1; + rix[4] = qix[4] + rr*cc1; + int c0 = FC(rr,0); + int c1 = FC(rr,1); + if(c0 == 1){ + c1 = 2 - c1; + int d = c1 + 3 - (c1 == 0 ? 0 : 1); + int cc; + for (cc=0; cc < cc1-1; cc+=2) { + rix[0][0] = rix[1][0] + rix[3][0]; + rix[2][0] = rix[1][0] + rix[4][0]; + rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++; + rix[c1][0] = rix[1][0] + rix[d][0]; + rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]); + rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++; + } + if(cc < cc1) { // remaining pixel, only if width is odd + rix[0][0] = rix[1][0] + rix[3][0]; + rix[2][0] = rix[1][0] + rix[4][0]; + } + } else { + c0 = 2 - c0; + int d = c0 + 3 - (c0 == 0 ? 0 : 1); + int cc; + for (cc=0; cc < cc1-1; cc+=2) { + rix[c0][0] = rix[1][0] + rix[d][0]; + rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]); + rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++; + rix[0][0] = rix[1][0] + rix[3][0]; + rix[2][0] = rix[1][0] + rix[4][0]; + rix[0]++; rix[1]++; rix[2]++; rix[3]++; rix[4]++; + } + if(cc < cc1) { // remaining pixel, only if width is odd + rix[c0][0] = rix[1][0] + rix[d][0]; + rix[1][0] = 0.5f * (rix[0][0] - rix[3][0] + rix[2][0] - rix[4][0]); + } } + } } if (plistener) plistener->setProgress (0.8); -#ifdef _OPENMP -#pragma omp parallel firstprivate (image,rix,qix) -#endif -{ + + if(applyGamma) + gamtab = &(Color::igammatab_24_17); + else { + for(int i=0;i<65536;i++) + (*gamtab)[i] = (float)i + 0.5f; + } + + array2D (*rgb[3]); + rgb[0] = &red; + rgb[1] = &green; + rgb[2] = &blue; + // copy result back to image matrix #ifdef _OPENMP -#pragma omp for +#pragma omp parallel for #endif for (int row=0; row < height; row++) { for (int col=0, rr=row+ba; col < width; col++) { int cc = col+ba; int c = FC(row,col); - array2D *nonInterpolated; - if(c==0) - nonInterpolated = &red; - else if(c==1) - nonInterpolated = &green; - else - nonInterpolated = &blue; - - if (applyGamma) { - for (int ii=0; ii < 3; ii++) - if (ii != c) { - array2D *interpolated; - if(ii==0) - interpolated = &red; - else if(ii==1) - interpolated = &green; - else - interpolated = &blue; - rix[ii] = qix[ii] + rr*cc1 + cc; - float v0 = 65535.f*rix[ii][0]; - (*interpolated)[row][col] = Color::igammatab_24_17[v0]; - } else { - (*nonInterpolated)[row][col] = image[ii][(((row*width)>>(1-(ii&1)))+col)>>1]; - } - } - else - for (int ii=0; ii < 3; ii++) - if (ii != c) { - array2D *interpolated; - if(ii==0) - interpolated = &red; - else if(ii==1) - interpolated = &green; - else - interpolated = &blue; - rix[ii] = qix[ii] + rr*cc1 + cc; - (*interpolated)[row][col] = ((65535.0f*rix[ii][0] + 0.5f)); - } else { - (*nonInterpolated)[row][col] = image[ii][(((row*width)>>(1-(ii&1)))+col)>>1]; - } + for (int ii=0; ii < 3; ii++) + if (ii != c) { + float *rix = qix[ii] + rr*cc1 + cc; + (*(rgb[ii]))[row][col] = (*gamtab)[65535.f*rix[0]]; + } else { + (*(rgb[ii]))[row][col] = CLIP(rawData[row][col]); + } } } -} -// End of parallelization 2 if (plistener) plistener->setProgress (1.0); if(buffer) free(buffer); else - for(int i=0;i<6;i++) + for(int i=0;i<5;i++) free(qix[i]); - free(imageBuffer); - //if(iterations > 4) refinement_lassus(passref); + if(!applyGamma) + delete gamtab; + if(iterations > 4 && iterations <=6) refinement(passref); else if(iterations > 6) refinement_lassus(passref); @@ -2451,7 +2522,10 @@ void RawImageSource::nodemosaic(bool bw) Adapted for Rawtherapee - Jacques Desmis 04/2013 */ -void RawImageSource::refinement(int PassCount) +#ifdef __SSE2__ +#define CLIPV(a) LIMV(a,ZEROV,c65535v) +#endif +SSEFUNCTION void RawImageSource::refinement(int PassCount) { MyTime t1e,t2e; t1e.set(); @@ -2485,8 +2559,27 @@ void RawImageSource::refinement(int PassCount) #ifdef _OPENMP #pragma omp for #endif - for (int row=2; row < height-2; row++) - for (int col=2+(FC(row,2) & 1), c=FC(row,col); col < width-2; col+=2) { + for (int row=2; row < height-2; row++) { + int col = 2+(FC(row,2) & 1); + int c = FC(row,col); +#ifdef __SSE2__ + __m128 dLv, dRv, dUv, dDv, v0v; + __m128 onev = _mm_set1_ps(1.f); + __m128 zd5v = _mm_set1_ps(0.5f); + __m128 c65535v = _mm_set1_ps(65535.f); + for (; col < width-8; col+=8) { + int indx = row*width+col; + pix[c] = (float*)(*rgb[c]) + indx; + pix[1] = (float*)(*rgb[1]) + indx; + dLv = onev/(onev+vabsf(LC2VFU(pix[c][ -2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1]))); + dRv = onev/(onev+vabsf(LC2VFU(pix[c][ 2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1]))); + dUv = onev/(onev+vabsf(LC2VFU(pix[c][-w2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1]))); + dDv = onev/(onev+vabsf(LC2VFU(pix[c][ w2])-LC2VFU(pix[c][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1]))); + v0v = CLIPV(LC2VFU(pix[c][0]) + zd5v +((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv +(LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv)); + STC2VFU(pix[1][0],v0v); + } +#endif + for (; col < width-2; col+=2) { int indx = row*width+col; pix[c] = (float*)(*rgb[c]) + indx; pix[1] = (float*)(*rgb[1]) + indx; @@ -2497,13 +2590,34 @@ void RawImageSource::refinement(int PassCount) float v0 = (pix[c][0] + 0.5f +((pix[1][ -1]-pix[c][ -1])*dL +(pix[1][ 1]-pix[c][ 1])*dR +(pix[1][-w1]-pix[c][-w1])*dU + (pix[1][ w1]-pix[c][ w1])*dD ) / (dL+dR+dU+dD)); pix[1][0] = CLIP(v0); } - + } /* Reinforce interpolated red/blue pixels on GREEN pixel locations */ #ifdef _OPENMP #pragma omp for #endif - for (int row=2; row < height-2; row++) - for (int col=2+(FC(row,3) & 1), c=FC(row,col+1); col < width-2; col+=2) { + for (int row=2; row < height-2; row++) { + int col = 2+(FC(row,3) & 1); + int c = FC(row,col+1); +#ifdef __SSE2__ + __m128 dLv, dRv, dUv, dDv, v0v; + __m128 onev = _mm_set1_ps(1.f); + __m128 zd5v = _mm_set1_ps(0.5f); + __m128 c65535v = _mm_set1_ps(65535.f); + for (; col < width-8; col+=8) { + int indx = row*width+col; + pix[1] = (float*)(*rgb[1]) + indx; + for (int i=0; i < 2; c=2-c, i++) { + pix[c] = (float*)(*rgb[c]) + indx; + dLv = onev/(onev+vabsf(LC2VFU(pix[1][ -2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][ 1])-LC2VFU(pix[c][ -1]))); + dRv = onev/(onev+vabsf(LC2VFU(pix[1][ 2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][ 1])-LC2VFU(pix[c][ -1]))); + dUv = onev/(onev+vabsf(LC2VFU(pix[1][-w2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][w1])-LC2VFU(pix[c][-w1]))); + dDv = onev/(onev+vabsf(LC2VFU(pix[1][ w2])-LC2VFU(pix[1][0]))+vabsf(LC2VFU(pix[c][w1])-LC2VFU(pix[c][-w1]))); + v0v = CLIPV(LC2VFU(pix[1][0]) + zd5v -((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv + (LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv)); + STC2VFU(pix[c][0],v0v); + } + } +#endif + for (; col < width-2; col+=2) { int indx = row*width+col; pix[1] = (float*)(*rgb[1]) + indx; for (int i=0; i < 2; c=2-c, i++) { @@ -2516,13 +2630,34 @@ void RawImageSource::refinement(int PassCount) pix[c][0] = CLIP(v0); } } - + } /* Reinforce integrated red/blue pixels on BLUE/RED pixel locations */ #ifdef _OPENMP #pragma omp for #endif - for (int row=2; row < height-2; row++) - for (int col=2+(FC(row,2) & 1), c=2-FC(row,col); col < width-2; col+=2) { + for (int row=2; row < height-2; row++) { + int col = 2+(FC(row,2) & 1); + int c = 2-FC(row,col); +#ifdef __SSE2__ + __m128 dLv, dRv, dUv, dDv, v0v; + __m128 onev = _mm_set1_ps(1.f); + __m128 zd5v = _mm_set1_ps(0.5f); + __m128 c65535v = _mm_set1_ps(65535.f); + for (; col < width-8; col+=8) { + int indx = row*width+col; + pix[0] = (float*)(*rgb[0]) + indx; + pix[1] = (float*)(*rgb[1]) + indx; + pix[2] = (float*)(*rgb[2]) + indx; + int d = 2 - c; + dLv = onev/(onev+vabsf(LC2VFU(pix[d][ -2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1]))); + dRv = onev/(onev+vabsf(LC2VFU(pix[d][ 2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][ 1])-LC2VFU(pix[1][ -1]))); + dUv = onev/(onev+vabsf(LC2VFU(pix[d][-w2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1]))); + dDv = onev/(onev+vabsf(LC2VFU(pix[d][ w2])-LC2VFU(pix[d][0]))+vabsf(LC2VFU(pix[1][w1])-LC2VFU(pix[1][-w1]))); + v0v = CLIPV(LC2VFU(pix[1][0]) + zd5v -((LC2VFU(pix[1][-1])-LC2VFU(pix[c][-1]))*dLv +(LC2VFU(pix[1][1])-LC2VFU(pix[c][1]))*dRv +(LC2VFU(pix[1][-w1])-LC2VFU(pix[c][-w1]))*dUv + (LC2VFU(pix[1][w1])-LC2VFU(pix[c][w1]))*dDv ) / (dLv+dRv+dUv+dDv)); + STC2VFU(pix[c][0],v0v); + } +#endif + for (; col < width-2; col+=2) { int indx = row*width+col; pix[0] = (float*)(*rgb[0]) + indx; pix[1] = (float*)(*rgb[1]) + indx; @@ -2535,11 +2670,15 @@ void RawImageSource::refinement(int PassCount) float v0 = (pix[1][0] + 0.5f -((pix[1][ -1]-pix[c][ -1])*dL +(pix[1][ 1]-pix[c][ 1])*dR +(pix[1][-w1]-pix[c][-w1])*dU + (pix[1][ w1]-pix[c][ w1])*dD ) / (dL+dR+dU+dD)); pix[c][0] = CLIP(v0); } + } } // end parallel } t2e.set(); if (settings->verbose) printf("Refinement Lee %d usec\n", t2e.etime(t1e)); } +#ifdef __SSE2__ +#undef CLIPV +#endif // Refinement based on EECI demozaicing algorithm by L. Chang and Y.P. Tan diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index cbfcc6691..ed1090d03 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -9,11 +9,7 @@ #define INLINE inline #endif -#if defined( WIN32 ) && defined(__x86_64__) - #include -#else - #include -#endif +#include #include @@ -40,7 +36,33 @@ typedef __m128i vint2; #define LVFU(x) _mm_loadu_ps(&x) #define STVF(x,y) _mm_store_ps(&x,y) #endif + +// Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats #define LC2VFU(a) _mm_shuffle_ps( LVFU(a), _mm_loadu_ps( (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) ) + +// Store a vector of 4 floats in a[0],a[2],a[4] and a[6] +#if defined(__x86_64__) && defined(__SSE4_1__) +// SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself + #define STC2VFU(a,v) {\ + __m128 TST1V = _mm_loadu_ps(&a);\ + __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\ + _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\ + TST1V = _mm_loadu_ps((&a)+4);\ + TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\ + _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\ + } +#else + #define STC2VFU(a,v) {\ + __m128 TST1V = _mm_loadu_ps(&a);\ + __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\ + vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\ + _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\ + TST1V = _mm_loadu_ps((&a)+4);\ + TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\ + _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\ + } +#endif + #define ZEROV _mm_setzero_ps() static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index 219f9cf82..693842fe5 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -901,17 +901,25 @@ static INLINE vdouble xlog1p(vdouble a) { typedef struct { vfloat x, y; } vfloat2; -#if defined( __FMA__ ) && defined( __x86_64__ ) && defined(WIN32) // experimental -static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); } +#if defined( __FMA__ ) && defined( __x86_64__ ) + static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); } #else -static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); } + static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); } #endif static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); } static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); } -static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { - return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); -} +#if defined( __SSE4_1__ ) && defined( __x86_64__ ) + // only one instruction when using SSE4.1 + static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { + return _mm_blendv_ps(y,x,(vfloat)mask); + } +#else + // three instructions when using SSE2 + static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { + return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); + } +#endif static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) { vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));