From a967a8b5844b9480dac95f9377aa75fd4f2e9dc4 Mon Sep 17 00:00:00 2001 From: Ingo Date: Thu, 6 Jun 2013 18:48:08 +0200 Subject: [PATCH] Optimization for IGV-Demosaic, Issue 1893 --- rtengine/demosaic_algos.cc | 584 +++++++++++++++++++++++++++++++++---- 1 file changed, 523 insertions(+), 61 deletions(-) diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc index afb751128..ea5fa857f 100644 --- a/rtengine/demosaic_algos.cc +++ b/rtengine/demosaic_algos.cc @@ -35,6 +35,9 @@ #include "../rtgui/multilangmgr.h" #include "procparams.h" #include "sleef.c" +#ifdef __SSE2__ +#include "sleefsseavx.c" +#endif #ifdef _OPENMP @@ -1456,18 +1459,430 @@ void RawImageSource::lmmse_interpolate_omp(int winw, int winh, int iterations) * ***/ // Adapted to RT by Jacques Desmis 3/2013 +// Optimized by Ingo Weyrich 5/2013 +#ifdef __SSE2__ +#ifdef WIN32 +__attribute__((force_align_arg_pointer)) void RawImageSource::igv_interpolate(int winw, int winh) +#else +void RawImageSource::igv_interpolate(int winw, int winh) +#endif +{ + static const float eps=1e-5f, epssq=1e-5f;//mod epssq -10f =>-5f Jacques 3/2013 to prevent artifact (divide by zero) + + static const int h1=1, h2=2, h3=3, h4=4, h5=5, h6=6; + const int width=winw, height=winh; + const int v1=1*width, v2=2*width, v3=3*width, v4=4*width, v5=5*width, v6=6*width; + float* rgb[3]; + float* chr[2]; + float *rgbarray, *vdif, *hdif, *chrarray; + rgbarray = (float (*)) calloc(width*height*3, sizeof( float)); + rgb[0] = rgbarray; + rgb[1] = rgbarray + (width*height); + rgb[2] = rgbarray + 2*(width*height); + + chrarray = (float (*)) calloc(width*height*2, sizeof( float)); + chr[0] = chrarray; + chr[1] = chrarray + (width*height); + + vdif = (float (*)) calloc(width*height/2, sizeof *vdif); + hdif = (float (*)) calloc(width*height/2, sizeof *hdif); + + border_interpolate2(winw,winh,7); + + if (plistener) { + plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::methodstring[RAWParams::igv])); + plistener->setProgress (0.0); + } +#ifdef _OPENMP +#pragma omp parallel default(none) shared(rgb,vdif,hdif,chr) +#endif +{ + __m128 ngv, egv, wgv, sgv, nvv, evv, wvv, svv, nwgv, negv, swgv, segv, nwvv, nevv, swvv, sevv,tempv,temp1v,temp2v,tempoldv; + __m128 epsv = _mm_set1_ps( eps ); + __m128 epssqv = _mm_set1_ps( epssq ); + __m128 c65535v = _mm_set1_ps( 65535.f ); + __m128 c23v = _mm_set1_ps( 23.f ); + __m128 c40v = _mm_set1_ps( 40.f ); + __m128 c51v = _mm_set1_ps( 51.f ); + __m128 c32v = _mm_set1_ps( 32.f ); + __m128 c8v = _mm_set1_ps( 8.f ); + __m128 c7v = _mm_set1_ps( 7.f ); + __m128 c6v = _mm_set1_ps( 6.f ); + __m128 c10v = _mm_set1_ps( 10.f ); + __m128 c21v = _mm_set1_ps( 21.f ); + __m128 c78v = _mm_set1_ps( 78.f ); + __m128 c69v = _mm_set1_ps( 69.f ); + __m128 c3145680v = _mm_set1_ps( 3145680.f ); + __m128 onev = _mm_set1_ps ( 1.f ); + __m128 zerov = _mm_set1_ps ( 0.f ); + __m128 d725v = _mm_set1_ps ( 0.725f ); + __m128 d1375v = _mm_set1_ps ( 0.1375f ); + + float ng, eg, wg, sg, nv, ev, wv, sv, nwg, neg, swg, seg, nwv, nev, swv, sev; + int lastcol; +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=0; rowsetProgress (0.13); +} + +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=5; row>1], (sgv*nvv+ngv*svv)/(ngv+sgv)- tempv ); + _mm_storeu_ps( &hdif[indx>>1], (wgv*evv+egv*wvv)/(egv+wgv)- tempv ); + } + lastcol = col; + // borders without SSE + for (col=lastcol, indx=row*width+col, c=FC(row,col); col>1]=(sg*nv+ng*sv)/(ng+sg)-(rgb[c][indx])/65535.f; + hdif[indx>>1]=(wg*ev+eg*wv)/(eg+wg)-(rgb[c][indx])/65535.f; + } + } + +#ifdef _OPENMP +#pragma omp single +#endif +{ + if (plistener) plistener->setProgress (0.26); +} + +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=7; row>1, c=FC(row,col), d=c/2; col>1, c=FC(row,col), d=c/2; colsetProgress (0.39); +} + +// free(vdif); free(hdif); + +#pragma omp for + for (int row=7; rowsetProgress (0.52); +} + +#pragma omp for + for (int row=8; rowsetProgress (0.65); +} +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=7; rowsetProgress (0.78); +} +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=7; rowsetProgress (0.91); +} +#ifdef _OPENMP +#pragma omp for +#endif + for(int row=7; rowsetProgress (1.0); + + free(chrarray); free(rgbarray); + free(vdif); free(hdif); +} + +#else void RawImageSource::igv_interpolate(int winw, int winh) { static const float eps=1e-5f, epssq=1e-5f;//mod epssq -10f =>-5f Jacques 3/2013 to prevent artifact (divide by zero) static const int h1=1, h2=2, h3=3, h4=4, h5=5, h6=6; const int width=winw, height=winh; const int v1=1*width, v2=2*width, v3=3*width, v4=4*width, v5=5*width, v6=6*width; - float (*rgb)[3], *vdif, *hdif, (*chr)[2]; - rgb = (float (*)[3]) calloc(width*height, sizeof *rgb); - vdif = (float (*)) calloc(width*height, sizeof *vdif); - hdif = (float (*)) calloc(width*height, sizeof *hdif); - chr = (float (*)[2]) calloc(width*height, sizeof *chr); + float* rgb[3]; + float* chr[2]; + float (*rgbarray), *vdif, *hdif, (*chrarray); + + rgbarray = (float (*)) calloc(width*height*3, sizeof( float)); + rgb[0] = rgbarray; + rgb[1] = rgbarray + (width*height); + rgb[2] = rgbarray + 2*(width*height); + + chrarray = (float (*)) calloc(width*height*2, sizeof( float)); + chr[0] = chrarray; + chr[1] = chrarray + (width*height); + + vdif = (float (*)) calloc(width*height/2, sizeof *vdif); + hdif = (float (*)) calloc(width*height/2, sizeof *hdif); border_interpolate2(winw,winh,7); @@ -1488,7 +1903,7 @@ void RawImageSource::igv_interpolate(int winw, int winh) for (int row=0; rowsetProgress (0.1); + if (plistener) plistener->setProgress (0.13); } #ifdef _OPENMP @@ -1505,26 +1920,27 @@ void RawImageSource::igv_interpolate(int winw, int winh) for (int row=5; row>1]=(sg*nv+ng*sv)/(ng+sg)-(rgb[c][indx])/65535.f; + hdif[indx>>1]=(wg*ev+eg*wv)/(eg+wg)-(rgb[c][indx])/65535.f; } #ifdef _OPENMP #pragma omp single #endif { - if (plistener) plistener->setProgress (0.25); + if (plistener) plistener->setProgress (0.26); } #ifdef _OPENMP @@ -1534,62 +1950,113 @@ void RawImageSource::igv_interpolate(int winw, int winh) for (int col=7+(FC(row,1)&1), indx=row*width+col, c=FC(row,col), d=c/2; col>1])+69.0f*(SQR(vdif[(indx-v2)>>1])+SQR(vdif[(indx+v2)>>1]))+51.0f*(SQR(vdif[(indx-v4)>>1])+SQR(vdif[(indx+v4)>>1]))+21.0f*(SQR(vdif[(indx-v6)>>1])+SQR(vdif[(indx+v6)>>1]))-6.0f*SQR(vdif[(indx-v2)>>1]+vdif[indx>>1]+vdif[(indx+v2)>>1]) + -10.0f*(SQR(vdif[(indx-v4)>>1]+vdif[(indx-v2)>>1]+vdif[indx>>1])+SQR(vdif[indx>>1]+vdif[(indx+v2)>>1]+vdif[(indx+v4)>>1]))-7.0f*(SQR(vdif[(indx-v6)>>1]+vdif[(indx-v4)>>1]+vdif[(indx-v2)>>1])+SQR(vdif[(indx+v2)>>1]+vdif[(indx+v4)>>1]+vdif[(indx+v6)>>1])),0.f,1.f); + eg=LIM(epssq+78.0f*SQR(hdif[indx>>1])+69.0f*(SQR(hdif[(indx-h2)>>1])+SQR(hdif[(indx+h2)>>1]))+51.0f*(SQR(hdif[(indx-h4)>>1])+SQR(hdif[(indx+h4)>>1]))+21.0f*(SQR(hdif[(indx-h6)>>1])+SQR(hdif[(indx+h6)>>1]))-6.0f*SQR(hdif[(indx-h2)>>1]+hdif[indx>>1]+hdif[(indx+h2)>>1]) + -10.0f*(SQR(hdif[(indx-h4)>>1]+hdif[(indx-h2)>>1]+hdif[indx>>1])+SQR(hdif[indx>>1]+hdif[(indx+h2)>>1]+hdif[(indx+h4)>>1]))-7.0f*(SQR(hdif[(indx-h6)>>1]+hdif[(indx-h4)>>1]+hdif[(indx-h2)>>1])+SQR(hdif[(indx+h2)>>1]+hdif[(indx+h4)>>1]+hdif[(indx+h6)>>1])),0.f,1.f); //Limit chrominance using H/V neighbourhood - nv=ULIM(0.725f*vdif[indx]+0.1375f*vdif[indx-v2]+0.1375f*vdif[indx+v2],vdif[indx-v2],vdif[indx+v2]); - ev=ULIM(0.725f*hdif[indx]+0.1375f*hdif[indx-h2]+0.1375f*hdif[indx+h2],hdif[indx-h2],hdif[indx+h2]); + nv=ULIM(0.725f*vdif[indx>>1]+0.1375f*vdif[(indx-v2)>>1]+0.1375f*vdif[(indx+v2)>>1],vdif[(indx-v2)>>1],vdif[(indx+v2)>>1]); + ev=ULIM(0.725f*hdif[indx>>1]+0.1375f*hdif[(indx-h2)>>1]+0.1375f*hdif[(indx+h2)>>1],hdif[(indx-h2)>>1],hdif[(indx+h2)>>1]); //Chrominance estimation - chr[indx][d]=(eg*nv+ng*ev)/(ng+eg); + chr[d][indx]=(eg*nv+ng*ev)/(ng+eg); //Green channel population - rgb[indx][1]=rgb[indx][c]+65535.f*chr[indx][d]; + rgb[1][indx]=rgb[c][indx]+65535.f*chr[d][indx]; } #ifdef _OPENMP #pragma omp single #endif { - if (plistener) plistener->setProgress (0.45); + if (plistener) plistener->setProgress (0.39); } // free(vdif); free(hdif); - - // The following block has to be executed by only one thread, because of memory access "collision" +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=7; rowsetProgress (0.52); +} +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=8; rowsetProgress (0.6); - +#ifdef _OPENMP +#pragma omp single +#endif +{ + if (plistener) plistener->setProgress (0.65); +} +#ifdef _OPENMP +#pragma omp for +#endif for (int row=7; rowsetProgress (0.78); +} +#ifdef _OPENMP +#pragma omp for +#endif + for (int row=7; rowsetProgress (0.8); + //N,E,W,S Gradients + ng=1.0f/(eps+fabsf(chr[1][indx-v1]-chr[1][indx-v3])+fabsf(chr[1][indx+v1]-chr[1][indx-v3])); + eg=1.0f/(eps+fabsf(chr[1][indx+h1]-chr[1][indx+h3])+fabsf(chr[1][indx-h1]-chr[1][indx+h3])); + wg=1.0f/(eps+fabsf(chr[1][indx-h1]-chr[1][indx-h3])+fabsf(chr[1][indx+h1]-chr[1][indx-h3])); + sg=1.0f/(eps+fabsf(chr[1][indx+v1]-chr[1][indx+v3])+fabsf(chr[1][indx-v1]-chr[1][indx+v3])); + //Interpolate chrominance: R@G and B@G + chr[1][indx]=((ng*chr[1][indx-v1]+eg*chr[1][indx+h1]+wg*chr[1][indx-h1]+sg*chr[1][indx+v1])/(ng+eg+wg+sg)); + } +#ifdef _OPENMP +#pragma omp single +#endif +{ + if (plistener) plistener->setProgress (0.91); //Interpolate borders // border_interpolate2(7, rgb); @@ -1608,29 +2075,24 @@ void RawImageSource::igv_interpolate(int winw, int winh) blue [row][col] = rgb[indxc][2]; } */ -#ifdef _OPENMP -#pragma omp single -#endif -{ - if (plistener) plistener->setProgress (0.9); -} #ifdef _OPENMP #pragma omp for #endif for(int row=7; rowsetProgress (1.0); - free(chr); free(rgb); + free(chrarray); free(rgbarray); free(vdif); free(hdif); } +#endif /*