From 3874a927faf9a1eea0df316b15ce997bdb3d3f83 Mon Sep 17 00:00:00 2001 From: Ingo Date: Tue, 12 Nov 2013 00:07:47 +0100 Subject: [PATCH] Amaze optimization step 2, Issue 1835 --- rtengine/amaze_demosaic_RT.cc | 1059 ++++++++++++++++++++++----------- rtengine/helpersse2.h | 5 +- 2 files changed, 702 insertions(+), 362 deletions(-) diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc index 80fe733cb..90d2e6093 100644 --- a/rtengine/amaze_demosaic_RT.cc +++ b/rtengine/amaze_demosaic_RT.cc @@ -30,12 +30,11 @@ #include "../rtgui/multilangmgr.h" #include "procparams.h" #include "sleef.c" +#include "opthelper.h" -using namespace rtengine; +namespace rtengine { -void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { -// clock_t t1,t2; -// t1 = clock(); +SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { #define HCLIP(x) x //is this still necessary??? //min(clip_pt,x) @@ -44,10 +43,12 @@ void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { const float clip_pt = 1/initialGain; + const float clip_pt8 = 0.8f/initialGain; + + +#define TS 160 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading +#define TSH 80 // half of Tile size -#define TS 512 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading -#define TSH 256 -#define TS6 500 // local variables @@ -76,6 +77,7 @@ void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { static const float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f}; volatile double progress = 0.0; + // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // Issue 1676 @@ -84,32 +86,27 @@ void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::methodstring[RAWParams::amaze])); plistener->setProgress (0.0); } -struct s_mp { - float m; - float p; -}; -struct s_hv { - float h; - float v; -}; + struct s_hv { + float h; + float v; + }; + #pragma omp parallel { + int progresscounter=0; //position of top/left corner of the tile int top, left; // beginning of storage block for tile char *buffer; - // rgb values - float (*rgb)[3]; - // horizontal gradient - float (*delh); - // vertical gradient - float (*delv); - // square of delh - float (*delhsq); - // square of delv - float (*delvsq); + // green values + float (*rgbgreen); + + // sum of square of horizontal gradient and square of vertical gradient + float (*delhvsqsum); // gradient based directional weights for interpolation - float (*dirwts)[2]; + float (*dirwts0); + float (*dirwts1); + // vertically interpolated color differences G-R, G-B float (*vcd); // horizontally interpolated color differences @@ -123,18 +120,16 @@ struct s_hv { // weight to give horizontal vs vertical interpolation float (*hvwt); // final interpolated color difference - float (*Dgrb)[2]; + float (*Dgrb)[TS*TSH]; +// float (*Dgrb)[2]; // gradient in plus (NE/SW) direction float (*delp); // gradient in minus (NW/SE) direction float (*delm); // diagonal interpolation of R+B float (*rbint); + // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions) s_hv (*Dgrb2); - // horizontal curvature of interpolated G (used to refine interpolation in Nyquist texture regions) -// float (*Dgrbh2); - // vertical curvature of interpolated G -// float (*Dgrbv2); // difference between up/down interpolations of G float (*dgintv); // difference between left/right interpolations of G @@ -143,7 +138,9 @@ struct s_hv { // float (*Dgrbp1); // diagonal (minus) color difference R-B or G1-G2 // float (*Dgrbm1); - s_mp (*Dgrbsq1); + float (*Dgrbsq1m); + float (*Dgrbsq1p); +// s_mp (*Dgrbsq1); // square of diagonal color difference // float (*Dgrbpsq1); // square of diagonal color difference @@ -153,83 +150,47 @@ struct s_hv { // relative weight for combining plus and minus diagonal interpolations float (*pmwt); // interpolated color difference R-B in minus and plus direction - s_mp (*rb); - // interpolated color difference R-B in plus direction -// float (*rbp); - // interpolated color difference R-B in minus direction -// float (*rbm); + float (*rbm); + float (*rbp); // nyquist texture flag 1=nyquist, 0=not nyquist char (*nyquist); #define CLF 1 // assign working space - buffer = (char *) malloc(29*sizeof(float)*TS*TS - sizeof(float)*TS*TSH + sizeof(char)*TS*TSH+23*CLF*64); + buffer = (char *) malloc(22*sizeof(float)*TS*TS + sizeof(char)*TS*TSH+23*CLF*64 + 63); char *data; data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64); - //merror(buffer,"amaze_interpolate()"); - //memset(buffer,0,(34*sizeof(float)+sizeof(int))*TS*TS); - // rgb array - rgb = (float (*)[3]) data; //pointers to array - delh = (float (*)) (data + 3*sizeof(float)*TS*TS+1*CLF*64); - delv = (float (*)) (data + 4*sizeof(float)*TS*TS+2*CLF*64); - delhsq = (float (*)) (data + 5*sizeof(float)*TS*TS+3*CLF*64); - delvsq = (float (*)) (data + 6*sizeof(float)*TS*TS+4*CLF*64); - dirwts = (float (*)[2]) (data + 7*sizeof(float)*TS*TS+5*CLF*64); - vcd = (float (*)) (data + 9*sizeof(float)*TS*TS+6*CLF*64); - hcd = (float (*)) (data + 10*sizeof(float)*TS*TS+7*CLF*64); - vcdalt = (float (*)) (data + 11*sizeof(float)*TS*TS+8*CLF*64); - hcdalt = (float (*)) (data + 12*sizeof(float)*TS*TS+9*CLF*64); - cddiffsq = (float (*)) (data + 13*sizeof(float)*TS*TS+10*CLF*64); - hvwt = (float (*)) (data + 14*sizeof(float)*TS*TS+11*CLF*64); //compressed 0.5 MB - Dgrb = (float (*)[2]) (data + 15*sizeof(float)*TS*TS - sizeof(float)*TS*TSH+12*CLF*64); - delp = (float (*)) (data + 17*sizeof(float)*TS*TS - sizeof(float)*TS*TSH+13*CLF*64); // compressed 0.5 MB - delm = (float (*)) (data + 17*sizeof(float)*TS*TS+14*CLF*64); // compressed 0.5 MB - rbint = (float (*)) (data + 18*sizeof(float)*TS*TS - sizeof(float)*TS*TSH+15*CLF*64); // compressed 0.5 MB - Dgrb2 = (s_hv (*)) (data + 18*sizeof(float)*TS*TS+16*CLF*64); // compressed 1.0 MB -// Dgrbh2 = (float (*)) (data + 19*sizeof(float)*TS*TS); -// Dgrbv2 = (float (*)) (data + 20*sizeof(float)*TS*TS); - dgintv = (float (*)) (data + 19*sizeof(float)*TS*TS+17*CLF*64); - dginth = (float (*)) (data + 20*sizeof(float)*TS*TS+18*CLF*64); -// Dgrbp1 = (float (*)) (data + 23*sizeof(float)*TS*TS); 1.0 MB -// Dgrbm1 = (float (*)) (data + 23*sizeof(float)*TS*TS); 1.0 MB - Dgrbsq1 = (s_mp (*)) (data + 21*sizeof(float)*TS*TS+19*CLF*64); // compressed 1.0 MB -// Dgrbpsq1 = (float (*)) (data + 23*sizeof(float)*TS*TS); -// Dgrbmsq1 = (float (*)) (data + 24*sizeof(float)*TS*TS); - cfa = (float (*)) (data + 22*sizeof(float)*TS*TS+20*CLF*64); - pmwt = (float (*)) (data + 23*sizeof(float)*TS*TS+21*CLF*64); // compressed 0.5 MB - rb = (s_mp (*)) (data + 24*sizeof(float)*TS*TS - sizeof(float)*TS*TSH+22*CLF*64); // compressed 1.0 MB -// rbp = (float (*)) (data + 30*sizeof(float)*TS*TS); -// rbm = (float (*)) (data + 31*sizeof(float)*TS*TS); + rgbgreen = (float (*)) data; //pointers to array + delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float)*TS*TS + CLF*64); + dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float)*TS*TS + CLF*64); + dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float)*TS*TS + CLF*64); + vcd = (float (*)) ((char*)dirwts1 + sizeof(float)*TS*TS + CLF*64); + hcd = (float (*)) ((char*)vcd + sizeof(float)*TS*TS + CLF*64); + vcdalt = (float (*)) ((char*)hcd + sizeof(float)*TS*TS + CLF*64); + hcdalt = (float (*)) ((char*)vcdalt + sizeof(float)*TS*TS + CLF*64); + cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float)*TS*TS + CLF*64); + hvwt = (float (*)) ((char*)cddiffsq + sizeof(float)*TS*TS + CLF*64); + Dgrb = (float (*)[TS*TSH]) ((char*)hvwt + sizeof(float)*TS*TSH + CLF*64); + delp = (float (*)) ((char*)Dgrb + sizeof(float)*TS*TS + CLF*64); + delm = (float (*)) ((char*)delp + sizeof(float)*TS*TSH + CLF*64); + rbint = (float (*)) ((char*)delm + sizeof(float)*TS*TSH + CLF*64); + Dgrb2 = (s_hv (*)) ((char*)rbint + sizeof(float)*TS*TSH + CLF*64); + dgintv = (float (*)) ((char*)Dgrb2 + sizeof(float)*TS*TS + CLF*64); + dginth = (float (*)) ((char*)dgintv + sizeof(float)*TS*TS + CLF*64); + Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float)*TS*TS + CLF*64); + Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float)*TS*TSH + CLF*64); + cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float)*TS*TSH + CLF*64); + pmwt = (float (*)) ((char*)cfa + sizeof(float)*TS*TS + CLF*64); + rbm = (float (*)) ((char*)pmwt + sizeof(float)*TS*TSH + CLF*64); + rbp = (float (*)) ((char*)rbm + sizeof(float)*TS*TSH + CLF*64); - nyquist = (char (*)) (data + 25*sizeof(float)*TS*TS - sizeof(float)*TS*TSH+23*CLF*64); //compressed 0.875 MB + nyquist = (char (*)) ((char*)rbp + sizeof(float)*TS*TSH + CLF*64); #undef CLF // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - /*double dt; - clock_t t1, t2; - - clock_t t1_init, t2_init = 0; - clock_t t1_vcdhcd, t2_vcdhcd = 0; - clock_t t1_cdvar, t2_cdvar = 0; - clock_t t1_nyqtest, t2_nyqtest = 0; - clock_t t1_areainterp, t2_areainterp = 0; - clock_t t1_compare, t2_compare = 0; - clock_t t1_diag, t2_diag = 0; - clock_t t1_chroma, t2_chroma = 0;*/ - - - // start - //if (verbose) fprintf (stderr,_("AMaZE interpolation ...\n")); - //t1 = clock(); - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - //determine GRBG coset; (ey,ex) is the offset of the R subarray if (FC(0,0)==1) {//first pixel is G @@ -271,8 +232,6 @@ struct s_hv { int indx, indx1; //dummy indices int i, j; - // +1 or -1 -// int sgn; //color ratios in up/down/left/right directions float cru, crd, crl, crr; @@ -288,6 +247,7 @@ struct s_hv { float Ginthar, Ginthha, Gintvar, Gintvha; //color difference (G-R or G-B) variance in up/down/left/right directions float Dgrbvvaru, Dgrbvvard, Dgrbhvarl, Dgrbhvarr; + float uave, dave, lave, rave; //color difference variances in vertical and horizontal directions @@ -298,8 +258,6 @@ struct s_hv { float diffwt; // 640 - 644 //alternative adaptive weight for combining horizontal/vertical interpolations float hvwtalt; // 745 - 748 - //temporary variables for combining interpolation weights at R and B sites -// float vo, ve; //interpolation of G in four directions float gu, gd, gl, gr; //variance of G in vertical/horizontal directions @@ -321,11 +279,8 @@ struct s_hv { //variance of R-B in plus/minus directions float rbvarm; // 843 - 848 - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // rgb from input CFA data // rgb values should be floating point number between 0 and 1 // after white balance multipliers are applied @@ -337,51 +292,133 @@ struct s_hv { if (bottom>(winy+height)) {rrmax=winy+height-top;} else {rrmax=rr1;} if (right>(winx+width)) {ccmax=winx+width-left;} else {ccmax=cc1;} - for (rr=rrmin; rr < rrmax; rr++) - for (row=rr+top, cc=ccmin; cc < ccmax; cc++) { - col = cc+left; - c = FC(rr,cc); +#ifdef __SSE2__ + const __m128 c65535v = _mm_set1_ps( 65535.0f ); + __m128 tempv; + for (rr=rrmin; rr < rrmax; rr++){ + for (row=rr+top, cc=ccmin; cc < ccmax; cc+=4) { indx1=rr*TS+cc; - rgb[indx1][c] = (rawData[row][col])/65535.0f; - //indx=row*width+col; - //rgb[indx1][c] = image[indx][c]/65535.0f;//for dcraw implementation - - cfa[indx1] = rgb[indx1][c]; + tempv = LVFU(rawData[row][cc+left]) / c65535v; + _mm_store_ps( &cfa[indx1], tempv ); + _mm_store_ps( &rgbgreen[indx1], tempv ); } + } // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //fill borders if (rrmin>0) { for (rr=0; rr<16; rr++) - for (cc=ccmin; cc0) { + for (rr=rrmin; rr0 && ccmin>0) { + for (rr=0; rr<16; rr++) + for (cc=0; cc<16; cc+=4) { + indx1 = (rr)*TS+cc; + tempv = LVFU(rawData[winy+32-rr][winx+32-cc]) / c65535v; + _mm_store_ps( &cfa[indx1], tempv ); + _mm_store_ps( &rgbgreen[indx1], tempv ); + } + } + if (rrmax0 && ccmax0) { + for (rr=0; rr<16; rr++) + for (cc=0; cc<16; cc++) { + cfa[(rrmax+rr)*TS+cc] = (rawData[(winy+height-rr-2)][(winx+32-cc)])/65535.0f; + if(FC(rr,cc)==1) + rgbgreen[(rrmax+rr)*TS+cc] = cfa[(rrmax+rr)*TS+cc]; + } + } + +#else + for (rr=rrmin; rr < rrmax; rr++) + for (row=rr+top, cc=ccmin; cc < ccmax; cc++) { + indx1=rr*TS+cc; + cfa[indx1] = (rawData[row][cc+left])/65535.0f; + if(FC(rr,cc)==1) + rgbgreen[indx1] = cfa[indx1]; + + } + + // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + //fill borders + if (rrmin>0) { + for (rr=0; rr<16; rr++) + for (cc=ccmin,row = 32-rr+top; cc0) { for (rr=rrmin; rr0 && ccmin>0) { for (rr=0; rr<16; rr++) for (cc=0; cc<16; cc++) { - c=FC(rr,cc); - rgb[(rr)*TS+cc][c] = (rawData[winy+32-rr][winx+32-cc])/65535.0f; - //rgb[(rr)*TS+cc][c] = (rgb[(32-rr)*TS+(32-cc)][c]);//for dcraw implementation - cfa[(rr)*TS+cc] = rgb[(rr)*TS+cc][c]; + cfa[(rr)*TS+cc] = (rawData[winy+32-rr][winx+32-cc])/65535.0f; + if(FC(rr,cc)==1) + rgbgreen[(rr)*TS+cc] = cfa[(rr)*TS+cc]; } } if (rrmax0 && ccmax0) { for (rr=0; rr<16; rr++) for (cc=0; cc<16; cc++) { - c=FC(rr,cc); - rgb[(rrmax+rr)*TS+cc][c] = (rawData[(winy+height-rr-2)][(winx+32-cc)])/65535.0f; - //rgb[(rrmax+rr)*TS+cc][c] = (image[(height-rr-2)*width+(32-cc)][c])/65535.0f;//for dcraw implementation - cfa[(rrmax+rr)*TS+cc] = rgb[(rrmax+rr)*TS+cc][c]; + cfa[(rrmax+rr)*TS+cc] = (rawData[(winy+height-rr-2)][(winx+32-cc)])/65535.0f; + if(FC(rr,cc)==1) + rgbgreen[(rrmax+rr)*TS+cc] = cfa[(rrmax+rr)*TS+cc]; } } +#endif //end of border fill // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#ifdef __SSE2__ + __m128 delhv,delvv; + const __m128 epsv = _mm_set1_ps( eps ); - for (rr=1; rr < rr1-1; rr++) - for (cc=1, indx=(rr)*TS+cc; cc < cc1-1; cc++, indx++) { - - delh[indx] = fabsf(cfa[indx+1]-cfa[indx-1]); - delv[indx] = fabsf(cfa[indx+v1]-cfa[indx-v1]); - delhsq[indx] = SQR(delh[indx]); - delvsq[indx] = SQR(delv[indx]); -// delp[indx] = fabsf(cfa[indx+p1]-cfa[indx-p1]); -// delm[indx] = fabsf(cfa[indx+m1]-cfa[indx-m1]); + for (rr=2; rr < rr1-2; rr++) { + for (cc=0, indx=(rr)*TS+cc; cc < cc1; cc+=4, indx+=4) { + delhv = vabsf( LVFU( cfa[indx+1] ) - LVFU( cfa[indx-1] ) ); + delvv = vabsf( LVF( cfa[indx+v1] ) - LVF( cfa[indx-v1] ) ); + _mm_store_ps( &dirwts1[indx], epsv + vabsf( LVFU( cfa[indx+2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx-2] )) + delhv ); + delhv = delhv * delhv; + _mm_store_ps( &dirwts0[indx], epsv + vabsf( LVF( cfa[indx+v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx-v2] )) + delvv ); + delvv = delvv * delvv; + _mm_store_ps( &delhvsqsum[indx], delhv + delvv); } - + } +#else + // horizontal and vedrtical gradient + float delh,delv; for (rr=2; rr < rr1-2; rr++) - for (cc=2,indx=(rr)*TS+cc; cc < cc1-2; cc++, indx++) { - dirwts[indx][0] = eps+delv[indx+v1]+delv[indx-v1]+delv[indx];//+fabsf(cfa[indx+v2]-cfa[indx-v2]); - //vert directional averaging weights - dirwts[indx][1] = eps+delh[indx+1]+delh[indx-1]+delh[indx];//+fabsf(cfa[indx+2]-cfa[indx-2]); - //horizontal weights - + for (cc=2, indx=(rr)*TS+cc; cc < cc1-2; cc++, indx++) { + delh = fabsf(cfa[indx+1]-cfa[indx-1]); + delv = fabsf(cfa[indx+v1]-cfa[indx-v1]); + dirwts0[indx] = eps+fabsf(cfa[indx+v2]-cfa[indx])+fabsf(cfa[indx]-cfa[indx-v2])+delv; + dirwts1[indx] = eps+fabsf(cfa[indx+2]-cfa[indx])+fabsf(cfa[indx]-cfa[indx-2])+delh;//+fabsf(cfa[indx+2]-cfa[indx-2]); + delhvsqsum[indx] = SQR(delh) + SQR(delv); } +#endif - for (rr=6; rr < rr1-6; rr++) - for (cc=6+(FC(rr,2)&1), indx=(rr)*TS+cc; cc < cc1-6; cc+=2, indx+=2) { - delp[indx>>1] = fabsf(cfa[indx+p1]-cfa[indx-p1]); - delm[indx>>1] = fabsf(cfa[indx+m1]-cfa[indx-m1]); +#ifdef __SSE2__ + __m128 Dgrbsq1pv, Dgrbsq1mv,temp2v; + for (rr=6; rr < rr1-6; rr++){ + if((FC(rr,2)&1)==0) { + for (cc=6, indx=(rr)*TS+cc; cc < cc1-6; cc+=8, indx+=8) { + tempv = LC2VFU(cfa[indx+1]); + Dgrbsq1pv = (SQRV(tempv-LC2VFU(cfa[indx+1-p1]))+SQRV(tempv-LC2VFU(cfa[indx+1+p1]))); + _mm_storeu_ps( &delp[indx>>1], vabsf(LC2VFU(cfa[indx+p1])-LC2VFU(cfa[indx-p1]))); + _mm_storeu_ps( &delm[indx>>1], vabsf(LC2VFU(cfa[indx+m1])-LC2VFU(cfa[indx-m1]))); + Dgrbsq1mv = (SQRV(tempv-LC2VFU(cfa[indx+1-m1]))+SQRV(tempv-LC2VFU(cfa[indx+1+m1]))); + _mm_storeu_ps( &Dgrbsq1m[indx>>1], Dgrbsq1mv ); + _mm_storeu_ps( &Dgrbsq1p[indx>>1], Dgrbsq1pv ); + } } - - for (rr=6; rr < rr1-6; rr++) - for (cc=6+(FC(rr,1)&1),indx=(rr)*TS+cc; cc < cc1-6; cc+=2, indx+=2) { - Dgrbsq1[indx>>1].p=(SQR(cfa[indx]-cfa[indx-p1])+SQR(cfa[indx]-cfa[indx+p1])); - Dgrbsq1[indx>>1].m=(SQR(cfa[indx]-cfa[indx-m1])+SQR(cfa[indx]-cfa[indx+m1])); + else { + for (cc=6, indx=(rr)*TS+cc; cc < cc1-6; cc+=8, indx+=8) { + tempv = LC2VFU(cfa[indx]); + Dgrbsq1pv = (SQRV(tempv-LC2VFU(cfa[indx-p1]))+SQRV(tempv-LC2VFU(cfa[indx+p1]))); + _mm_storeu_ps( &delp[indx>>1], vabsf(LC2VFU(cfa[indx+1+p1])-LC2VFU(cfa[indx+1-p1]))); + _mm_storeu_ps( &delm[indx>>1], vabsf(LC2VFU(cfa[indx+1+m1])-LC2VFU(cfa[indx+1-m1]))); + Dgrbsq1mv = (SQRV(tempv-LC2VFU(cfa[indx-m1]))+SQRV(tempv-LC2VFU(cfa[indx+m1]))); + _mm_storeu_ps( &Dgrbsq1m[indx>>1], Dgrbsq1mv ); + _mm_storeu_ps( &Dgrbsq1p[indx>>1], Dgrbsq1pv ); + } } + } +#else + for (rr=6; rr < rr1-6; rr++){ + if((FC(rr,2)&1)==0) { + for (cc=6, indx=(rr)*TS+cc; cc < cc1-6; cc+=2, indx+=2) { + delp[indx>>1] = fabsf(cfa[indx+p1]-cfa[indx-p1]); + delm[indx>>1] = fabsf(cfa[indx+m1]-cfa[indx-m1]); + Dgrbsq1p[indx>>1]=(SQR(cfa[indx+1]-cfa[indx+1-p1])+SQR(cfa[indx+1]-cfa[indx+1+p1])); + Dgrbsq1m[indx>>1]=(SQR(cfa[indx+1]-cfa[indx+1-m1])+SQR(cfa[indx+1]-cfa[indx+1+m1])); + } + } + else { + for (cc=6, indx=(rr)*TS+cc; cc < cc1-6; cc+=2, indx+=2) { + Dgrbsq1p[indx>>1]=(SQR(cfa[indx]-cfa[indx-p1])+SQR(cfa[indx]-cfa[indx+p1])); + Dgrbsq1m[indx>>1]=(SQR(cfa[indx]-cfa[indx-m1])+SQR(cfa[indx]-cfa[indx+m1])); + delp[indx>>1] = fabsf(cfa[indx+1+p1]-cfa[indx+1-p1]); + delm[indx>>1] = fabsf(cfa[indx+1+m1]-cfa[indx+1-m1]); + } + } + } +#endif - - - //t2_init += clock()-t1_init; // end of tile initialization // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //interpolate vertical and horizontal color differences - //t1_vcdhcd = clock(); - for (rr=4; rr 0.8*clip_pt || Gintvha > 0.8*clip_pt || Ginthha > 0.8*clip_pt) { + fcswitch = !fcswitch; + + if (cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8) { //use HA if highlights are (nearly) clipped guar=guha; gdar=gdha; glar=glha; grar=grha; vcd[indx]=vcdalt[indx]; hcd[indx]=hcdalt[indx]; @@ -537,14 +649,71 @@ struct s_hv { dginth[indx]=min(SQR(glha-grha),SQR(glar-grar)); } - //t2_vcdhcd += clock() - t1_vcdhcd; - //t1_cdvar = clock(); - for (rr=4; rr pre_mul[c]) vcd[indx]=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1])-cfa[indx]; cddiffsq[indx] = SQR(vcd[indx]-hcd[indx]); } - -// cddiffsq[indx] = SQR(vcd[indx]-hcd[indx]); + c = !c; } + } +#endif - for (rr=6; rr>1], vself( decmask, varwtv, diffwtv)); + } + } +#else + for (rr=6; rr0) {nyquist[indx>>1]=1;}//nyquist=1 for nyquist region + if (nyqtest>0) + nyquist[indx>>1]=1;//nyquist=1 for nyquist region } + unsigned int nyquisttemp; - for (rr=8; rr>1]+nyquist[(indx-m1)>>1]+nyquist[(indx+p1)>>1]+ @@ -699,17 +916,12 @@ struct s_hv { //or not if (nyquisttemp<4) nyquist[indx>>1]=0; } - - //t2_nyqtest += clock() - t1_nyqtest; + } // end of Nyquist test - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + // in areas of Nyquist texture, do area interpolation - //t1_areainterp = clock(); for (rr=8; rr>1]) + vcd[indx]*hvwt[indx>>1]);//evaluate color differences + Dgrb[0][indx>>1] = (hcd[indx]*(1.0f-hvwt[indx>>1]) + vcd[indx]*hvwt[indx>>1]);//evaluate color differences //if (hvwt[indx]<0.5) Dgrb[indx][0]=hcd[indx]; //if (hvwt[indx]>0.5) Dgrb[indx][0]=vcd[indx]; - rgb[indx][1] = cfa[indx] + Dgrb[indx][0];//evaluate G (finally!) + rgbgreen[indx] = cfa[indx] + Dgrb[0][indx>>1];//evaluate G (finally!) //local curvature in G (preparation for nyquist refinement step) if (nyquist[indx>>1]) { - Dgrb2[indx>>1].h = SQR(rgb[indx][1] - xdiv2f(rgb[indx-1][1]+rgb[indx+1][1])); - Dgrb2[indx>>1].v = SQR(rgb[indx][1] - xdiv2f(rgb[indx-v1][1]+rgb[indx+v1][1])); + Dgrb2[indx>>1].h = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx-1]+rgbgreen[indx+1])); + Dgrb2[indx>>1].v = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx-v1]+rgbgreen[indx+v1])); } else { Dgrb2[indx>>1].h = Dgrb2[indx>>1].v = 0; } @@ -794,30 +1005,91 @@ struct s_hv { gquinc[2]*(Dgrb2[(indx-v2)>>1].v+Dgrb2[(indx-2)>>1].v+Dgrb2[(indx+2)>>1].v+Dgrb2[(indx+v2)>>1].v)+ gquinc[3]*(Dgrb2[(indx-m2)>>1].v+Dgrb2[(indx+p2)>>1].v+Dgrb2[(indx-p2)>>1].v+Dgrb2[(indx+m2)>>1].v)); //use the results as weights for refined G interpolation - Dgrb[indx][0] = (hcd[indx]*gvarv + vcd[indx]*gvarh)/(gvarv+gvarh); - rgb[indx][1] = cfa[indx] + Dgrb[indx][0]; + Dgrb[0][indx>>1] = (hcd[indx]*gvarv + vcd[indx]*gvarh)/(gvarv+gvarh); + rgbgreen[indx] = cfa[indx] + Dgrb[0][indx>>1]; } } // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - //t1_diag = clock(); - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // diagonal interpolation correction - for (rr=8; rr>1; cc>1; cc>1])+LVFU(delm[(indx+m2)>>1]);//same as for wtu,wtd,wtl,wtr + wtnwv= temp1v+LVFU(delm[(indx-m1)>>1])+LVFU(delm[(indx-m2)>>1]); + + rbmv = (wtsev*rbnwv+wtnwv*rbsev)/(wtsev+wtnwv); + + temp1v = ULIMV(rbmv ,LC2VFU(cfa[indx-m1]),LC2VFU(cfa[indx+m1])); + wtv = twov * (cfav-rbmv)/(epsv+rbmv+cfav); + temp2v = wtv * rbmv + (onev-wtv)*temp1v; + + temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v); + temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv); + _mm_storeu_ps(&rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v ,LC2VFU(cfa[indx-m1]),LC2VFU(cfa[indx+m1])), temp2v )); + + + temp1v = LC2VFU(cfa[indx+p1]); + temp2v = LC2VFU(cfa[indx+p2]); + rbnev = (temp1v + temp1v) / (epsv + cfav + temp2v ); + rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v)); + + temp1v = LC2VFU(cfa[indx-p1]); + temp2v = LC2VFU(cfa[indx-p2]); + rbswv = (temp1v + temp1v) / (epsv + cfav + temp2v ); + rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v)); + + temp1v = epsv + LVFU(delp[indx1]); + wtnev= temp1v+LVFU(delp[(indx+p1)>>1])+LVFU(delp[(indx+p2)>>1]); + wtswv= temp1v+LVFU(delp[(indx-p1)>>1])+LVFU(delp[(indx-p2)>>1]); + + rbpv = (wtnev*rbswv+wtswv*rbnev)/(wtnev+wtswv); + + temp1v = ULIMV(rbpv ,LC2VFU(cfa[indx-p1]),LC2VFU(cfa[indx+p1])); + wtv = twov * (cfav-rbpv)/(epsv+rbpv+cfav); + temp2v = wtv * rbpv + (onev-wtv)*temp1v; + + temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v); + temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv); + _mm_storeu_ps(&rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v ,LC2VFU(cfa[indx-p1]),LC2VFU(cfa[indx+p1])), temp2v )); + + + + rbvarmv = epssqv + (gausseven0v*(LVFU(Dgrbsq1m[(indx-v1)>>1])+LVFU(Dgrbsq1m[(indx-1)>>1])+LVFU(Dgrbsq1m[(indx+1)>>1])+LVFU(Dgrbsq1m[(indx+v1)>>1])) + + gausseven1v*(LVFU(Dgrbsq1m[(indx-v2-1)>>1])+LVFU(Dgrbsq1m[(indx-v2+1)>>1])+LVFU(Dgrbsq1m[(indx-2-v1)>>1])+LVFU(Dgrbsq1m[(indx+2-v1)>>1])+ + LVFU(Dgrbsq1m[(indx-2+v1)>>1])+LVFU(Dgrbsq1m[(indx+2+v1)>>1])+LVFU(Dgrbsq1m[(indx+v2-1)>>1])+LVFU(Dgrbsq1m[(indx+v2+1)>>1]))); + _mm_storeu_ps(&pmwt[indx1] , rbvarmv/((epssqv + (gausseven0v*(LVFU(Dgrbsq1p[(indx-v1)>>1])+LVFU(Dgrbsq1p[(indx-1)>>1])+LVFU(Dgrbsq1p[(indx+1)>>1])+LVFU(Dgrbsq1p[(indx+v1)>>1])) + + gausseven1v*(LVFU(Dgrbsq1p[(indx-v2-1)>>1])+LVFU(Dgrbsq1p[(indx-v2+1)>>1])+LVFU(Dgrbsq1p[(indx-2-v1)>>1])+LVFU(Dgrbsq1p[(indx+2-v1)>>1])+ + LVFU(Dgrbsq1p[(indx-2+v1)>>1])+LVFU(Dgrbsq1p[(indx+2+v1)>>1])+LVFU(Dgrbsq1p[(indx+v2-1)>>1])+LVFU(Dgrbsq1p[(indx+v2+1)>>1]))))+rbvarmv)); + + } + +#else + for (cc=8+(FC(rr,2)&1),indx=rr*TS+cc,indx1=indx>>1; cc>1]+delm[(indx+m2)>>1];//same as for wtu,wtd,wtl,wtr wtnw= eps+delm[indx1]+delm[(indx-m1)>>1]+delm[(indx-m2)>>1]; @@ -841,41 +1121,41 @@ struct s_hv { wtsw= eps+delp[indx1]+delp[(indx-p1)>>1]+delp[(indx-p2)>>1]; - rb[indx1].m = (wtse*rbnw+wtnw*rbse)/(wtse+wtnw); - rb[indx1].p = (wtne*rbsw+wtsw*rbne)/(wtne+wtsw); + rbm[indx1] = (wtse*rbnw+wtnw*rbse)/(wtse+wtnw); + rbp[indx1] = (wtne*rbsw+wtsw*rbne)/(wtne+wtsw); /* rbvarp = epssq + (gausseven[0]*(Dgrbsq1[indx-v1].p+Dgrbsq1[indx-1].p+Dgrbsq1[indx+1].p+Dgrbsq1[indx+v1].p) + gausseven[1]*(Dgrbsq1[indx-v2-1].p+Dgrbsq1[indx-v2+1].p+Dgrbsq1[indx-2-v1].p+Dgrbsq1[indx+2-v1].p+ Dgrbsq1[indx-2+v1].p+Dgrbsq1[indx+2+v1].p+Dgrbsq1[indx+v2-1].p+Dgrbsq1[indx+v2+1].p)); */ - rbvarm = epssq + (gausseven[0]*(Dgrbsq1[(indx-v1)>>1].m+Dgrbsq1[(indx-1)>>1].m+Dgrbsq1[(indx+1)>>1].m+Dgrbsq1[(indx+v1)>>1].m) + - gausseven[1]*(Dgrbsq1[(indx-v2-1)>>1].m+Dgrbsq1[(indx-v2+1)>>1].m+Dgrbsq1[(indx-2-v1)>>1].m+Dgrbsq1[(indx+2-v1)>>1].m+ - Dgrbsq1[(indx-2+v1)>>1].m+Dgrbsq1[(indx+2+v1)>>1].m+Dgrbsq1[(indx+v2-1)>>1].m+Dgrbsq1[(indx+v2+1)>>1].m)); - pmwt[indx1] = rbvarm/((epssq + (gausseven[0]*(Dgrbsq1[(indx-v1)>>1].p+Dgrbsq1[(indx-1)>>1].p+Dgrbsq1[(indx+1)>>1].p+Dgrbsq1[(indx+v1)>>1].p) + - gausseven[1]*(Dgrbsq1[(indx-v2-1)>>1].p+Dgrbsq1[(indx-v2+1)>>1].p+Dgrbsq1[(indx-2-v1)>>1].p+Dgrbsq1[(indx+2-v1)>>1].p+ - Dgrbsq1[(indx-2+v1)>>1].p+Dgrbsq1[(indx+2+v1)>>1].p+Dgrbsq1[(indx+v2-1)>>1].p+Dgrbsq1[(indx+v2+1)>>1].p)))+rbvarm); + rbvarm = epssq + (gausseven[0]*(Dgrbsq1m[(indx-v1)>>1]+Dgrbsq1m[(indx-1)>>1]+Dgrbsq1m[(indx+1)>>1]+Dgrbsq1m[(indx+v1)>>1]) + + gausseven[1]*(Dgrbsq1m[(indx-v2-1)>>1]+Dgrbsq1m[(indx-v2+1)>>1]+Dgrbsq1m[(indx-2-v1)>>1]+Dgrbsq1m[(indx+2-v1)>>1]+ + Dgrbsq1m[(indx-2+v1)>>1]+Dgrbsq1m[(indx+2+v1)>>1]+Dgrbsq1m[(indx+v2-1)>>1]+Dgrbsq1m[(indx+v2+1)>>1])); + pmwt[indx1] = rbvarm/((epssq + (gausseven[0]*(Dgrbsq1p[(indx-v1)>>1]+Dgrbsq1p[(indx-1)>>1]+Dgrbsq1p[(indx+1)>>1]+Dgrbsq1p[(indx+v1)>>1]) + + gausseven[1]*(Dgrbsq1p[(indx-v2-1)>>1]+Dgrbsq1p[(indx-v2+1)>>1]+Dgrbsq1p[(indx-2-v1)>>1]+Dgrbsq1p[(indx+2-v1)>>1]+ + Dgrbsq1p[(indx-2+v1)>>1]+Dgrbsq1p[(indx+2+v1)>>1]+Dgrbsq1p[(indx+v2-1)>>1]+Dgrbsq1p[(indx+v2+1)>>1])))+rbvarm); // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //bound the interpolation in regions of high saturation - if (rb[indx1].p clip_pt) rb[indx1].p=ULIM(rb[indx1].p,cfa[indx-p1],cfa[indx+p1]);//for RT implementation - if (rb[indx1].m > clip_pt) rb[indx1].m=ULIM(rb[indx1].m,cfa[indx-m1],cfa[indx+m1]); + if (rbp[indx1] > clip_pt) rbp[indx1]=ULIM(rbp[indx1],cfa[indx-p1],cfa[indx+p1]);//for RT implementation + if (rbm[indx1] > clip_pt) rbm[indx1]=ULIM(rbm[indx1],cfa[indx-m1],cfa[indx+m1]); //c=2-FC(rr,cc);//for dcraw implementation //if (rbp[indx] > pre_mul[c]) rbp[indx]=ULIM(rbp[indx],cfa[indx-p1],cfa[indx+p1]); //if (rbm[indx] > pre_mul[c]) rbm[indx]=ULIM(rbm[indx],cfa[indx-m1],cfa[indx+m1]); @@ -883,26 +1163,41 @@ struct s_hv { //rbint[indx] = 0.5*(cfa[indx] + (rbp*rbvarm+rbm*rbvarp)/(rbvarp+rbvarm));//this is R+B, interpolated } +#endif + } - - +#ifdef __SSE2__ + __m128 pmwtaltv; + __m128 zd25v = _mm_set1_ps(0.25f); +#endif for (rr=10; rr>1; cc>1])+LVFU(pmwt[(indx+p1)>>1])+LVFU(pmwt[(indx-p1)>>1])+LVFU(pmwt[(indx+m1)>>1])); + tempv = LVFU(pmwt[indx1]); + tempv = vself(vmaskf_lt(vabsf(zd5v-tempv), vabsf(zd5v-pmwtaltv)), pmwtaltv, tempv); + _mm_storeu_ps( &pmwt[indx1], tempv); + _mm_storeu_ps( &rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + LVFU(rbm[indx1]) * (onev - tempv) + LVFU(rbp[indx1]) * tempv)); + } + +#else for (cc=10+(FC(rr,2)&1),indx=rr*TS+cc,indx1=indx>>1; cc>1]+pmwt[(indx+p1)>>1]+pmwt[(indx-p1)>>1]+pmwt[(indx+m1)>>1],2); -// vo=fabsf(0.5-pmwt[indx1]); -// ve=fabsf(0.5-pmwtalt); if (fabsf(0.5-pmwt[indx1])>1; cc>1])>1]) ) continue; + if (fabsf(0.5-pmwt[indx>>1])>1]) ) + continue; //now interpolate G vertically/horizontally using R+B values //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts @@ -929,8 +1224,8 @@ struct s_hv { //gr=rbint[indx]*crr; //interpolated G via adaptive weights of cardinal evaluations - Gintv = (dirwts[indx-v1][0]*gd+dirwts[indx+v1][0]*gu)/(dirwts[indx+v1][0]+dirwts[indx-v1][0]); - Ginth = (dirwts[indx-1][1]*gr+dirwts[indx+1][1]*gl)/(dirwts[indx-1][1]+dirwts[indx+1][1]); + Gintv = (dirwts0[indx-v1]*gd+dirwts0[indx+v1]*gu)/(dirwts0[indx+v1]+dirwts0[indx-v1]); + Ginth = (dirwts1[indx-1]*gr+dirwts1[indx+1]*gl)/(dirwts1[indx-1]+dirwts1[indx+1]); // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //bound the interpolation in regions of high saturation @@ -958,87 +1253,128 @@ struct s_hv { //if (Gintv > pre_mul[c]) Gintv=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1]); // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - rgb[indx][1] = Ginth*(1.0f-hvwt[indx1]) + Gintv*hvwt[indx1]; + rgbgreen[indx] = Ginth*(1.0f-hvwt[indx1]) + Gintv*hvwt[indx1]; //rgb[indx][1] = 0.5*(rgb[indx][1]+0.25*(rgb[indx-v1][1]+rgb[indx+v1][1]+rgb[indx-1][1]+rgb[indx+1][1])); - Dgrb[indx][0] = rgb[indx][1]-cfa[indx]; + Dgrb[0][indx>>1] = rgbgreen[indx]-cfa[indx]; //rgb[indx][2-FC(rr,cc)]=2*rbint[indx]-cfa[indx]; } //end of diagonal interpolation correction - //t2_diag += clock() - t1_diag; - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - //t1_chroma = clock(); //fancy chrominance interpolation //(ey,ex) is location of R site for (rr=13-ey; rr>1; cc>1])-LVFU(Dgrb[c][(indx+m1)>>1]))+vabsf(LVFU(Dgrb[c][(indx-m1)>>1])-LVFU(Dgrb[c][(indx-m3)>>1]))+vabsf(LVFU(Dgrb[c][(indx+m1)>>1])-LVFU(Dgrb[c][(indx-m3)>>1]))); + wtnev=onev/(epsv+vabsf(LVFU(Dgrb[c][(indx+p1)>>1])-LVFU(Dgrb[c][(indx-p1)>>1]))+vabsf(LVFU(Dgrb[c][(indx+p1)>>1])-LVFU(Dgrb[c][(indx+p3)>>1]))+vabsf(LVFU(Dgrb[c][(indx-p1)>>1])-LVFU(Dgrb[c][(indx+p3)>>1]))); + wtswv=onev/(epsv+vabsf(LVFU(Dgrb[c][(indx-p1)>>1])-LVFU(Dgrb[c][(indx+p1)>>1]))+vabsf(LVFU(Dgrb[c][(indx-p1)>>1])-LVFU(Dgrb[c][(indx+m3)>>1]))+vabsf(LVFU(Dgrb[c][(indx+p1)>>1])-LVFU(Dgrb[c][(indx-p3)>>1]))); + wtsev=onev/(epsv+vabsf(LVFU(Dgrb[c][(indx+m1)>>1])-LVFU(Dgrb[c][(indx-m1)>>1]))+vabsf(LVFU(Dgrb[c][(indx+m1)>>1])-LVFU(Dgrb[c][(indx-p3)>>1]))+vabsf(LVFU(Dgrb[c][(indx-m1)>>1])-LVFU(Dgrb[c][(indx+m3)>>1]))); //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse); - Dgrb[indx][c]=(wtnw*(1.325*Dgrb[indx-m1][c]-0.175*Dgrb[indx-m3][c]-0.075*Dgrb[indx-m1-2][c]-0.075*Dgrb[indx-m1-v2][c] )+ - wtne*(1.325*Dgrb[indx+p1][c]-0.175*Dgrb[indx+p3][c]-0.075*Dgrb[indx+p1+2][c]-0.075*Dgrb[indx+p1+v2][c] )+ - wtsw*(1.325*Dgrb[indx-p1][c]-0.175*Dgrb[indx-p3][c]-0.075*Dgrb[indx-p1-2][c]-0.075*Dgrb[indx-p1-v2][c] )+ - wtse*(1.325*Dgrb[indx+m1][c]-0.175*Dgrb[indx+m3][c]-0.075*Dgrb[indx+m1+2][c]-0.075*Dgrb[indx+m1+v2][c] ))/(wtnw+wtne+wtsw+wtse); + _mm_storeu_ps(&Dgrb[c][indx>>1], (wtnwv*(oned325v*LVFU(Dgrb[c][(indx-m1)>>1])-zd175v*LVFU(Dgrb[c][(indx-m3)>>1])-zd075v*LVFU(Dgrb[c][(indx-m1-2)>>1])-zd075v*LVFU(Dgrb[c][(indx-m1-v2)>>1]) )+ + wtnev*(oned325v*LVFU(Dgrb[c][(indx+p1)>>1])-zd175v*LVFU(Dgrb[c][(indx+p3)>>1])-zd075v*LVFU(Dgrb[c][(indx+p1+2)>>1])-zd075v*LVFU(Dgrb[c][(indx+p1+v2)>>1]) )+ + wtswv*(oned325v*LVFU(Dgrb[c][(indx-p1)>>1])-zd175v*LVFU(Dgrb[c][(indx-p3)>>1])-zd075v*LVFU(Dgrb[c][(indx-p1-2)>>1])-zd075v*LVFU(Dgrb[c][(indx-p1-v2)>>1]) )+ + wtsev*(oned325v*LVFU(Dgrb[c][(indx+m1)>>1])-zd175v*LVFU(Dgrb[c][(indx+m3)>>1])-zd075v*LVFU(Dgrb[c][(indx+m1+2)>>1])-zd075v*LVFU(Dgrb[c][(indx+m1+v2)>>1]) ))/(wtnwv+wtnev+wtswv+wtsev)); } - for (rr=12; rr>1])*Dgrb[indx-v1][c]+(1.0f-hvwt[(indx+1)>>1])*Dgrb[indx+1][c]+(1.0f-hvwt[(indx-1)>>1])*Dgrb[indx-1][c]+(hvwt[(indx+v1)>>1])*Dgrb[indx+v1][c])/ - ((hvwt[(indx-v1)>>1])+(1.0f-hvwt[(indx+1)>>1])+(1.0f-hvwt[(indx-1)>>1])+(hvwt[(indx+v1)>>1])); +#else + for (cc=14+(FC(rr,2)&1),indx=rr*TS+cc,c=1-FC(rr,cc)/2; cc>1]-Dgrb[c][(indx+m1)>>1])+fabsf(Dgrb[c][(indx-m1)>>1]-Dgrb[c][(indx-m3)>>1])+fabsf(Dgrb[c][(indx+m1)>>1]-Dgrb[c][(indx-m3)>>1])); + wtne=1.0f/(eps+fabsf(Dgrb[c][(indx+p1)>>1]-Dgrb[c][(indx-p1)>>1])+fabsf(Dgrb[c][(indx+p1)>>1]-Dgrb[c][(indx+p3)>>1])+fabsf(Dgrb[c][(indx-p1)>>1]-Dgrb[c][(indx+p3)>>1])); + wtsw=1.0f/(eps+fabsf(Dgrb[c][(indx-p1)>>1]-Dgrb[c][(indx+p1)>>1])+fabsf(Dgrb[c][(indx-p1)>>1]-Dgrb[c][(indx+m3)>>1])+fabsf(Dgrb[c][(indx+p1)>>1]-Dgrb[c][(indx-p3)>>1])); + wtse=1.0f/(eps+fabsf(Dgrb[c][(indx+m1)>>1]-Dgrb[c][(indx-m1)>>1])+fabsf(Dgrb[c][(indx+m1)>>1]-Dgrb[c][(indx-p3)>>1])+fabsf(Dgrb[c][(indx-m1)>>1]-Dgrb[c][(indx+m3)>>1])); + //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse); + + Dgrb[c][indx>>1]=(wtnw*(1.325f*Dgrb[c][(indx-m1)>>1]-0.175f*Dgrb[c][(indx-m3)>>1]-0.075f*Dgrb[c][(indx-m1-2)>>1]-0.075f*Dgrb[c][(indx-m1-v2)>>1] )+ + wtne*(1.325f*Dgrb[c][(indx+p1)>>1]-0.175f*Dgrb[c][(indx+p3)>>1]-0.075f*Dgrb[c][(indx+p1+2)>>1]-0.075f*Dgrb[c][(indx+p1+v2)>>1] )+ + wtsw*(1.325f*Dgrb[c][(indx-p1)>>1]-0.175f*Dgrb[c][(indx-p3)>>1]-0.075f*Dgrb[c][(indx-p1-2)>>1]-0.075f*Dgrb[c][(indx-p1-v2)>>1] )+ + wtse*(1.325f*Dgrb[c][(indx+m1)>>1]-0.175f*Dgrb[c][(indx+m3)>>1]-0.075f*Dgrb[c][(indx+m1+2)>>1]-0.075f*Dgrb[c][(indx+m1+v2)>>1] ))/(wtnw+wtne+wtsw+wtse); + } +#endif + float temp; + for (rr=16; rr>1])+(1.0f-hvwt[(indx+1)>>1])+(1.0f-hvwt[(indx-1)>>1])+(hvwt[(indx+v1)>>1])); + red[row][col]=65535.0f*(rgbgreen[indx]- ((hvwt[(indx-v1)>>1])*Dgrb[0][(indx-v1)>>1]+(1.0f-hvwt[(indx+1)>>1])*Dgrb[0][(indx+1)>>1]+(1.0f-hvwt[(indx-1)>>1])*Dgrb[0][(indx-1)>>1]+(hvwt[(indx+v1)>>1])*Dgrb[0][(indx+v1)>>1])* + temp); + blue[row][col]=65535.0f*(rgbgreen[indx]- ((hvwt[(indx-v1)>>1])*Dgrb[1][(indx-v1)>>1]+(1.0f-hvwt[(indx+1)>>1])*Dgrb[1][(indx+1)>>1]+(1.0f-hvwt[(indx-1)>>1])*Dgrb[1][(indx-1)>>1]+(hvwt[(indx+v1)>>1])*Dgrb[1][(indx+v1)>>1])* + temp); + + indx++; + col++; + red[row][col]=65535.0f*(rgbgreen[indx]-Dgrb[0][indx>>1]); + blue[row][col]=65535.0f*(rgbgreen[indx]-Dgrb[1][indx>>1]); } - for(rr=12; rr>1]); + blue[row][col]=65535.0f*(rgbgreen[indx]-Dgrb[1][indx>>1]); + indx++; + col++; + temp = 1.0f/((hvwt[(indx-v1)>>1])+(1.0f-hvwt[(indx+1)>>1])+(1.0f-hvwt[(indx-1)>>1])+(hvwt[(indx+v1)>>1])); + red[row][col]=65535.0f*(rgbgreen[indx]- ((hvwt[(indx-v1)>>1])*Dgrb[0][(indx-v1)>>1]+(1.0f-hvwt[(indx+1)>>1])*Dgrb[0][(indx+1)>>1]+(1.0f-hvwt[(indx-1)>>1])*Dgrb[0][(indx-1)>>1]+(hvwt[(indx+v1)>>1])*Dgrb[0][(indx+v1)>>1])* + temp); + blue[row][col]=65535.0f*(rgbgreen[indx]- ((hvwt[(indx-v1)>>1])*Dgrb[1][(indx-v1)>>1]+(1.0f-hvwt[(indx+1)>>1])*Dgrb[1][(indx+1)>>1]+(1.0f-hvwt[(indx-1)>>1])*Dgrb[1][(indx-1)>>1]+(hvwt[(indx+v1)>>1])*Dgrb[1][(indx+v1)>>1])* + temp); + + } + } - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // copy smoothed results back to image matrix - for (rr=16; rr < rr1-16; rr++) + for (rr=16; rr < rr1-16; rr++){ +#ifdef __SSE2__ + for (row=rr+top, cc=16; cc < cc1-19; cc+=4) { + _mm_storeu_ps(&green[row][cc + left], LVF(rgbgreen[rr*TS+cc]) * c65535v); + } +#else for (row=rr+top, cc=16; cc < cc1-16; cc++) { col = cc + left; - indx=rr*TS+cc; - - red[row][col] = ((65535.0f*rgb[indx][0] )); - green[row][col] = ((65535.0f*rgb[indx][1])); - blue[row][col] = ((65535.0f*rgb[indx][2])); + green[row][col] = ((65535.0f*rgbgreen[indx])); //for dcraw implementation //for (c=0; c<3; c++){ // image[indx][c] = CLIP((int)(65535.0f*rgb[rr*TS+cc][c] + 0.5f)); //} - - } +#endif + } //end of main loop - // clean up - //free(buffer); - progress+=(double)((TS-32)*(TS-32))/(height*width); - if (progress>1.0) - { - progress=1.0; + if(plistener) { + progresscounter++; + if(progresscounter % 4 == 0) { +#pragma omp critical +{ + progress+=(double)4*((TS-32)*(TS-32))/(height*width); + if (progress>1.0) + { + progress=1.0; + } + plistener->setProgress(progress); +} + } } - if(plistener) plistener->setProgress(progress); } // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1048,10 +1384,13 @@ struct s_hv { // clean up free(buffer); } + if(plistener) + plistener->setProgress(1.0); + + // done #undef TS -//t2 = clock() - t1; -//printf("Amaze took %d ms\n",t2); } +} \ No newline at end of file diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index f07c47631..6e4d352bb 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -25,10 +25,10 @@ typedef __m128 vfloat; typedef __m128i vint2; // - +#define LVF(x) _mm_load_ps(&x) #define LVFU(x) _mm_loadu_ps(&x) #define LC2VFU(a) _mm_shuffle_ps( LVFU(a), _mm_loadu_ps( (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) ) - +#define ZEROV _mm_setzero_ps() static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } @@ -106,6 +106,7 @@ static INLINE vmask vmaskf_le(vfloat x, vfloat y) { return (__m128i)_mm_cmple_ps static INLINE vmask vmaskf_gt(vfloat x, vfloat y) { return (__m128i)_mm_cmpgt_ps(x, y); } static INLINE vmask vmaskf_ge(vfloat x, vfloat y) { return (__m128i)_mm_cmpge_ps(x, y); } + static INLINE vmask vmaski_eq(vint x, vint y) { __m128 s = (__m128)_mm_cmpeq_epi32(x, y); return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0));