From 2017a0e59223878913f5c1e04ab97c29c4c02d7c Mon Sep 17 00:00:00 2001 From: heckflosse Date: Sun, 24 Jan 2016 01:44:35 +0100 Subject: [PATCH 1/3] Code review and speedup for Amaze Demosaic --- rtengine/amaze_demosaic_RT.cc | 1697 ++++++++++++++++----------------- rtengine/helpersse2.h | 9 +- rtengine/rt_math.h | 10 + rtengine/sleefsseavx.c | 36 +- 4 files changed, 884 insertions(+), 868 deletions(-) diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc index 46c77d7d7..3b367ee2b 100644 --- a/rtengine/amaze_demosaic_RT.cc +++ b/rtengine/amaze_demosaic_RT.cc @@ -4,6 +4,7 @@ // (Aliasing Minimization and Zipper Elimination) // // copyright (c) 2008-2010 Emil Martinec +// optimized for speed by Ingo Weyrich // // incorporating ideas of Luis Sanz Rodrigues and Paul Lee // @@ -28,9 +29,9 @@ #include "rawimagesource.h" #include "rt_math.h" #include "../rtgui/multilangmgr.h" -#include "procparams.h" #include "sleef.c" #include "opthelper.h" +#define BENCHMARK #include "StopWatch.h" namespace rtengine @@ -39,25 +40,44 @@ namespace rtengine SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh) { BENCHFUN -#define HCLIP(x) x //is this still necessary??? - //min(clip_pt,x) - int width = winw, height = winh; + volatile double progress = 0.0; + if (plistener) { + plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::amaze])); + plistener->setProgress (0.0); + } - const float clip_pt = 1 / initialGain; - const float clip_pt8 = 0.8f / initialGain; + const int width = winw, height = winh; + const float clip_pt = 1.0 / initialGain; + const float clip_pt8 = 0.8 / initialGain; #define TS 160 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading #define TSH 80 // half of Tile size - // local variables - - //offset of R pixel within a Bayer quartet int ex, ey; + //determine GRBG coset; (ey,ex) is the offset of the R subarray + if (FC(0, 0) == 1) { //first pixel is G + if (FC(0, 1) == 0) { + ey = 0; + ex = 1; + } else { + ey = 1; + ex = 0; + } + } else {//first pixel is R or B + if (FC(0, 0) == 0) { + ey = 0; + ex = 0; + } else { + ey = 1; + ex = 1; + } + } + //shifts of pointer value to access pixels in vertical and diagonal directions static const int v1 = TS, v2 = 2 * TS, v3 = 3 * TS, p1 = -TS + 1, p2 = -2 * TS + 2, p3 = -3 * TS + 3, m1 = TS + 1, m2 = 2 * TS + 2, m3 = 3 * TS + 3; @@ -66,514 +86,347 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, //adaptive ratios threshold static const float arthresh = 0.75; - //nyquist texture test threshold - static const float nyqthresh = 0.5; //gaussian on 5x5 quincunx, sigma=1.2 static const float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f}; - //gaussian on 5x5, sigma=1.2 - static const float gaussgrad[6] = {0.07384411893421103f, 0.06207511968171489f, 0.0521818194747806f, - 0.03687419286733595f, 0.03099732204057846f, 0.018413194161458882f + //nyquist texture test threshold + static const float nyqthresh = 0.5; + //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop + // Is this really sigma=1.2????, seems more like sigma = 1.672 + static const float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f, + nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f }; //gaussian on 5x5 alt quincunx, sigma=1.5 static const float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f}; //guassian on quincunx grid static const float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f}; - volatile double progress = 0.0; - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -// Issue 1676 -// Moved from inside the parallel section - if (plistener) { - plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::amaze])); - plistener->setProgress (0.0); - } - - struct s_hv { + typedef struct { float h; float v; - }; + } s_hv; +#ifdef _OPENMP #pragma omp parallel +#endif { int progresscounter = 0; - //position of top/left corner of the tile - int top, left; - // beginning of storage block for tile - char *buffer; - // green values - float (*rgbgreen); - - // sum of square of horizontal gradient and square of vertical gradient - float (*delhvsqsum); - // gradient based directional weights for interpolation - float (*dirwts0); - float (*dirwts1); - - // vertically interpolated color differences G-R, G-B - float (*vcd); - // horizontally interpolated color differences - float (*hcd); - // alternative vertical interpolation - float (*vcdalt); - // alternative horizontal interpolation - float (*hcdalt); - // square of average color difference - float (*cddiffsq); - // weight to give horizontal vs vertical interpolation - float (*hvwt); - // final interpolated color difference - float (*Dgrb)[TS * TSH]; -// float (*Dgrb)[2]; - // gradient in plus (NE/SW) direction - float (*delp); - // gradient in minus (NW/SE) direction - float (*delm); - // diagonal interpolation of R+B - float (*rbint); - // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions) - s_hv (*Dgrb2); - // difference between up/down interpolations of G - float (*dgintv); - // difference between left/right interpolations of G - float (*dginth); - // diagonal (plus) color difference R-B or G1-G2 -// float (*Dgrbp1); - // diagonal (minus) color difference R-B or G1-G2 -// float (*Dgrbm1); - float (*Dgrbsq1m); - float (*Dgrbsq1p); -// s_mp (*Dgrbsq1); - // square of diagonal color difference -// float (*Dgrbpsq1); - // square of diagonal color difference -// float (*Dgrbmsq1); - // tile raw data - float (*cfa); - // relative weight for combining plus and minus diagonal interpolations - float (*pmwt); - // interpolated color difference R-B in minus and plus direction - float (*rbm); - float (*rbp); - - // nyquist texture flag 1=nyquist, 0=not nyquist - char (*nyquist); #define CLF 1 // assign working space - buffer = (char *) calloc(22 * sizeof(float) * TS * TS + sizeof(char) * TS * TSH + 23 * CLF * 64 + 63, 1); - char *data; - data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64); + char *buffer = (char *) calloc(13 * sizeof(float) * TS * TS + sizeof(float) * TS * TSH + sizeof(char) * TS * TSH + 18 * CLF * 64 + 63, 1); + // aligned to 64 byte boundary + char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64); - //merror(buffer,"amaze_interpolate()"); - rgbgreen = (float (*)) data; //pointers to array - delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64); - dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64); - dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64); - vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64); - hcd = (float (*)) ((char*)vcd + sizeof(float) * TS * TS + CLF * 64); - vcdalt = (float (*)) ((char*)hcd + sizeof(float) * TS * TS + CLF * 64); - hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64); - cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64); - hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64); - Dgrb = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64); - delp = (float (*)) ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64); - delm = (float (*)) ((char*)delp + sizeof(float) * TS * TSH + CLF * 64); - rbint = (float (*)) ((char*)delm + sizeof(float) * TS * TSH + CLF * 64); - Dgrb2 = (s_hv (*)) ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64); - dgintv = (float (*)) ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64); - dginth = (float (*)) ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64); - Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * TS * TS + CLF * 64); - Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64); - cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64); - pmwt = (float (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64); - rbm = (float (*)) ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64); - rbp = (float (*)) ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64); + // green values + float *rgbgreen = (float (*)) data; + // sum of square of horizontal gradient and square of vertical gradient + float *delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64); + // gradient based directional weights for interpolation + float *dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64); + float *dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64); + // vertically interpolated color differences G-R, G-B + float *vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64); + // horizontally interpolated color differences + float *hcd = (float (*)) ((char*)vcd + sizeof(float) * TS * TS + CLF * 64); + // alternative vertical interpolation + float *vcdalt = (float (*)) ((char*)hcd + sizeof(float) * TS * TS + CLF * 64); + // alternative horizontal interpolation + float *hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64); + // square of average color difference + float *cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64); + // weight to give horizontal vs vertical interpolation + float *hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * TS * TS + 2 * CLF * 64); + // final interpolated color difference + float (*Dgrb)[TS * TSH] = (float (*)[TS * TSH])vcdalt; // there is no overlap in buffer usage => share + // gradient in plus (NE/SW) direction + float *delp = (float (*))cddiffsq; // there is no overlap in buffer usage => share + // gradient in minus (NW/SE) direction + float *delm = (float (*)) ((char*)delp + sizeof(float) * TS * TSH + CLF * 64); + // diagonal interpolation of R+B + float *rbint = (float (*))delm; // there is no overlap in buffer usage => share + // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions) + s_hv *Dgrb2 = (s_hv (*)) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64); + // difference between up/down interpolations of G + float *dgintv = (float (*))Dgrb2; // there is no overlap in buffer usage => share + // difference between left/right interpolations of G + float *dginth = (float (*)) ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64); + // square of diagonal colour differences + float *Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * TS * TS + CLF * 64); + float *Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64); + // tile raw data + float *cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64); + // relative weight for combining plus and minus diagonal interpolations + float *pmwt = (float (*))delhvsqsum; // there is no overlap in buffer usage => share + // interpolated color difference R-B in minus and plus direction + float *rbm = (float (*))vcd; // there is no overlap in buffer usage => share + float *rbp = (float (*)) ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64); + // nyquist texture flag 1=nyquist, 0=not nyquist + unsigned char *nyquist = (unsigned char (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64); + /* + rgbgreen = (float (*)) data; //pointers to array + delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64); + dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64); + dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64); + vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64); + hcd = (float (*)) ((char*)vcd + sizeof(float) * TS * TS + CLF * 64); + vcdalt = (float (*)) ((char*)hcd + sizeof(float) * TS * TS + CLF * 64); + hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64); + cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64); + hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64); + Dgrb = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64); + delp = (float (*)) ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64); + delm = (float (*)) ((char*)delp + sizeof(float) * TS * TSH + CLF * 64); + rbint = (float (*)) ((char*)delm + sizeof(float) * TS * TSH + CLF * 64); + Dgrb2 = (s_hv (*)) ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64); + dgintv = (float (*)) ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64); + dginth = (float (*)) ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64); + Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * TS * TS + CLF * 64); + Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64); + cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64); + pmwt = (float (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64); + rbm = (float (*)) ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64); + rbp = (float (*)) ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64); - nyquist = (char (*)) ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64); + nyquist = (char (*)) ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64); + */ #undef CLF - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - //determine GRBG coset; (ey,ex) is the offset of the R subarray - if (FC(0, 0) == 1) { //first pixel is G - if (FC(0, 1) == 0) { - ey = 0; - ex = 1; - } else { - ey = 1; - ex = 0; - } - } else {//first pixel is R or B - if (FC(0, 0) == 0) { - ey = 0; - ex = 0; - } else { - ey = 1; - ex = 1; - } - } // Main algorithm: Tile loop - //#pragma omp parallel for shared(rawData,height,width,red,green,blue) private(top,left) schedule(dynamic) - //code is openmp ready; just have to pull local tile variable declarations inside the tile loop -// Issue 1676 -// use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling + // Issue 1676 + // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling +#ifdef _OPENMP #pragma omp for schedule(dynamic) collapse(2) nowait +#endif - for (top = winy - 16; top < winy + height; top += TS - 32) - for (left = winx - 16; left < winx + width; left += TS - 32) { - memset(nyquist, 0, sizeof(char)*TS * TSH); - memset(rbint, 0, sizeof(float)*TS * TSH); + for (int top = winy - 16; top < winy + height; top += TS - 32) + for (int left = winx - 16; left < winx + width; left += TS - 32) { +#ifdef __SSE2__ + // Using SSE2 we can zero the memory without cache pollution + vfloat zerov = ZEROV; + + for(int i = 3 * TSH; i < (TS - 6)*TSH; i += 16) { + _mm_stream_ps((float*)&nyquist[i], zerov); + } + +#else + memset(&nyquist[3 * TSH], 0, sizeof(unsigned char) * (TS - 6) * TSH); +#endif //location of tile bottom edge - int bottom = min(top + TS, winy + height + 16); + const int bottom = min(top + TS, winy + height + 16); //location of tile right edge - int right = min(left + TS, winx + width + 16); + const int right = min(left + TS, winx + width + 16); //tile width (=TS except for right edge of image) - int rr1 = bottom - top; + const int rr1 = bottom - top; //tile height (=TS except for bottom edge of image) - int cc1 = right - left; - - //tile vars - //counters for pixel location in the image - int row, col; - //min and max row/column in the tile - int rrmin, rrmax, ccmin, ccmax; - //counters for pixel location within the tile - int rr, cc; - //color index 0=R, 1=G, 2=B - int c; - //pointer counters within the tile - int indx, indx1; - //dummy indices - int i, j; - - //color ratios in up/down/left/right directions - float cru, crd, crl, crr; - //adaptive weights for vertical/horizontal/plus/minus directions - float vwt, hwt, pwt, mwt; - //vertical and horizontal G interpolations - float Gintv, Ginth; - //G interpolated in vert/hor directions using adaptive ratios - float guar, gdar, glar, grar; - //G interpolated in vert/hor directions using Hamilton-Adams method - float guha, gdha, glha, grha; - //interpolated G from fusing left/right or up/down - float Ginthar, Ginthha, Gintvar, Gintvha; - //color difference (G-R or G-B) variance in up/down/left/right directions - float Dgrbvvaru, Dgrbvvard, Dgrbhvarl, Dgrbhvarr; - - float uave, dave, lave, rave; - - //color difference variances in vertical and horizontal directions - float vcdvar, hcdvar, vcdvar1, hcdvar1, hcdaltvar, vcdaltvar; - //adaptive interpolation weight using variance of color differences - float varwt; // 639 - 644 - //adaptive interpolation weight using difference of left-right and up-down G interpolations - float diffwt; // 640 - 644 - //alternative adaptive weight for combining horizontal/vertical interpolations - float hvwtalt; // 745 - 748 - //interpolation of G in four directions - float gu, gd, gl, gr; - //variance of G in vertical/horizontal directions - float gvarh, gvarv; - - //Nyquist texture test - float nyqtest; // 658 - 681 - //accumulators for Nyquist texture interpolation - float sumh, sumv, sumsqh, sumsqv, areawt; - - //color ratios in diagonal directions - float crse, crnw, crne, crsw; - //color differences in diagonal directions - float rbse, rbnw, rbne, rbsw; - //adaptive weights for combining diagonal interpolations - float wtse, wtnw, wtsw, wtne; - //alternate weight for combining diagonal interpolations - float pmwtalt; // 885 - 888 - //variance of R-B in plus/minus directions - float rbvarm; // 843 - 848 - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + const int cc1 = right - left; + // bookkeeping for borders + // min and max row/column in the tile + int rrmin = top < winy ? 16 : 0; + int ccmin = left < winx ? 16 : 0; + int rrmax = bottom > (winy + height) ? winy + height - top : rr1; + int ccmax = right > (winx + width) ? winx + width - left : cc1; // rgb from input CFA data // rgb values should be floating point number between 0 and 1 // after white balance multipliers are applied // a 16 pixel border is added to each side of the image - - // bookkeeping for borders - if (top < winy) { - rrmin = 16; - } else { - rrmin = 0; - } - - if (left < winx) { - ccmin = 16; - } else { - ccmin = 0; - } - - if (bottom > (winy + height)) { - rrmax = winy + height - top; - } else { - rrmax = rr1; - } - - if (right > (winx + width)) { - ccmax = winx + width - left; - } else { - ccmax = cc1; - } - #ifdef __SSE2__ - const __m128 c65535v = _mm_set1_ps( 65535.0f ); - __m128 tempv; + const vfloat c65535v = F2V( 65535.0f ); - for (rr = rrmin; rr < rrmax; rr++) { - for (row = rr + top, cc = ccmin; cc < ccmax - 3; cc += 4) { - indx1 = rr * TS + cc; - tempv = LVFU(rawData[row][cc + left]) / c65535v; - _mm_store_ps( &cfa[indx1], tempv ); - _mm_store_ps( &rgbgreen[indx1], tempv ); + //fill upper border + if (rrmin > 0) { + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { + cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f; + rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; + } + } + + // fill inner part + for (int rr = rrmin; rr < rrmax; rr++) { + int row = rr + top; + int cc = ccmin; + + for (; cc < ccmax - 3; cc += 4) { + int indx1 = rr * TS + cc; + vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v; + STVF(cfa[indx1], tempv ); + STVF(rgbgreen[indx1], tempv ); } for (; cc < ccmax; cc++) { - indx1 = rr * TS + cc; + int indx1 = rr * TS + cc; cfa[indx1] = (rawData[row][cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[indx1] = cfa[indx1]; - } - + rgbgreen[indx1] = cfa[indx1]; } - - } - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - //fill borders - if (rrmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { - cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } - } } + //fill lower border if (rrmax < rr1) { - for (rr = 0; rr < 16; rr++) - for (cc = ccmin; cc < ccmax; cc += 4) { - indx1 = (rrmax + rr) * TS + cc; - tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v; - _mm_store_ps( &cfa[indx1], tempv ); - _mm_store_ps( &rgbgreen[indx1], tempv ); + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin; cc < ccmax; cc += 4) { + int indx1 = (rrmax + rr) * TS + cc; + vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v; + STVF(cfa[indx1], tempv ); + STVF(rgbgreen[indx1], tempv ); } } + //fill left border if (ccmin > 0) { - for (rr = rrmin; rr < rrmax; rr++) - for (cc = 0, row = rr + top; cc < 16; cc++) { + for (int rr = rrmin; rr < rrmax; rr++) + for (int cc = 0, row = rr + top; cc < 16; cc++) { cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } + rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; } } + //fill right border if (ccmax < cc1) { - for (rr = rrmin; rr < rrmax; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = rrmin; rr < rrmax; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; - } + rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; } } //also, fill the image corners if (rrmin > 0 && ccmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc += 4) { - indx1 = (rr) * TS + cc; - tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v; - _mm_store_ps( &cfa[indx1], tempv ); - _mm_store_ps( &rgbgreen[indx1], tempv ); + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc += 4) { + int indx1 = (rr) * TS + cc; + vfloat tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v; + STVF(cfa[indx1], tempv ); + STVF(rgbgreen[indx1], tempv ); } } if (rrmax < rr1 && ccmax < cc1) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc += 4) { - indx1 = (rrmax + rr) * TS + ccmax + cc; - tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v; - _mm_storeu_ps( &cfa[indx1], tempv ); - _mm_storeu_ps( &rgbgreen[indx1], tempv ); + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc += 4) { + int indx1 = (rrmax + rr) * TS + ccmax + cc; + vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v; + STVFU(cfa[indx1], tempv ); + STVFU(rgbgreen[indx1], tempv ); } } if (rrmin > 0 && ccmax < cc1) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { - + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; - } + rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; } } if (rrmax < rr1 && ccmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; - } + rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; } } #else - for (rr = rrmin; rr < rrmax; rr++) - for (row = rr + top, cc = ccmin; cc < ccmax; cc++) { - indx1 = rr * TS + cc; + for (int rr = rrmin; rr < rrmax; rr++) + for (int row = rr + top, cc = ccmin; cc < ccmax; cc++) { + int indx1 = rr * TS + cc; cfa[indx1] = (rawData[row][cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[indx1] = cfa[indx1]; - } - + rgbgreen[indx1] = cfa[indx1]; } - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //fill borders if (rrmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } + rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; } } if (rrmax < rr1) { - for (rr = 0; rr < 16; rr++) - for (cc = ccmin; cc < ccmax; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin; cc < ccmax; cc++) { cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; - } + rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; } } if (ccmin > 0) { - for (rr = rrmin; rr < rrmax; rr++) - for (cc = 0, row = rr + top; cc < 16; cc++) { + for (int rr = rrmin; rr < rrmax; rr++) + for (int cc = 0, row = rr + top; cc < 16; cc++) { cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } + rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; } } if (ccmax < cc1) { - for (rr = rrmin; rr < rrmax; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = rrmin; rr < rrmax; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; - } + rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; } } //also, fill the image corners if (rrmin > 0 && ccmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rr)*TS + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc]; - } + rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc]; } } if (rrmax < rr1 && ccmax < cc1) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rrmax + rr)*TS + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc]; - } + rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc]; } } if (rrmin > 0 && ccmax < cc1) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; - } + rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; } } if (rrmax < rr1 && ccmin > 0) { - for (rr = 0; rr < 16; rr++) - for (cc = 0; cc < 16; cc++) { + for (int rr = 0; rr < 16; rr++) + for (int cc = 0; cc < 16; cc++) { cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f; - - if(FC(rr, cc) == 1) { - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; - } + rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; } } #endif //end of border fill - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #ifdef __SSE2__ - __m128 delhv, delvv; - const __m128 epsv = _mm_set1_ps( eps ); + const vfloat epsv = F2V( eps ); - for (rr = 2; rr < rr1 - 2; rr++) { - for (cc = 0, indx = (rr) * TS + cc; cc < cc1; cc += 4, indx += 4) { - delhv = vabsf( LVFU( cfa[indx + 1] ) - LVFU( cfa[indx - 1] ) ); - delvv = vabsf( LVF( cfa[indx + v1] ) - LVF( cfa[indx - v1] ) ); - _mm_store_ps( &dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv ); - delhv = delhv * delhv; - _mm_store_ps( &dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv ); - delvv = delvv * delvv; - _mm_store_ps( &delhvsqsum[indx], delhv + delvv); + for (int rr = 2; rr < rr1 - 2; rr++) { + for (int indx = rr * TS; indx < rr * TS + cc1; indx += 4) { + vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) - LVFU( cfa[indx - 1] ) ); + vfloat delvv = vabsf( LVF( cfa[indx + v1] ) - LVF( cfa[indx - v1] ) ); + STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv ); + STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv ); + STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv)); } } #else - // horizontal and vedrtical gradient - float delh, delv; - for (rr = 2; rr < rr1 - 2; rr++) - for (cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) { - delh = fabsf(cfa[indx + 1] - cfa[indx - 1]); - delv = fabsf(cfa[indx + v1] - cfa[indx - v1]); + for (int rr = 2; rr < rr1 - 2; rr++) + for (int cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) { + // horizontal and vedrtical gradient + float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]); + float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]); dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv; dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh; //+fabsf(cfa[indx+2]-cfa[indx-2]); delhvsqsum[indx] = SQR(delh) + SQR(delv); @@ -581,63 +434,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #endif -#ifdef __SSE2__ - __m128 Dgrbsq1pv, Dgrbsq1mv, temp2v; - - for (rr = 6; rr < rr1 - 6; rr++) { - if((FC(rr, 2) & 1) == 0) { - for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { - tempv = LC2VFU(cfa[indx + 1]); - Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1]))); - _mm_storeu_ps( &delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1]))); - _mm_storeu_ps( &delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1]))); - Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1]))); - _mm_storeu_ps( &Dgrbsq1m[indx >> 1], Dgrbsq1mv ); - _mm_storeu_ps( &Dgrbsq1p[indx >> 1], Dgrbsq1pv ); - } - } else { - for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { - tempv = LC2VFU(cfa[indx]); - Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1]))); - _mm_storeu_ps( &delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1]))); - _mm_storeu_ps( &delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1]))); - Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1]))); - _mm_storeu_ps( &Dgrbsq1m[indx >> 1], Dgrbsq1mv ); - _mm_storeu_ps( &Dgrbsq1p[indx >> 1], Dgrbsq1pv ); - } - } - } - -#else - - for (rr = 6; rr < rr1 - 6; rr++) { - if((FC(rr, 2) & 1) == 0) { - for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { - delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]); - delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]); - Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1])); - Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1])); - } - } else { - for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { - Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1])); - Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1])); - delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]); - delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]); - } - } - } - -#endif - - // end of tile initialization - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - //interpolate vertical and horizontal color differences - #ifdef __SSE2__ - __m128 sgnv, cruv, crdv, crlv, crrv, guhav, gdhav, glhav, grhav, hwtv, vwtv, Gintvhav, Ginthhav, guarv, gdarv, glarv, grarv; - vmask clipmask; + vfloat sgnv; if( !(FC(4, 4) & 1) ) { sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f ); @@ -645,73 +444,81 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f ); } - __m128 zd5v = _mm_set1_ps( 0.5f ); - __m128 onev = _mm_set1_ps( 1.0f ); - __m128 arthreshv = _mm_set1_ps( arthresh ); - __m128 clip_pt8v = _mm_set1_ps( clip_pt8 ); + vfloat zd5v = F2V( 0.5f ); + vfloat onev = F2V( 1.0f ); + vfloat arthreshv = F2V( arthresh ); + vfloat clip_pt8v = F2V( clip_pt8 ); - for (rr = 4; rr < rr1 - 4; rr++) { + for (int rr = 4; rr < rr1 - 4; rr++) { sgnv = -sgnv; - for (cc = 4, indx = rr * TS + cc; cc < cc1 - 7; cc += 4, indx += 4) { + for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 7; indx += 4) { //color ratios in each cardinal direction - cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2]))); - crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2]))); - crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2]))); - crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2]))); + vfloat cfav = LVF(cfa[indx]); + vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2]))); + vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2]))); + vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2]))); + vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2]))); - guhav = LVF(cfa[indx - v1]) + zd5v * (LVF(cfa[indx]) - LVF(cfa[indx - v2])); - gdhav = LVF(cfa[indx + v1]) + zd5v * (LVF(cfa[indx]) - LVF(cfa[indx + v2])); - glhav = LVFU(cfa[indx - 1]) + zd5v * (LVF(cfa[indx]) - LVFU(cfa[indx - 2])); - grhav = LVFU(cfa[indx + 1]) + zd5v * (LVF(cfa[indx]) - LVFU(cfa[indx + 2])); + vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2])); + vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2])); + vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2])); + vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2])); - guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), LVF(cfa[indx]) * cruv, guhav); - gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), LVF(cfa[indx]) * crdv, gdhav); - glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), LVF(cfa[indx]) * crlv, glhav); - grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), LVF(cfa[indx]) * crrv, grhav); + vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav); + vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav); + vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav); + vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav); - hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1])); - vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1])); + vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1])); + vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1])); //interpolated G via adaptive weights of cardinal evaluations - Ginthhav = hwtv * grhav + (onev - hwtv) * glhav; - Gintvhav = vwtv * gdhav + (onev - vwtv) * guhav; + vfloat Ginthhav = vintpf(hwtv, grhav, glhav); + vfloat Gintvhav = vintpf(vwtv, gdhav, guhav); + //interpolated color differences + vfloat hcdaltv = sgnv * (Ginthhav - cfav); + vfloat vcdaltv = sgnv * (Gintvhav - cfav); + STVF(hcdalt[indx], hcdaltv); + STVF(vcdalt[indx], vcdaltv); - _mm_store_ps( &hcdalt[indx], sgnv * (Ginthhav - LVF(cfa[indx]))); - _mm_store_ps( &vcdalt[indx], sgnv * (Gintvhav - LVF(cfa[indx]))); - - clipmask = vorm( vorm( vmaskf_gt( LVF(cfa[indx]), clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v )); + vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v )); guarv = vself( clipmask, guhav, guarv); gdarv = vself( clipmask, gdhav, gdarv); glarv = vself( clipmask, glhav, glarv); grarv = vself( clipmask, grhav, grarv); - _mm_store_ps( &vcd[indx], vself( clipmask, LVF(vcdalt[indx]), sgnv * ((vwtv * gdarv + (onev - vwtv)*guarv) - LVF(cfa[indx])))); - _mm_store_ps( &hcd[indx], vself( clipmask, LVF(hcdalt[indx]), sgnv * ((hwtv * grarv + (onev - hwtv)*glarv) - LVF(cfa[indx])))); + STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav))); + STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav))); //differences of interpolations in opposite directions - _mm_store_ps(&dgintv[indx], _mm_min_ps(SQRV(guhav - gdhav), SQRV(guarv - gdarv))); - _mm_store_ps(&dginth[indx], _mm_min_ps(SQRV(glhav - grhav), SQRV(glarv - grarv))); + STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv))); + STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv))); } } #else - bool fcswitch; - for (rr = 4; rr < rr1 - 4; rr++) { - for (cc = 4, indx = rr * TS + cc, fcswitch = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) { + for (int rr = 4; rr < rr1 - 4; rr++) { + bool fcswitch = FC(rr, 4) & 1; + + for (int cc = 4, indx = rr * TS + cc; cc < cc1 - 4; cc++, indx++) { //color ratios in each cardinal direction - cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2])); - crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2])); - crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2])); - crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2])); + float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2])); + float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2])); + float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2])); + float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2])); - guha = HCLIP(cfa[indx - v1]) + xdiv2f(cfa[indx] - cfa[indx - v2]); - gdha = HCLIP(cfa[indx + v1]) + xdiv2f(cfa[indx] - cfa[indx + v2]); - glha = HCLIP(cfa[indx - 1]) + xdiv2f(cfa[indx] - cfa[indx - 2]); - grha = HCLIP(cfa[indx + 1]) + xdiv2f(cfa[indx] - cfa[indx + 2]); + //G interpolated in vert/hor directions using Hamilton-Adams method + float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]); + float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]); + float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]); + float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]); + + //G interpolated in vert/hor directions using adaptive ratios + float guar, gdar, glar, grar; if (fabsf(1.0f - cru) < arthresh) { guar = cfa[indx] * cru; @@ -737,12 +544,13 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, grar = grha; } - hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]); - vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); + //adaptive weights for vertical/horizontal directions + float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]); + float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); //interpolated G via adaptive weights of cardinal evaluations - Gintvha = vwt * gdha + (1.0f - vwt) * guha; - Ginthha = hwt * grha + (1.0f - hwt) * glha; + float Gintvha = vwt * gdha + (1.0f - vwt) * guha; + float Ginthha = hwt * grha + (1.0f - hwt) * glha; //interpolated color differences if (fcswitch) { @@ -781,13 +589,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #endif + + #ifdef __SSE2__ - __m128 hcdvarv, vcdvarv; - __m128 hcdaltvarv, vcdaltvarv, hcdv, vcdv, hcdaltv, vcdaltv, sgn3v, Ginthv, Gintvv, hcdoldv, vcdoldv; - __m128 threev = _mm_set1_ps( 3.0f ); - __m128 clip_ptv = _mm_set1_ps( clip_pt ); - __m128 nsgnv; - vmask hcdmask, vcdmask, tempmask; + vfloat clip_ptv = F2V( clip_pt ); + vfloat sgn3v; if( !(FC(4, 4) & 1) ) { sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f ); @@ -795,61 +601,61 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f ); } - sgn3v = threev * sgnv; + sgn3v = sgnv + sgnv + sgnv; - for (rr = 4; rr < rr1 - 4; rr++) { - nsgnv = sgnv; + for (int rr = 4; rr < rr1 - 4; rr++) { + vfloat nsgnv = sgnv; sgnv = -sgnv; sgn3v = -sgn3v; - for (cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc += 4, indx += 4) { - hcdv = LVF( hcd[indx] ); - hcdvarv = threev * (SQRV(LVFU(hcd[indx - 2])) + SQRV(hcdv) + SQRV(LVFU(hcd[indx + 2]))) - SQRV(LVFU(hcd[indx - 2]) + hcdv + LVFU(hcd[indx + 2])); - hcdaltv = LVF( hcdalt[indx] ); - hcdaltvarv = threev * (SQRV(LVFU(hcdalt[indx - 2])) + SQRV(hcdaltv) + SQRV(LVFU(hcdalt[indx + 2]))) - SQRV(LVFU(hcdalt[indx - 2]) + hcdaltv + LVFU(hcdalt[indx + 2])); - vcdv = LVF( vcd[indx] ); - vcdvarv = threev * (SQRV(LVF(vcd[indx - v2])) + SQRV(vcdv) + SQRV(LVF(vcd[indx + v2]))) - SQRV(LVF(vcd[indx - v2]) + vcdv + LVF(vcd[indx + v2])); - vcdaltv = LVF( vcdalt[indx] ); - vcdaltvarv = threev * (SQRV(LVF(vcdalt[indx - v2])) + SQRV(vcdaltv) + SQRV(LVF(vcdalt[indx + v2]))) - SQRV(LVF(vcdalt[indx - v2]) + vcdaltv + LVF(vcdalt[indx + v2])); + for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 4; indx += 4) { + vfloat hcdv = LVF( hcd[indx] ); + vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2])); + vfloat hcdaltv = LVF( hcdalt[indx] ); + vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2])); + vfloat vcdv = LVF( vcd[indx] ); + vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2])); + vfloat vcdaltv = LVF( vcdalt[indx] ); + vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2])); + //choose the smallest variance; this yields a smoother interpolation hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv); vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv); - Ginthv = sgnv * hcdv + LVF( cfa[indx] ); - temp2v = sgn3v * hcdv; - hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx])); - hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV ); - hcdoldv = hcdv; - tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) )); - hcdv = vself( vmaskf_lt( (temp2v), -(LVF(cfa[indx]) + Ginthv)), tempv, hwtv * hcdv + (onev - hwtv) * tempv); + vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] ); + vfloat temp2v = sgn3v * hcdv; + vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx])); + vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV ); + vfloat hcdoldv = hcdv; + vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) )); + hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv)); hcdv = vself( hcdmask, hcdv, hcdoldv ); hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv); - _mm_store_ps( &hcd[indx], hcdv); + STVF(hcd[indx], hcdv); - Gintvv = sgnv * vcdv + LVF( cfa[indx] ); + vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] ); temp2v = sgn3v * vcdv; - vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx])); - vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV ); - vcdoldv = vcdv; + vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx])); + vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV ); + vfloat vcdoldv = vcdv; tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) )); - vcdv = vself( vmaskf_lt( (temp2v), -(LVF(cfa[indx]) + Gintvv)), tempv, vwtv * vcdv + (onev - vwtv) * tempv); + vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv)); vcdv = vself( vcdmask, vcdv, vcdoldv ); vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv); - _mm_store_ps( &vcd[indx], vcdv); - _mm_storeu_ps(&cddiffsq[indx], SQRV(vcdv - hcdv)); + STVF(vcd[indx], vcdv); + STVFU(cddiffsq[indx], SQRV(vcdv - hcdv)); } } #else - for (rr = 4; rr < rr1 - 4; rr++) { - //for (cc=4+(FC(rr,2)&1),indx=rr*TS+cc,c=FC(rr,cc); cc (Ginth + cfa[indx])) { hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]; } else { - hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); + float hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]); } } @@ -878,7 +688,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (3.0f * vcd[indx] > (Gintv + cfa[indx])) { vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]; } else { - vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); + float vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]); } } @@ -903,7 +713,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (3.0f * hcd[indx] < -(Ginth + cfa[indx])) { hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]; } else { - hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); + float hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]); } } @@ -912,7 +722,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (3.0f * vcd[indx] < -(Gintv + cfa[indx])) { vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]; } else { - vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); + float vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]); } } @@ -925,8 +735,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]; } - //if (Ginth > pre_mul[c]) hcd[indx]=ULIM(Ginth,cfa[indx-1],cfa[indx+1])-cfa[indx];//for dcraw implementation - //if (Gintv > pre_mul[c]) vcd[indx]=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1])-cfa[indx]; cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]); } @@ -936,75 +744,78 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #endif + + #ifdef __SSE2__ - __m128 uavev, davev, lavev, ravev, Dgrbvvaruv, Dgrbvvardv, Dgrbhvarlv, Dgrbhvarrv, varwtv, diffwtv, vcdvar1v, hcdvar1v; - __m128 epssqv = _mm_set1_ps( epssq ); - vmask decmask; + vfloat epssqv = F2V( epssq ); - for (rr = 6; rr < rr1 - 6; rr++) { - for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { - //compute color difference variances in cardinal directions - tempv = LC2VFU(vcd[indx]); - uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]); - davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]); - Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev); - Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev); + for (int rr = 6; rr < rr1 - 6; rr++) { + for (int indx = rr * TS + 6 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 6; indx += 8) { + //compute colour difference variances in cardinal directions + vfloat tempv = LC2VFU(vcd[indx]); + vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]); + vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]); + vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev); + vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev); - hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1])); - vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1])); + vfloat hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1])); + vfloat vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1])); tempv = LC2VFU(hcd[indx]); - lavev = tempv + LC2VFU(hcd[indx - 1]) + LC2VFU(hcd[indx - 2]) + LC2VFU(hcd[indx - 3]); - ravev = tempv + LC2VFU(hcd[indx + 1]) + LC2VFU(hcd[indx + 2]) + LC2VFU(hcd[indx + 3]); - Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev); - Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev); + vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]); + vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]); + + vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev); + vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev); - vcdvarv = epssqv + vwtv * Dgrbvvardv + (onev - vwtv) * Dgrbvvaruv; - hcdvarv = epssqv + hwtv * Dgrbhvarrv + (onev - hwtv) * Dgrbhvarlv; + vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv); + vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv); //compute fluctuations in up/down and left/right interpolations of colors - Dgrbvvaruv = (LC2VFU(dgintv[indx])) + (LC2VFU(dgintv[indx - v1])) + (LC2VFU(dgintv[indx - v2])); - Dgrbvvardv = (LC2VFU(dgintv[indx])) + (LC2VFU(dgintv[indx + v1])) + (LC2VFU(dgintv[indx + v2])); - Dgrbhvarlv = (LC2VFU(dginth[indx])) + (LC2VFU(dginth[indx - 1])) + (LC2VFU(dginth[indx - 2])); - Dgrbhvarrv = (LC2VFU(dginth[indx])) + (LC2VFU(dginth[indx + 1])) + (LC2VFU(dginth[indx + 2])); + Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]); + Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]); - vcdvar1v = epssqv + vwtv * Dgrbvvardv + (onev - vwtv) * Dgrbvvaruv; - hcdvar1v = epssqv + hwtv * Dgrbhvarrv + (onev - hwtv) * Dgrbhvarlv; + Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]); + Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]); + + vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv); + vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv); //determine adaptive weights for G interpolation - varwtv = hcdvarv / (vcdvarv + hcdvarv); - diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v); + vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv); + vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v); //if both agree on interpolation direction, choose the one with strongest directional discrimination; //otherwise, choose the u/d and l/r difference fluctuation weights - decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) ); - _mm_storeu_ps( &hvwt[indx >> 1], vself( decmask, varwtv, diffwtv)); + vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) ); + STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv)); } } #else - for (rr = 6; rr < rr1 - 6; rr++) { - for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + for (int rr = 6; rr < rr1 - 6; rr++) { + for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { //compute color difference variances in cardinal directions - uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3]; - dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3]; - lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3]; - rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3]; + float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3]; + float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3]; + float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3]; + float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3]; - Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave); - Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave); - Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave); - Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave); + //color difference (G-R or G-B) variance in up/down/left/right directions + float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave); + float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave); + float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave); + float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave); - hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]); - vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); + float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]); + float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); - vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; - hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; + float vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; + float hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; //compute fluctuations in up/down and left/right interpolations of colors Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]); @@ -1012,12 +823,12 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]); Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]); - vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; - hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; + float vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; + float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; //determine adaptive weights for G interpolation - varwt = hcdvar / (vcdvar + hcdvar); - diffwt = hcdvar1 / (vcdvar1 + hcdvar1); + float varwt = hcdvar / (vcdvar + hcdvar); + float diffwt = hcdvar1 / (vcdvar1 + hcdvar1); //if both agree on interpolation direction, choose the one with strongest directional discrimination; //otherwise, choose the u/d and l/r difference fluctuation weights @@ -1027,255 +838,316 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, hvwt[indx >> 1] = diffwt; } - //hvwt[indx]=varwt; } } #endif - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + // Nyquist test - for (rr = 6; rr < rr1 - 6; rr++) - for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + int nystartrow = 0; + int nyendrow = 0; + int nystartcol = TS + 1; + int nyendcol = 0; + + for (int rr = 6; rr < rr1 - 6; rr++) { + for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients - nyqtest = (gaussodd[0] * cddiffsq[indx] + - gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + - cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) + - gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + - cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) + - gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + - cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])); - - nyqtest -= nyqthresh * (gaussgrad[0] * (delhvsqsum[indx]) + - gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + - delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) + - gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + - delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) + - gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + - delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) + - gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] + - delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] + - delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] + - delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) + - gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + - delhvsqsum[indx - p2] + delhvsqsum[indx + m2])); + // TODO_INGO: currently this part needs 10 float mults, 36 float adds, 4 int mults and 44 int adds for every second pixel + // it reads 304 bytes for every second pixel and writes <= 1 byte for every second pixel + // a precalculated vectorized version could do this with 1/4 of the operations + // but it would read 304 bytes for every second pixel and write 8 bytes for every second pixel for the precalculation + // (though the vectorized read should be faster than the scalar version) + // and read 8 bytes for every second pixel and write 1 byte for every second pixel for final calculation (maybe this last step can be avoided too) + float nyqtest1 = gaussodd[0] * cddiffsq[indx] + + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + + cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) + + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + + cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) + + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + + cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)]); + float nyqtest2 = gaussgrad[0] * delhvsqsum[indx] + + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + + delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) + + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + + delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) + + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + + delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) + + gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] + + delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] + + delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] + + delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) + + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + + delhvsqsum[indx - p2] + delhvsqsum[indx + m2]); - if (nyqtest > 0) { + if(nyqtest1 > nyqtest2) { nyquist[indx >> 1] = 1; //nyquist=1 for nyquist region - } - } - - unsigned int nyquisttemp; - - for (rr = 8; rr < rr1 - 8; rr++) { - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) { - - nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] + - nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] + - nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]); - - //if most of your neighbors are named Nyquist, it's likely that you're one too - if (nyquisttemp > 4) { - nyquist[indx >> 1] = 1; - } - - //or not - if (nyquisttemp < 4) { - nyquist[indx >> 1] = 0; + nystartrow = nystartrow ? nystartrow : rr; + nyendrow = rr; + nystartcol = nystartcol > cc ? cc : nystartcol; + nyendcol = nyendcol < cc ? cc : nyendcol; } } } - // end of Nyquist test - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol; - // in areas of Nyquist texture, do area interpolation - for (rr = 8; rr < rr1 - 8; rr++) - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) { + if(doNyquist) { + nyendrow ++; // because of < condition + nyendcol ++; // because of < condition + nystartcol -= (nystartcol & 1); + nystartrow = std::max(8, nystartrow); + nyendrow = std::min(rr1 - 8, nyendrow); + nystartcol = std::max(8, nystartcol); + nyendcol = std::min(cc1 - 8, nyendcol); - if (nyquist[indx >> 1]) { - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // area interpolation + for (int rr = nystartrow; rr < nyendrow; rr++) { + for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { + // TODO_INGO: if you look at the comments below, it does not seem to be correct to include nyquist[indx >> 1] into the summation + // Also this implementation has loop dependencies, which are not correct IMHO + // An implementation which uses a second buffer could avoid this dependencies and could be vectorized by factor 16 too (we're working with single bytes here) + // That would lead to differences in output compared to current code, but also would lead to more consistent output when changing TS + unsigned int nyquistneighbours = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] + + nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] + + nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]); - sumh = sumv = sumsqh = sumsqv = areawt = 0; + //if most of your neighbours are named Nyquist, it's likely that you're one too + if (nyquistneighbours > 4) { + nyquist[indx >> 1] = 1; + } - for (i = -6; i < 7; i += 2) - for (j = -6; j < 7; j += 2) { - indx1 = (rr + i) * TS + cc + j; + //or not + if (nyquistneighbours < 4) { + nyquist[indx >> 1] = 0; + } + } + } - if (nyquist[indx1 >> 1]) { - sumh += cfa[indx1] - xdiv2f(cfa[indx1 - 1] + cfa[indx1 + 1]); - sumv += cfa[indx1] - xdiv2f(cfa[indx1 - v1] + cfa[indx1 + v1]); - sumsqh += xdiv2f(SQR(cfa[indx1] - cfa[indx1 - 1]) + SQR(cfa[indx1] - cfa[indx1 + 1])); - sumsqv += xdiv2f(SQR(cfa[indx1] - cfa[indx1 - v1]) + SQR(cfa[indx1] - cfa[indx1 + v1])); - areawt += 1; + // end of Nyquist test + + // in areas of Nyquist texture, do area interpolation + for (int rr = nystartrow; rr < nyendrow; rr++) + for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { + + if (nyquist[indx >> 1]) { + // area interpolation + + float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f; + + for (int i = -6; i < 7; i += 2) { + int indx1 = indx + (i * TS) - 6; + + for (int j = -6; j < 7; j += 2, indx1 += 2) { + + if (nyquist[indx1 >> 1]) { + float cfatemp = cfa[indx1]; + sumcfa += cfatemp; + sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]); + sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]); + sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]); + sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]); + areawt += 1; + } } } - //horizontal and vertical color differences, and adaptive weight - hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh); - vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv); - hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar); + //horizontal and vertical color differences, and adaptive weight + sumh = sumcfa - xdiv2f(sumh); + sumv = sumcfa - xdiv2f(sumv); + sumsqh = xdiv2f(sumsqh); + sumsqv = xdiv2f(sumsqv); + float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh); + float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv); + hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar); - // end of area interpolation - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + // end of area interpolation + } } - } + } - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //populate G at R/B sites - for (rr = 8; rr < rr1 - 8; rr++) - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) { + for (int rr = 8; rr < rr1 - 8; rr++) + for (int indx = rr * TS + 8 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 8; indx += 2) { //first ask if one gets more directional discrimination from nearby B/R sites - hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2); + float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2); -// hvwtalt = 0.25*(hvwt[(indx-m1)>>1]+hvwt[(indx+p1)>>1]+hvwt[(indx-p1)>>1]+hvwt[(indx+m1)>>1]); -// vo=fabsf(0.5-hvwt[indx>>1]); -// ve=fabsf(0.5-hvwtalt); - if (fabsf(0.5 - hvwt[indx >> 1]) < fabsf(0.5 - hvwtalt)) { - hvwt[indx >> 1] = hvwtalt; //a better result was obtained from the neighbors - } + hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1]; + //a better result was obtained from the neighbours -// if (vo>1]=hvwtalt;}//a better result was obtained from the neighbors + Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate color differences - - - Dgrb[0][indx >> 1] = (hcd[indx] * (1.0f - hvwt[indx >> 1]) + vcd[indx] * hvwt[indx >> 1]); //evaluate color differences - //if (hvwt[indx]<0.5) Dgrb[indx][0]=hcd[indx]; - //if (hvwt[indx]>0.5) Dgrb[indx][0]=vcd[indx]; rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!) //local curvature in G (preparation for nyquist refinement step) - if (nyquist[indx >> 1]) { - Dgrb2[indx >> 1].h = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])); - Dgrb2[indx >> 1].v = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])); - } else { - Dgrb2[indx >> 1].h = Dgrb2[indx >> 1].v = 0; - } + Dgrb2[indx >> 1].h = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f; + Dgrb2[indx >> 1].v = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f; } + //end of standard interpolation - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // refine Nyquist areas using G curvatures + if(doNyquist) { + for (int rr = nystartrow; rr < nyendrow; rr++) + // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps + for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { - for (rr = 8; rr < rr1 - 8; rr++) - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) { + if (nyquist[indx >> 1]) { + //local averages (over Nyquist pixels only) of G curvature squared + float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h + + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) + + gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) + + gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h)); + float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v + + gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) + + gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) + + gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v)); + //use the results as weights for refined G interpolation + Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh); + rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; + } + } + } - if (nyquist[indx >> 1]) { - //local averages (over Nyquist pixels only) of G curvature squared - gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h + - gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) + - gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) + - gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h)); - gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v + - gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) + - gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) + - gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v)); - //use the results as weights for refined G interpolation - Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh); - rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; + +#ifdef __SSE2__ + + for (int rr = 6; rr < rr1 - 6; rr++) { + if((FC(rr, 2) & 1) == 0) { + for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { + vfloat tempv = LC2VFU(cfa[indx + 1]); + vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1]))); + STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1]))); + STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1]))); + vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1]))); + STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv ); + STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv ); + } + } else { + for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { + vfloat tempv = LC2VFU(cfa[indx]); + vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1]))); + STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1]))); + STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1]))); + vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1]))); + STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv ); + STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv ); } } + } - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#else + + for (int rr = 6; rr < rr1 - 6; rr++) { + if((FC(rr, 2) & 1) == 0) { + for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]); + delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]); + Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1])); + Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1])); + } + } else { + for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1])); + Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1])); + delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]); + delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]); + } + } + } + +#endif // diagonal interpolation correction #ifdef __SSE2__ - __m128 rbsev, rbnwv, rbnev, rbswv, cfav, rbmv, rbpv, temp1v, wtv; - __m128 wtsev, wtnwv, wtnev, wtswv, rbvarmv; - __m128 gausseven0v = _mm_set1_ps(gausseven[0]); - __m128 gausseven1v = _mm_set1_ps(gausseven[1]); - __m128 twov = _mm_set1_ps(2.0f); + vfloat gausseven0v = F2V(gausseven[0]); + vfloat gausseven1v = F2V(gausseven[1]); #endif - for (rr = 8; rr < rr1 - 8; rr++) { + for (int rr = 8; rr < rr1 - 8; rr++) { #ifdef __SSE2__ - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 8, indx += 8, indx1 += 4) { + for (int indx = rr * TS + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 8; indx += 8, indx1 += 4) { //diagonal color ratios - cfav = LC2VFU(cfa[indx]); + vfloat cfav = LC2VFU(cfa[indx]); - temp1v = LC2VFU(cfa[indx + m1]); - temp2v = LC2VFU(cfa[indx + m2]); - rbsev = (temp1v + temp1v) / (epsv + cfav + temp2v ); + vfloat temp1v = LC2VFU(cfa[indx + m1]); + vfloat temp2v = LC2VFU(cfa[indx + m2]); + vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v ); rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v)); temp1v = LC2VFU(cfa[indx - m1]); temp2v = LC2VFU(cfa[indx - m2]); - rbnwv = (temp1v + temp1v) / (epsv + cfav + temp2v ); + vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v ); rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v)); temp1v = epsv + LVFU(delm[indx1]); - wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr - wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]); + vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr + vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]); - rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv); + vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv); temp1v = ULIMV(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])); - wtv = twov * (cfav - rbmv) / (epsv + rbmv + cfav); - temp2v = wtv * rbmv + (onev - wtv) * temp1v; + vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav); + temp2v = vintpf(wtv, rbmv, temp1v); temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v); temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv); - _mm_storeu_ps(&rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v )); + STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v )); temp1v = LC2VFU(cfa[indx + p1]); temp2v = LC2VFU(cfa[indx + p2]); - rbnev = (temp1v + temp1v) / (epsv + cfav + temp2v ); + vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v ); rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v)); temp1v = LC2VFU(cfa[indx - p1]); temp2v = LC2VFU(cfa[indx - p2]); - rbswv = (temp1v + temp1v) / (epsv + cfav + temp2v ); + vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v ); rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v)); temp1v = epsv + LVFU(delp[indx1]); - wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]); - wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]); + vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]); + vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]); - rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv); + vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv); temp1v = ULIMV(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])); - wtv = twov * (cfav - rbpv) / (epsv + rbpv + cfav); - temp2v = wtv * rbpv + (onev - wtv) * temp1v; + wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav); + temp2v = vintpf(wtv, rbpv, temp1v); temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v); temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv); - _mm_storeu_ps(&rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v )); + STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v )); - - - rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) + - gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) + + vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) + + gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1]))); - _mm_storeu_ps(&pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) + - gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) + - LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv)); + STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) + + gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) + + LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv)); } #else - for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) { + for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) { //diagonal color ratios - crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2])); - crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2])); - crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2])); - crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2])); + float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2])); + float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2])); + float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2])); + float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2])); + //color differences in diagonal directions + float rbse, rbnw, rbne, rbsw; //assign B/R at R/B sites if (fabsf(1.0f - crse) < arthresh) { @@ -1302,33 +1174,30 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]); } - wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr - wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1]; - wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1]; - wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1]; + float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr + float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1]; + float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1]; + float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1]; rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw); rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw); - /* - rbvarp = epssq + (gausseven[0]*(Dgrbsq1[indx-v1].p+Dgrbsq1[indx-1].p+Dgrbsq1[indx+1].p+Dgrbsq1[indx+v1].p) + - gausseven[1]*(Dgrbsq1[indx-v2-1].p+Dgrbsq1[indx-v2+1].p+Dgrbsq1[indx-2-v1].p+Dgrbsq1[indx+2-v1].p+ - Dgrbsq1[indx-2+v1].p+Dgrbsq1[indx+2+v1].p+Dgrbsq1[indx+v2-1].p+Dgrbsq1[indx+v2+1].p)); - */ - rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) + - gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] + - Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1])); + + //variance of R-B in plus/minus directions + float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) + + gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] + + Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1])); pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) + gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] + Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm); - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //bound the interpolation in regions of high saturation + if (rbp[indx1] < cfa[indx]) { if (xmul2f(rbp[indx1]) < cfa[indx]) { rbp[indx1] = ULIM(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]); } else { - pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]); + float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]); rbp[indx1] = pwt * rbp[indx1] + (1.0f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]); } } @@ -1337,56 +1206,48 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (xmul2f(rbm[indx1]) < cfa[indx]) { rbm[indx1] = ULIM(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]); } else { - mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]); + float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]); rbm[indx1] = mwt * rbm[indx1] + (1.0f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]); } } if (rbp[indx1] > clip_pt) { - rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]); //for RT implementation + rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]); } if (rbm[indx1] > clip_pt) { rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]); } - - //c=2-FC(rr,cc);//for dcraw implementation - //if (rbp[indx] > pre_mul[c]) rbp[indx]=ULIM(rbp[indx],cfa[indx-p1],cfa[indx+p1]); - //if (rbm[indx] > pre_mul[c]) rbm[indx]=ULIM(rbm[indx],cfa[indx-m1],cfa[indx+m1]); - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - //rbint[indx] = 0.5*(cfa[indx] + (rbp*rbvarm+rbm*rbvarp)/(rbvarp+rbvarm));//this is R+B, interpolated } #endif } #ifdef __SSE2__ - __m128 pmwtaltv; - __m128 zd25v = _mm_set1_ps(0.25f); + vfloat zd25v = F2V(0.25f); #endif - for (rr = 10; rr < rr1 - 10; rr++) + for (int rr = 10; rr < rr1 - 10; rr++) #ifdef __SSE2__ - for (cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 8, indx += 8, indx1 += 4) { + for (int indx = rr * TS + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 10; indx += 8, indx1 += 4) { //first ask if one gets more directional discrimination from nearby B/R sites - pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1])); - tempv = LVFU(pmwt[indx1]); + vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1])); + vfloat tempv = LVFU(pmwt[indx1]); tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv); - _mm_storeu_ps( &pmwt[indx1], tempv); - _mm_storeu_ps( &rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + LVFU(rbm[indx1]) * (onev - tempv) + LVFU(rbp[indx1]) * tempv)); + STVFU(pmwt[indx1], tempv); + STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1])))); } #else - for (cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) { + for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) { //first ask if one gets more directional discrimination from nearby B/R sites - pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2); + float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2); if (fabsf(0.5 - pmwt[indx1]) < fabsf(0.5 - pmwtalt)) { - pmwt[indx1] = pmwtalt; //a better result was obtained from the neighbors + pmwt[indx1] = pmwtalt; //a better result was obtained from the neighbours } rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.0f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated @@ -1394,8 +1255,64 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #endif - for (rr = 12; rr < rr1 - 12; rr++) - for (cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) { + for (int rr = 12; rr < rr1 - 12; rr++) +#ifdef __SSE2__ + for (int indx = rr * TS + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 12; indx += 8, indx1 += 4) { + vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1]))); + + if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end + //now interpolate G vertically/horizontally using R+B values + //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts + //color ratios for G interpolation + vfloat rbintv = LVFU(rbint[indx1]); + + //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction + vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)])); + vfloat guv = rbintv * cruv; + vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)])); + guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v); + + vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)])); + vfloat gdv = rbintv * crdv; + vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)])); + gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v); + + vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1])); + vfloat Gint1v = ULIMV(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])); + vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv); + vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v); + Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v); + Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv); + Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv); + + vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)])); + vfloat glv = rbintv * crlv; + vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)])); + glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v); + + vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)])); + vfloat grv = rbintv * crrv; + vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)])); + grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v); + + vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1])); + vfloat Gint1h = ULIMV(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])); + vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv); + vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h); + Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h); + Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv); + Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv); + + vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx])); + STC2VFU(rgbgreen[indx], greenv); + + STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1]))); + } + } + +#else + + for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) { if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) { continue; @@ -1403,55 +1320,52 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, //now interpolate G vertically/horizontally using R+B values //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts - //color ratios for G interpolation - cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]); - crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]); - crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]); - crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]); + //color ratios for G interpolation + float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]); + float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]); + float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]); + float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]); + + //interpolation of G in four directions + float gu, gd, gl, gr; //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction - if (fabsf(1.0f - cru) < arthresh) { + if (fabsf(1.f - cru) < arthresh) { gu = rbint[indx1] * cru; } else { gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]); } - if (fabsf(1.0f - crd) < arthresh) { + if (fabsf(1.f - crd) < arthresh) { gd = rbint[indx1] * crd; } else { gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]); } - if (fabsf(1.0f - crl) < arthresh) { + if (fabsf(1.f - crl) < arthresh) { gl = rbint[indx1] * crl; } else { gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]); } - if (fabsf(1.0f - crr) < arthresh) { + if (fabsf(1.f - crr) < arthresh) { gr = rbint[indx1] * crr; } else { gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]); } - //gu=rbint[indx]*cru; - //gd=rbint[indx]*crd; - //gl=rbint[indx]*crl; - //gr=rbint[indx]*crr; - //interpolated G via adaptive weights of cardinal evaluations - Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]); - Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]); + float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]); + float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]); - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //bound the interpolation in regions of high saturation if (Gintv < rbint[indx1]) { if (2 * Gintv < rbint[indx1]) { Gintv = ULIM(Gintv , cfa[indx - v1], cfa[indx + v1]); } else { - vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]); - Gintv = vwt * Gintv + (1.0f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]); + float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]); + Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]); } } @@ -1459,74 +1373,64 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (2 * Ginth < rbint[indx1]) { Ginth = ULIM(Ginth , cfa[indx - 1], cfa[indx + 1]); } else { - hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]); - Ginth = hwt * Ginth + (1.0f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]); + float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]); + Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]); } } if (Ginth > clip_pt) { - Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]); //for RT implementation + Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]); } if (Gintv > clip_pt) { Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]); } - //c=FC(rr,cc);//for dcraw implementation - //if (Ginth > pre_mul[c]) Ginth=ULIM(Ginth,cfa[indx-1],cfa[indx+1]); - //if (Gintv > pre_mul[c]) Gintv=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1]); - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - rgbgreen[indx] = Ginth * (1.0f - hvwt[indx1]) + Gintv * hvwt[indx1]; - //rgb[indx][1] = 0.5*(rgb[indx][1]+0.25*(rgb[indx-v1][1]+rgb[indx+v1][1]+rgb[indx-1][1]+rgb[indx+1][1])); + rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1]; Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx]; - - //rgb[indx][2-FC(rr,cc)]=2*rbint[indx]-cfa[indx]; } +#endif + //end of diagonal interpolation correction - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //fancy chrominance interpolation //(ey,ex) is location of R site - for (rr = 13 - ey; rr < rr1 - 12; rr += 2) - for (cc = 13 - ex, indx1 = (rr * TS + cc) >> 1; cc < cc1 - 12; cc += 2, indx1++) { //B coset + for (int rr = 13 - ey; rr < rr1 - 12; rr += 2) + for (int indx1 = (rr * TS + 13 - ex) >> 1; indx1 < (rr * TS + cc1 - 12) >> 1; indx1++) { //B coset Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R Dgrb[0][indx1] = 0; } #ifdef __SSE2__ -// __m128 wtnwv,wtnev,wtswv,wtsev; - __m128 oned325v = _mm_set1_ps( 1.325f ); - __m128 zd175v = _mm_set1_ps( 0.175f ); - __m128 zd075v = _mm_set1_ps( 0.075f ); + vfloat oned325v = F2V( 1.325f ); + vfloat zd175v = F2V( 0.175f ); + vfloat zd075v = F2V( 0.075f ); #endif - for (rr = 14; rr < rr1 - 14; rr++) + for (int rr = 14; rr < rr1 - 14; rr++) #ifdef __SSE2__ - for (cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) { - wtnwv = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))); - wtnev = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))); - wtswv = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))); - wtsev = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))); + for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) { + vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1])); + vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1])); + vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))); + vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))); + vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))); + vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))); - //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse); - - _mm_storeu_ps(&Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - m1 - v2) >> 1]) ) + - wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + p1 + v2) >> 1]) ) + - wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - p1 - v2) >> 1]) ) + - wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + m1 + v2) >> 1]) )) / (wtnwv + wtnev + wtswv + wtsev)); + STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) + + wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) + + wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) + + wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev)); } #else - for (cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) { - wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1])); - wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1])); - wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1])); - wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1])); - - //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse); + for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) { + float wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1])); + float wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1])); + float wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1])); + float wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1])); Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) + @@ -1535,13 +1439,55 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } #endif - float temp; + //tile vars + //counters for pixel location in the image + int row, col; + //counters for pixel location within the tile + int cc; + //pointer counters within the tile + int indx; - for (rr = 16; rr < rr1 - 16; rr++) { - if((FC(rr, 2) & 1) == 1) { - for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { + // end of tile initialization + +#ifdef __SSE2__ + int offset; + vfloat twov = F2V(2.f); + vmask selmask; + + if((FC(16, 2) & 1) == 1) { + selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0); + offset = 1; + } else { + selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + offset = 0; + } + +#endif + + for (int rr = 16; rr < rr1 - 16; rr++) { +#ifdef __SSE2__ + offset = 1 - offset; + selmask = vnotm(selmask); + + for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 18 - (cc1 & 1); cc += 4, indx += 4) { + col = cc + left; + vfloat greenv = LVF(rgbgreen[indx]); + vfloat temp00v = vdup(LVF(hvwt[(indx - v1) >> 1])); + vfloat temp01v = vdup(LVF(hvwt[(indx + v1) >> 1])); + vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v); + + vfloat redv1 = greenv - (temp00v * vdup(LVF(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVF(Dgrb[0][(indx + v1) >> 1]))) * tempv; + vfloat bluev1 = greenv - (temp00v * vdup(LVF(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVF(Dgrb[1][(indx + v1) >> 1]))) * tempv; + vfloat redv2 = greenv - vdup(LVF(Dgrb[0][indx >> 1])); + vfloat bluev2 = greenv - vdup(LVF(Dgrb[1][indx >> 1])); + STVFU(red[row][col], c65535v * vself(selmask, redv1, redv2)); + STVFU(blue[row][col], c65535v * vself(selmask, bluev1, bluev2)); + } + + if(offset == 0) { + for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { col = cc + left; - temp = 1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1])); + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * @@ -1555,7 +1501,54 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if(cc1 & 1) { // width of tile is odd col = cc + left; - temp = 1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1])); + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + temp); + blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + temp); + } + } else { + for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { + col = cc + left; + red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + + indx++; + col++; + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + temp); + blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + temp); + } + + if(cc1 & 1) { // width of tile is odd + col = cc + left; + red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + } + } + +#else + + if((FC(rr, 2) & 1) == 1) { + for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { + col = cc + left; + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + temp); + blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + temp); + + indx++; + col++; + red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + } + + if(cc1 & 1) { // width of tile is odd + col = cc + left; + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * @@ -1569,7 +1562,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, indx++; col++; - temp = 1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1])); + float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * @@ -1582,33 +1575,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); } } + +#endif } - - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - // copy smoothed results back to image matrix - for (rr = 16; rr < rr1 - 16; rr++) { + for (int rr = 16; rr < rr1 - 16; rr++) { + int row = rr + top; + int cc = 16; #ifdef __SSE2__ - for (row = rr + top, cc = 16; cc < cc1 - 19; cc += 4) { - _mm_storeu_ps(&green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v); - } - -#else - - for (row = rr + top, cc = 16; cc < cc1 - 16; cc++) { - col = cc + left; - indx = rr * TS + cc; - green[row][col] = ((65535.0f * rgbgreen[indx])); - - //for dcraw implementation - //for (c=0; c<3; c++){ - // image[indx][c] = CLIP((int)(65535.0f*rgb[rr*TS+cc][c] + 0.5f)); - //} + for (; cc < cc1 - 19; cc += 4) { + STVFU(green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v); } #endif + + for (; cc < cc1 - 16; cc++) { + green[row][cc + left] = 65535.0f * rgbgreen[rr * TS + cc]; + } } //end of main loop @@ -1616,26 +1601,19 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if(plistener) { progresscounter++; - if(progresscounter % 16 == 0) { - #pragma omp critical + if(progresscounter % 32 == 0) { +#ifdef _OPENMP + #pragma omp critical (amazeprogress) +#endif { - progress += (double)16 * ((TS - 32) * (TS - 32)) / (height * width); - - if (progress > 1.0) - { - progress = 1.0; - } - + progress += (double)32 * ((TS - 32) * (TS - 32)) / (height * width); + progress = progress > 1.0 ? 1.0 : progress; plistener->setProgress(progress); } } } } - // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - - // clean up free(buffer); } @@ -1644,7 +1622,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, plistener->setProgress(1.0); } - // done #undef TS diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index b60b5c9bc..7bc480861 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -4,7 +4,6 @@ #ifdef __GNUC__ #define INLINE __inline -//#define INLINE __attribute__((always_inline)) #else #define INLINE inline #endif @@ -48,20 +47,20 @@ typedef __m128i vint2; // SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself #define STC2VFU(a,v) {\ __m128 TST1V = _mm_loadu_ps(&a);\ - __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\ + __m128 TST2V = _mm_unpacklo_ps(v,v);\ _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\ TST1V = _mm_loadu_ps((&a)+4);\ - TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\ + TST2V = _mm_unpackhi_ps(v,v);\ _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\ } #else #define STC2VFU(a,v) {\ __m128 TST1V = _mm_loadu_ps(&a);\ - __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\ + __m128 TST2V = _mm_unpacklo_ps(v,v);\ vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\ _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\ TST1V = _mm_loadu_ps((&a)+4);\ - TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\ + TST2V = _mm_unpackhi_ps(v,v);\ _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\ } #endif diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h index 060f0c4ff..951d397bf 100644 --- a/rtengine/rt_math.h +++ b/rtengine/rt_math.h @@ -78,5 +78,15 @@ inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d) { return std::max(d, std::max(c, std::max(a, b))); } + +template +inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) { + // calculate a * b + (1 - a) * c + // following is valid: + // intp(a, b+x, c+x) = vintpf(a, b, c) + x + // intp(a, b*x, c*x) = vintpf(a, b, c) * x + return a * (b-c) + c; +} + } #endif diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index 453025fd6..a0300a1cc 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -1316,7 +1316,9 @@ return vmaxf( b, vminf(a,c)); } static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c ){ - return vself( vmaskf_lt(b,c), LIMV(a,b,c), LIMV(a,c,b)); + // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is independent on order of arguments + // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a) + return vmaxf(vminf(a,b), vminf(vmaxf(a,b),c)); } static INLINE vfloat SQRV(vfloat a){ @@ -1324,17 +1326,45 @@ static INLINE vfloat SQRV(vfloat a){ } static inline void vswap( vmask condition, vfloat &a, vfloat &b) { + // conditional swap the elements of two vfloats vfloat temp = vself(condition, a, b); // the values which fit to condition condition = vnotm(condition); // invert the condition a = vself(condition, a, b); // the values which fit to inverted condition b = temp; } -static inline float vhadd( vfloat a ) -{ +static inline float vhadd( vfloat a ) { + // returns a[0] + a[1] + a[2] + a[3] a += _mm_movehl_ps(a, a); return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1))); } +static INLINE vfloat vmul2f(vfloat a){ + // fastest way to multiply by 2 + return a + a; +} + +static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) { + // calculate a * b + (1 - a) * c (interpolate two values) + // following is valid: + // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x + // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x + return a * (b-c) + c; +} + +static INLINE vfloat vdup(vfloat a){ + // returns { a[0],a[0],a[1],a[1] } + return _mm_unpacklo_ps( a, a ); +} + +static INLINE vfloat vaddc2vfu(float &a) +{ + // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] } + vfloat a1 = _mm_loadu_ps( &a ); + vfloat a2 = _mm_loadu_ps( (&a) + 4 ); + return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 )); +} + + #endif // __SSE2__ #endif // SLEEFSSEAVX From ee665d67908eb9b3652aaf9d42a1923b5aab838f Mon Sep 17 00:00:00 2001 From: heckflosse Date: Tue, 26 Jan 2016 13:10:38 +0100 Subject: [PATCH 2/3] Amaze Demosaic: Speedup, cleaned code, changed nyquist code --- rtengine/amaze_demosaic_RT.cc | 799 ++++++++++++++++------------------ rtengine/helpersse2.h | 11 +- rtengine/rt_math.h | 4 +- rtengine/sleefsseavx.c | 24 +- 4 files changed, 418 insertions(+), 420 deletions(-) diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc index 3b367ee2b..2720521aa 100644 --- a/rtengine/amaze_demosaic_RT.cc +++ b/rtengine/amaze_demosaic_RT.cc @@ -9,6 +9,7 @@ // incorporating ideas of Luis Sanz Rodrigues and Paul Lee // // code dated: May 27, 2010 +// latest modification: Ingo Weyrich, January 25, 2016 // // amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -52,9 +53,15 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, const float clip_pt = 1.0 / initialGain; const float clip_pt8 = 0.8 / initialGain; - -#define TS 160 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading -#define TSH 80 // half of Tile size +// this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster +// If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines +#ifndef AMAZETS +#define AMAZETS 160 +#endif + // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading + // We assure that Tile size is a multiple of 32 in the range [96;992] + constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992); + constexpr int tsh = ts / 2; // half of Tile size //offset of R pixel within a Bayer quartet int ex, ey; @@ -79,27 +86,27 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } //shifts of pointer value to access pixels in vertical and diagonal directions - static const int v1 = TS, v2 = 2 * TS, v3 = 3 * TS, p1 = -TS + 1, p2 = -2 * TS + 2, p3 = -3 * TS + 3, m1 = TS + 1, m2 = 2 * TS + 2, m3 = 3 * TS + 3; + constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3; //tolerance to avoid dividing by zero - static const float eps = 1e-5, epssq = 1e-10; //tolerance to avoid dividing by zero + constexpr float eps = 1e-5, epssq = 1e-10; //tolerance to avoid dividing by zero //adaptive ratios threshold - static const float arthresh = 0.75; + constexpr float arthresh = 0.75; //gaussian on 5x5 quincunx, sigma=1.2 - static const float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f}; + constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f}; //nyquist texture test threshold - static const float nyqthresh = 0.5; + constexpr float nyqthresh = 0.5; //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop // Is this really sigma=1.2????, seems more like sigma = 1.672 - static const float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f, - nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f - }; + constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f, + nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f + }; //gaussian on 5x5 alt quincunx, sigma=1.5 - static const float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f}; + constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f}; //guassian on quincunx grid - static const float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f}; + constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f}; typedef struct { float h; @@ -112,115 +119,87 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, { int progresscounter = 0; -#define CLF 1 + constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ... // assign working space - char *buffer = (char *) calloc(13 * sizeof(float) * TS * TS + sizeof(float) * TS * TSH + sizeof(char) * TS * TSH + 18 * CLF * 64 + 63, 1); + char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1); // aligned to 64 byte boundary char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64); // green values float *rgbgreen = (float (*)) data; // sum of square of horizontal gradient and square of vertical gradient - float *delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64); + float *delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1 // gradient based directional weights for interpolation - float *dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64); - float *dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64); - // vertically interpolated color differences G-R, G-B - float *vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64); - // horizontally interpolated color differences - float *hcd = (float (*)) ((char*)vcd + sizeof(float) * TS * TS + CLF * 64); + float *dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1 + float *dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * ts * ts + cldf * 64); // 1 + // vertically interpolated colour differences G-R, G-B + float *vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1 + // horizontally interpolated colour differences + float *hcd = (float (*)) ((char*)vcd + sizeof(float) * ts * ts + cldf * 64); // 1 // alternative vertical interpolation - float *vcdalt = (float (*)) ((char*)hcd + sizeof(float) * TS * TS + CLF * 64); + float *vcdalt = (float (*)) ((char*)hcd + sizeof(float) * ts * ts + cldf * 64); // 1 // alternative horizontal interpolation - float *hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64); - // square of average color difference - float *cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64); + float *hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1 + // square of average colour difference + float *cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1 // weight to give horizontal vs vertical interpolation - float *hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * TS * TS + 2 * CLF * 64); - // final interpolated color difference - float (*Dgrb)[TS * TSH] = (float (*)[TS * TSH])vcdalt; // there is no overlap in buffer usage => share + float *hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1 + // final interpolated colour difference + float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share // gradient in plus (NE/SW) direction float *delp = (float (*))cddiffsq; // there is no overlap in buffer usage => share // gradient in minus (NW/SE) direction - float *delm = (float (*)) ((char*)delp + sizeof(float) * TS * TSH + CLF * 64); + float *delm = (float (*)) ((char*)delp + sizeof(float) * ts * tsh + cldf * 64); // diagonal interpolation of R+B float *rbint = (float (*))delm; // there is no overlap in buffer usage => share // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions) - s_hv *Dgrb2 = (s_hv (*)) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64); + s_hv *Dgrb2 = (s_hv (*)) ((char*)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1 // difference between up/down interpolations of G float *dgintv = (float (*))Dgrb2; // there is no overlap in buffer usage => share // difference between left/right interpolations of G - float *dginth = (float (*)) ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64); + float *dginth = (float (*)) ((char*)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1 // square of diagonal colour differences - float *Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * TS * TS + CLF * 64); - float *Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64); + float *Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * ts * ts + cldf * 64); // 1 + float *Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1 // tile raw data - float *cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64); + float *cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1 // relative weight for combining plus and minus diagonal interpolations float *pmwt = (float (*))delhvsqsum; // there is no overlap in buffer usage => share - // interpolated color difference R-B in minus and plus direction + // interpolated colour difference R-B in minus and plus direction float *rbm = (float (*))vcd; // there is no overlap in buffer usage => share - float *rbp = (float (*)) ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64); - // nyquist texture flag 1=nyquist, 0=not nyquist - unsigned char *nyquist = (unsigned char (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64); - /* - rgbgreen = (float (*)) data; //pointers to array - delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64); - dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64); - dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64); - vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64); - hcd = (float (*)) ((char*)vcd + sizeof(float) * TS * TS + CLF * 64); - vcdalt = (float (*)) ((char*)hcd + sizeof(float) * TS * TS + CLF * 64); - hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64); - cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64); - hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64); - Dgrb = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64); - delp = (float (*)) ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64); - delm = (float (*)) ((char*)delp + sizeof(float) * TS * TSH + CLF * 64); - rbint = (float (*)) ((char*)delm + sizeof(float) * TS * TSH + CLF * 64); - Dgrb2 = (s_hv (*)) ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64); - dgintv = (float (*)) ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64); - dginth = (float (*)) ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64); - Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * TS * TS + CLF * 64); - Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64); - cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64); - pmwt = (float (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64); - rbm = (float (*)) ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64); - rbp = (float (*)) ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64); - - nyquist = (char (*)) ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64); - */ -#undef CLF + float *rbp = (float (*)) ((char*)rbm + sizeof(float) * ts * tsh + cldf * 64); + // nyquist texture flags 1=nyquist, 0=not nyquist + unsigned char *nyquist = (unsigned char (*)) ((char*)cfa + sizeof(float) * ts * ts + cldf * 64); // 1 + unsigned char *nyquist2 = (unsigned char (*))cddiffsq; + float *nyqutest = (float(*)) ((char*)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1 // Main algorithm: Tile loop - - // Issue 1676 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling #ifdef _OPENMP #pragma omp for schedule(dynamic) collapse(2) nowait #endif - for (int top = winy - 16; top < winy + height; top += TS - 32) - for (int left = winx - 16; left < winx + width; left += TS - 32) { + for (int top = winy - 16; top < winy + height; top += ts - 32) { + for (int left = winx - 16; left < winx + width; left += ts - 32) { #ifdef __SSE2__ // Using SSE2 we can zero the memory without cache pollution vfloat zerov = ZEROV; - for(int i = 3 * TSH; i < (TS - 6)*TSH; i += 16) { + for(int i = 3 * tsh; i < (ts - 6)*tsh; i += 16) { _mm_stream_ps((float*)&nyquist[i], zerov); } #else - memset(&nyquist[3 * TSH], 0, sizeof(unsigned char) * (TS - 6) * TSH); + memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh); #endif //location of tile bottom edge - const int bottom = min(top + TS, winy + height + 16); + int bottom = min(top + ts, winy + height + 16); //location of tile right edge - const int right = min(left + TS, winx + width + 16); - //tile width (=TS except for right edge of image) - const int rr1 = bottom - top; - //tile height (=TS except for bottom edge of image) - const int cc1 = right - left; + int right = min(left + ts, winx + width + 16); + //tile width (=ts except for right edge of image) + int rr1 = bottom - top; + //tile height (=ts except for bottom edge of image) + int cc1 = right - left; // bookkeeping for borders // min and max row/column in the tile int rrmin = top < winy ? 16 : 0; @@ -232,54 +211,87 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, // rgb values should be floating point number between 0 and 1 // after white balance multipliers are applied // a 16 pixel border is added to each side of the image + + // begin of tile initialization #ifdef __SSE2__ - const vfloat c65535v = F2V( 65535.0f ); + vfloat c65535v = F2V( 65535.f ); //fill upper border if (rrmin > 0) { - for (int rr = 0; rr < 16; rr++) - for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { - cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f; - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; + for (int rr = 0; rr < 16; rr++) { + int row = 32 - rr + top; + + for (int cc = ccmin; cc < ccmax; cc += 4) { + int indx1 = rr * ts + cc; + vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v; + STVF(cfa[indx1], tempv); + STVF(rgbgreen[indx1], tempv ); } + } } // fill inner part for (int rr = rrmin; rr < rrmax; rr++) { int row = rr + top; - int cc = ccmin; - for (; cc < ccmax - 3; cc += 4) { - int indx1 = rr * TS + cc; + for (int cc = ccmin; cc < ccmax; cc += 4) { + int indx1 = rr * ts + cc; vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v; STVF(cfa[indx1], tempv ); STVF(rgbgreen[indx1], tempv ); } - - for (; cc < ccmax; cc++) { - int indx1 = rr * TS + cc; - cfa[indx1] = (rawData[row][cc + left]) / 65535.0f; - rgbgreen[indx1] = cfa[indx1]; - } } //fill lower border if (rrmax < rr1) { for (int rr = 0; rr < 16; rr++) for (int cc = ccmin; cc < ccmax; cc += 4) { - int indx1 = (rrmax + rr) * TS + cc; + int indx1 = (rrmax + rr) * ts + cc; vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v; STVF(cfa[indx1], tempv ); STVF(rgbgreen[indx1], tempv ); } } +#else + + //fill upper border + if (rrmin > 0) { + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { + cfa[rr * ts + cc] = (rawData[row][cc + left]) / 65535.f; + rgbgreen[rr * ts + cc] = cfa[rr * ts + cc]; + } + } + + // fill inner part + for (int rr = rrmin; rr < rrmax; rr++) { + int row = rr + top; + + for (int cc = ccmin; cc < ccmax; cc++) { + int indx1 = rr * ts + cc; + cfa[indx1] = (rawData[row][cc + left]) / 65535.f; + rgbgreen[indx1] = cfa[indx1]; + } + } + + //fill lower border + if (rrmax < rr1) { + for (int rr = 0; rr < 16; rr++) + for (int cc = ccmin; cc < ccmax; cc++) { + cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.f; + rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc]; + } + } + +#endif + //fill left border if (ccmin > 0) { for (int rr = rrmin; rr < rrmax; rr++) for (int cc = 0, row = rr + top; cc < 16; cc++) { - cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f; - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; + cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / 65535.f; + rgbgreen[rr * ts + cc] = cfa[rr * ts + cc]; } } @@ -287,87 +299,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (ccmax < cc1) { for (int rr = rrmin; rr < rrmax; rr++) for (int cc = 0; cc < 16; cc++) { - cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f; - rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; - } - } - - //also, fill the image corners - if (rrmin > 0 && ccmin > 0) { - for (int rr = 0; rr < 16; rr++) - for (int cc = 0; cc < 16; cc += 4) { - int indx1 = (rr) * TS + cc; - vfloat tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v; - STVF(cfa[indx1], tempv ); - STVF(rgbgreen[indx1], tempv ); - } - } - - if (rrmax < rr1 && ccmax < cc1) { - for (int rr = 0; rr < 16; rr++) - for (int cc = 0; cc < 16; cc += 4) { - int indx1 = (rrmax + rr) * TS + ccmax + cc; - vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v; - STVFU(cfa[indx1], tempv ); - STVFU(rgbgreen[indx1], tempv ); - } - } - - if (rrmin > 0 && ccmax < cc1) { - for (int rr = 0; rr < 16; rr++) - for (int cc = 0; cc < 16; cc++) { - cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f; - rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; - } - } - - if (rrmax < rr1 && ccmin > 0) { - for (int rr = 0; rr < 16; rr++) - for (int cc = 0; cc < 16; cc++) { - cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f; - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; - } - } - -#else - - for (int rr = rrmin; rr < rrmax; rr++) - for (int row = rr + top, cc = ccmin; cc < ccmax; cc++) { - int indx1 = rr * TS + cc; - cfa[indx1] = (rawData[row][cc + left]) / 65535.0f; - rgbgreen[indx1] = cfa[indx1]; - } - - //fill borders - if (rrmin > 0) { - for (int rr = 0; rr < 16; rr++) - for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) { - cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f; - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } - } - - if (rrmax < rr1) { - for (int rr = 0; rr < 16; rr++) - for (int cc = ccmin; cc < ccmax; cc++) { - cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.0f; - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; - } - } - - if (ccmin > 0) { - for (int rr = rrmin; rr < rrmax; rr++) - for (int cc = 0, row = rr + top; cc < 16; cc++) { - cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f; - rgbgreen[rr * TS + cc] = cfa[rr * TS + cc]; - } - } - - if (ccmax < cc1) { - for (int rr = rrmin; rr < rrmax; rr++) - for (int cc = 0; cc < 16; cc++) { - cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f; - rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc]; + cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.f; + rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc]; } } @@ -375,43 +308,43 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, if (rrmin > 0 && ccmin > 0) { for (int rr = 0; rr < 16; rr++) for (int cc = 0; cc < 16; cc++) { - cfa[(rr)*TS + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.0f; - rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc]; + cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.f; + rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc]; } } if (rrmax < rr1 && ccmax < cc1) { for (int rr = 0; rr < 16; rr++) for (int cc = 0; cc < 16; cc++) { - cfa[(rrmax + rr)*TS + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.0f; - rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc]; + cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.f; + rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc]; } } if (rrmin > 0 && ccmax < cc1) { for (int rr = 0; rr < 16; rr++) for (int cc = 0; cc < 16; cc++) { - cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f; - rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc]; + cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.f; + rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc]; } } if (rrmax < rr1 && ccmin > 0) { for (int rr = 0; rr < 16; rr++) for (int cc = 0; cc < 16; cc++) { - cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f; - rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc]; + cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.f; + rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc]; } } -#endif + // end of tile initialization - //end of border fill + // horizontal and vertical gradients #ifdef __SSE2__ - const vfloat epsv = F2V( eps ); + vfloat epsv = F2V( eps ); for (int rr = 2; rr < rr1 - 2; rr++) { - for (int indx = rr * TS; indx < rr * TS + cc1; indx += 4) { + for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) { vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) - LVFU( cfa[indx - 1] ) ); vfloat delvv = vabsf( LVF( cfa[indx + v1] ) - LVF( cfa[indx - v1] ) ); STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv ); @@ -423,53 +356,55 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else for (int rr = 2; rr < rr1 - 2; rr++) - for (int cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) { - // horizontal and vedrtical gradient + for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) { float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]); float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]); dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv; - dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh; //+fabsf(cfa[indx+2]-cfa[indx-2]); + dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh; delhvsqsum[indx] = SQR(delh) + SQR(delv); } #endif - //interpolate vertical and horizontal color differences + //interpolate vertical and horizontal colour differences #ifdef __SSE2__ vfloat sgnv; if( !(FC(4, 4) & 1) ) { - sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f ); + sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f ); } else { - sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f ); + sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f ); } - vfloat zd5v = F2V( 0.5f ); - vfloat onev = F2V( 1.0f ); - vfloat arthreshv = F2V( arthresh ); - vfloat clip_pt8v = F2V( clip_pt8 ); + vfloat zd5v = F2V( 0.5f ); + vfloat onev = F2V( 1.f ); + vfloat arthreshv = F2V( arthresh ); + vfloat clip_pt8v = F2V( clip_pt8 ); for (int rr = 4; rr < rr1 - 4; rr++) { sgnv = -sgnv; - for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 7; indx += 4) { - //color ratios in each cardinal direction + for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) { + //colour ratios in each cardinal direction vfloat cfav = LVF(cfa[indx]); vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2]))); vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2]))); vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2]))); vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2]))); + //G interpolated in vert/hor directions using Hamilton-Adams method vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2])); vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2])); vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2])); vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2])); + //G interpolated in vert/hor directions using adaptive ratios vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav); vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav); vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav); vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav); + //adaptive weights for vertical/horizontal directions vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1])); vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1])); @@ -477,7 +412,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vfloat Ginthhav = vintpf(hwtv, grhav, glhav); vfloat Gintvhav = vintpf(vwtv, gdhav, guhav); - //interpolated color differences + //interpolated colour differences vfloat hcdaltv = sgnv * (Ginthhav - cfav); vfloat vcdaltv = sgnv * (Gintvhav - cfav); STVF(hcdalt[indx], hcdaltv); @@ -488,13 +423,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, gdarv = vself( clipmask, gdhav, gdarv); glarv = vself( clipmask, glhav, glarv); grarv = vself( clipmask, grhav, grarv); + + //use HA if highlights are (nearly) clipped STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav))); STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav))); - //differences of interpolations in opposite directions + //differences of interpolations in opposite directions STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv))); STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv))); - } } @@ -503,9 +439,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 4; rr < rr1 - 4; rr++) { bool fcswitch = FC(rr, 4) & 1; - for (int cc = 4, indx = rr * TS + cc; cc < cc1 - 4; cc++, indx++) { + for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) { - //color ratios in each cardinal direction + //colour ratios in each cardinal direction float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2])); float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2])); float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2])); @@ -520,25 +456,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, //G interpolated in vert/hor directions using adaptive ratios float guar, gdar, glar, grar; - if (fabsf(1.0f - cru) < arthresh) { + if (fabsf(1.f - cru) < arthresh) { guar = cfa[indx] * cru; } else { guar = guha; } - if (fabsf(1.0f - crd) < arthresh) { + if (fabsf(1.f - crd) < arthresh) { gdar = cfa[indx] * crd; } else { gdar = gdha; } - if (fabsf(1.0f - crl) < arthresh) { + if (fabsf(1.f - crl) < arthresh) { glar = cfa[indx] * crl; } else { glar = glha; } - if (fabsf(1.0f - crr) < arthresh) { + if (fabsf(1.f - crr) < arthresh) { grar = cfa[indx] * crr; } else { grar = grha; @@ -549,19 +485,19 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); //interpolated G via adaptive weights of cardinal evaluations - float Gintvha = vwt * gdha + (1.0f - vwt) * guha; - float Ginthha = hwt * grha + (1.0f - hwt) * glha; + float Gintvha = vwt * gdha + (1.f - vwt) * guha; + float Ginthha = hwt * grha + (1.f - hwt) * glha; - //interpolated color differences + //interpolated colour differences if (fcswitch) { - vcd[indx] = cfa[indx] - (vwt * gdar + (1.0f - vwt) * guar); - hcd[indx] = cfa[indx] - (hwt * grar + (1.0f - hwt) * glar); + vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar); + hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar); vcdalt[indx] = cfa[indx] - Gintvha; hcdalt[indx] = cfa[indx] - Ginthha; } else { - //interpolated color differences - vcd[indx] = (vwt * gdar + (1.0f - vwt) * guar) - cfa[indx]; - hcd[indx] = (hwt * grar + (1.0f - hwt) * glar) - cfa[indx]; + //interpolated colour differences + vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx]; + hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx]; vcdalt[indx] = Gintvha - cfa[indx]; hcdalt[indx] = Ginthha - cfa[indx]; } @@ -583,8 +519,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, dginth[indx] = min(SQR(glha - grha), SQR(glar - grar)); } - - } #endif @@ -596,9 +530,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vfloat sgn3v; if( !(FC(4, 4) & 1) ) { - sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f ); + sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f ); } else { - sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f ); + sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f ); } sgn3v = sgnv + sgnv + sgnv; @@ -608,7 +542,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, sgnv = -sgnv; sgn3v = -sgn3v; - for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 4; indx += 4) { + for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) { vfloat hcdv = LVF( hcd[indx] ); vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2])); vfloat hcdaltv = LVF( hcdalt[indx] ); @@ -622,6 +556,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv); vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv); + //bound the interpolation in regions of high saturation + //vertical and horizontal G interpolations vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] ); vfloat temp2v = sgn3v * hcdv; vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx])); @@ -651,11 +587,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else for (int rr = 4; rr < rr1 - 4; rr++) { - for (int cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) { - float hcdvar = 3.0f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]); - float hcdaltvar = 3.0f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]); - float vcdvar = 3.0f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]); - float vcdaltvar = 3.0f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]); + for (int cc = 4, indx = rr * ts + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) { + float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]); + float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]); + float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]); + float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]); //choose the smallest variance; this yields a smoother interpolation if (hcdaltvar < hcdvar) { @@ -667,7 +603,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } //bound the interpolation in regions of high saturation - //vertical and horizontal G interpolations float Gintv, Ginth; @@ -676,33 +611,31 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, Gintv = -vcd[indx] + cfa[indx]; //B or R if (hcd[indx] > 0) { - if (3.0f * hcd[indx] > (Ginth + cfa[indx])) { + if (3.f * hcd[indx] > (Ginth + cfa[indx])) { hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]; } else { - float hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); - hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]); + float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]); + hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]); } } if (vcd[indx] > 0) { - if (3.0f * vcd[indx] > (Gintv + cfa[indx])) { + if (3.f * vcd[indx] > (Gintv + cfa[indx])) { vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]; } else { - float vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); - vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]); + float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]); + vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]); } } if (Ginth > clip_pt) { - hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]; //for RT implementation + hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]; } if (Gintv > clip_pt) { vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]; } - //if (Ginth > pre_mul[c]) hcd[indx]=-ULIM(Ginth,cfa[indx-1],cfa[indx+1])+cfa[indx];//for dcraw implementation - //if (Gintv > pre_mul[c]) vcd[indx]=-ULIM(Gintv,cfa[indx-v1],cfa[indx+v1])+cfa[indx]; } else {//R or B site @@ -710,25 +643,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, Gintv = vcd[indx] + cfa[indx]; if (hcd[indx] < 0) { - if (3.0f * hcd[indx] < -(Ginth + cfa[indx])) { + if (3.f * hcd[indx] < -(Ginth + cfa[indx])) { hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]; } else { - float hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]); - hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]); + float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]); + hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]); } } if (vcd[indx] < 0) { - if (3.0f * vcd[indx] < -(Gintv + cfa[indx])) { + if (3.f * vcd[indx] < -(Gintv + cfa[indx])) { vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]; } else { - float vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]); - vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]); + float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]); + vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]); } } if (Ginth > clip_pt) { - hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]; //for RT implementation + hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]; } if (Gintv > clip_pt) { @@ -750,7 +683,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vfloat epssqv = F2V( epssq ); for (int rr = 6; rr < rr1 - 6; rr++) { - for (int indx = rr * TS + 6 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 6; indx += 8) { + for (int indx = rr * ts + 6 + (FC(rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) { //compute colour difference variances in cardinal directions vfloat tempv = LC2VFU(vcd[indx]); vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]); @@ -758,8 +691,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev); vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev); - vfloat hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1])); - vfloat vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1])); + vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1])); + vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1])); tempv = LC2VFU(hcd[indx]); vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]); @@ -772,7 +705,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv); vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv); - //compute fluctuations in up/down and left/right interpolations of colors + //compute fluctuations in up/down and left/right interpolations of colours Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]); Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]); @@ -796,16 +729,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else for (int rr = 6; rr < rr1 - 6; rr++) { - for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) { - //compute color difference variances in cardinal directions + //compute colour difference variances in cardinal directions float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3]; float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3]; float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3]; float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3]; - //color difference (G-R or G-B) variance in up/down/left/right directions + //colour difference (G-R or G-B) variance in up/down/left/right directions float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave); float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave); float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave); @@ -814,17 +747,17 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]); float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]); - float vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; - float hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; + float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru; + float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl; - //compute fluctuations in up/down and left/right interpolations of colors + //compute fluctuations in up/down and left/right interpolations of colours Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]); Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]); Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]); Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]); - float vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru; - float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl; + float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru; + float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl; //determine adaptive weights for G interpolation float varwt = hcdvar / (vcdvar + hcdvar); @@ -837,52 +770,94 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } else { hvwt[indx >> 1] = diffwt; } - } } #endif +#ifdef __SSE2__ + vfloat gaussg0 = F2V(gaussgrad[0]); + vfloat gaussg1 = F2V(gaussgrad[1]); + vfloat gaussg2 = F2V(gaussgrad[2]); + vfloat gaussg3 = F2V(gaussgrad[3]); + vfloat gaussg4 = F2V(gaussgrad[4]); + vfloat gaussg5 = F2V(gaussgrad[5]); + vfloat gausso0 = F2V(gaussodd[0]); + vfloat gausso1 = F2V(gaussodd[1]); + vfloat gausso2 = F2V(gaussodd[2]); + vfloat gausso3 = F2V(gaussodd[3]); + +#endif + + // precompute nyquist + for (int rr = 6; rr < rr1 - 6; rr++) { + int cc = 6 + (FC(rr, 2) & 1); + int indx = rr * ts + cc; + +#ifdef __SSE2__ + + for (; cc < cc1 - 7; cc += 8, indx += 8) { + vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) + + gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) + + LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) + + gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) + + LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) + + gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) + + LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) - + (gaussg0 * LC2VFU(delhvsqsum[indx]) + + gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) + + LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) + + gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) + + LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) + + gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) + + LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) + + gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) + + LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) + + LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) + + LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) + + gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) + + LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2]))); + STVFU(nyqutest[indx >> 1], valv); + + } + +#endif + + for (; cc < cc1 - 6; cc += 2, indx += 2) { + nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] + + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + + cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) + + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + + cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) + + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + + cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) - + (gaussgrad[0] * delhvsqsum[indx] + + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + + delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) + + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + + delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) + + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + + delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) + + gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] + + delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] + + delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] + + delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) + + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + + delhvsqsum[indx - p2] + delhvsqsum[indx + m2])); + } + } // Nyquist test int nystartrow = 0; int nyendrow = 0; - int nystartcol = TS + 1; + int nystartcol = ts + 1; int nyendcol = 0; for (int rr = 6; rr < rr1 - 6; rr++) { - for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) { //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients - // TODO_INGO: currently this part needs 10 float mults, 36 float adds, 4 int mults and 44 int adds for every second pixel - // it reads 304 bytes for every second pixel and writes <= 1 byte for every second pixel - // a precalculated vectorized version could do this with 1/4 of the operations - // but it would read 304 bytes for every second pixel and write 8 bytes for every second pixel for the precalculation - // (though the vectorized read should be faster than the scalar version) - // and read 8 bytes for every second pixel and write 1 byte for every second pixel for final calculation (maybe this last step can be avoided too) - float nyqtest1 = gaussodd[0] * cddiffsq[indx] + - gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + - cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) + - gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + - cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) + - gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + - cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)]); - float nyqtest2 = gaussgrad[0] * delhvsqsum[indx] + - gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + - delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) + - gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + - delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) + - gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + - delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) + - gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] + - delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] + - delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] + - delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) + - gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + - delhvsqsum[indx - p2] + delhvsqsum[indx + m2]); - - - if(nyqtest1 > nyqtest2) { + if(nyqutest[indx >> 1] > 0.f) { nyquist[indx >> 1] = 1; //nyquist=1 for nyquist region nystartrow = nystartrow ? nystartrow : rr; nyendrow = rr; @@ -903,46 +878,60 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, nyendrow = std::min(rr1 - 8, nyendrow); nystartcol = std::max(8, nystartcol); nyendcol = std::min(cc1 - 8, nyendcol); + memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh); + +#ifdef __SSE2__ + vint fourvb = _mm_set1_epi8(4); + vint onevb = _mm_set1_epi8(1); + +#endif for (int rr = nystartrow; rr < nyendrow; rr++) { - for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { - // TODO_INGO: if you look at the comments below, it does not seem to be correct to include nyquist[indx >> 1] into the summation - // Also this implementation has loop dependencies, which are not correct IMHO - // An implementation which uses a second buffer could avoid this dependencies and could be vectorized by factor 16 too (we're working with single bytes here) - // That would lead to differences in output compared to current code, but also would lead to more consistent output when changing TS - unsigned int nyquistneighbours = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] + - nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] + - nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]); +#ifdef __SSE2__ - //if most of your neighbours are named Nyquist, it's likely that you're one too - if (nyquistneighbours > 4) { - nyquist[indx >> 1] = 1; - } - - //or not - if (nyquistneighbours < 4) { - nyquist[indx >> 1] = 0; - } + for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) { + vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_load_si128((vint*)&nyquist[(indx - m1) >> 1])); + vint nyquisttemp2v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx - 2) >> 1])); + vint nyquisttemp3v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx + 2) >> 1]), _mm_load_si128((vint*)&nyquist[(indx - p1) >> 1])); + vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]); + vint nyquisttemp4v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1])); + nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v); + nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v); + nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v); + valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv); + valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv); + _mm_store_si128((vint*)&nyquist2[indx >> 1], valv); } + +#else + + for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) { + unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] + + nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] + + nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]); + //if most of your neighbours are named Nyquist, it's likely that you're one too, or not + nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]); + } + +#endif } // end of Nyquist test // in areas of Nyquist texture, do area interpolation for (int rr = nystartrow; rr < nyendrow; rr++) - for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { + for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) { - if (nyquist[indx >> 1]) { + if (nyquist2[indx >> 1]) { // area interpolation float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f; for (int i = -6; i < 7; i += 2) { - int indx1 = indx + (i * TS) - 6; + int indx1 = indx + (i * ts) - 6; for (int j = -6; j < 7; j += 2, indx1 += 2) { - - if (nyquist[indx1 >> 1]) { + if (nyquist2[indx1 >> 1]) { float cfatemp = cfa[indx1]; sumcfa += cfatemp; sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]); @@ -954,11 +943,10 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } } - //horizontal and vertical color differences, and adaptive weight + //horizontal and vertical colour differences, and adaptive weight sumh = sumcfa - xdiv2f(sumh); sumv = sumcfa - xdiv2f(sumv); - sumsqh = xdiv2f(sumsqh); - sumsqv = xdiv2f(sumsqv); + areawt = xdiv2f(areawt); float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh); float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv); hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar); @@ -972,7 +960,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, //populate G at R/B sites for (int rr = 8; rr < rr1 - 8; rr++) - for (int indx = rr * TS + 8 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 8; indx += 2) { + for (int indx = rr * ts + 8 + (FC(rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) { //first ask if one gets more directional discrimination from nearby B/R sites float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2); @@ -980,13 +968,13 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1]; //a better result was obtained from the neighbours - Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate color differences + Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!) //local curvature in G (preparation for nyquist refinement step) - Dgrb2[indx >> 1].h = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f; - Dgrb2[indx >> 1].v = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f; + Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f; + Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f; } @@ -995,10 +983,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, // refine Nyquist areas using G curvatures if(doNyquist) { for (int rr = nystartrow; rr < nyendrow; rr++) - // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps - for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) { - if (nyquist[indx >> 1]) { + // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps + for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) { + + if (nyquist2[indx >> 1]) { //local averages (over Nyquist pixels only) of G curvature squared float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) + @@ -1020,7 +1009,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 6; rr < rr1 - 6; rr++) { if((FC(rr, 2) & 1) == 0) { - for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { + for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) { vfloat tempv = LC2VFU(cfa[indx + 1]); vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1]))); STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1]))); @@ -1030,7 +1019,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv ); } } else { - for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) { + for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) { vfloat tempv = LC2VFU(cfa[indx]); vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1]))); STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1]))); @@ -1046,14 +1035,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 6; rr < rr1 - 6; rr++) { if((FC(rr, 2) & 1) == 0) { - for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) { delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]); delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]); Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1])); Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1])); } } else { - for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) { + for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) { Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1])); Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1])); delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]); @@ -1074,9 +1063,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 8; rr < rr1 - 8; rr++) { #ifdef __SSE2__ - for (int indx = rr * TS + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 8; indx += 8, indx1 += 4) { + for (int indx = rr * ts + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) { - //diagonal color ratios + //diagonal colour ratios vfloat cfav = LC2VFU(cfa[indx]); vfloat temp1v = LC2VFU(cfa[indx + m1]); @@ -1139,36 +1128,36 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else - for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) { + for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) { - //diagonal color ratios + //diagonal colour ratios float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2])); float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2])); float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2])); float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2])); - //color differences in diagonal directions + //colour differences in diagonal directions float rbse, rbnw, rbne, rbsw; //assign B/R at R/B sites - if (fabsf(1.0f - crse) < arthresh) { + if (fabsf(1.f - crse) < arthresh) { rbse = cfa[indx] * crse; //use this if more precise diag interp is necessary } else { rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]); } - if (fabsf(1.0f - crnw) < arthresh) { + if (fabsf(1.f - crnw) < arthresh) { rbnw = cfa[indx] * crnw; } else { rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]); } - if (fabsf(1.0f - crne) < arthresh) { + if (fabsf(1.f - crne) < arthresh) { rbne = cfa[indx] * crne; } else { rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]); } - if (fabsf(1.0f - crsw) < arthresh) { + if (fabsf(1.f - crsw) < arthresh) { rbsw = cfa[indx] * crsw; } else { rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]); @@ -1198,7 +1187,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, rbp[indx1] = ULIM(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]); } else { float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]); - rbp[indx1] = pwt * rbp[indx1] + (1.0f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]); + rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]); } } @@ -1207,7 +1196,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, rbm[indx1] = ULIM(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]); } else { float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]); - rbm[indx1] = mwt * rbm[indx1] + (1.0f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]); + rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]); } } @@ -1229,7 +1218,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 10; rr < rr1 - 10; rr++) #ifdef __SSE2__ - for (int indx = rr * TS + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 10; indx += 8, indx1 += 4) { + for (int indx = rr * ts + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) { //first ask if one gets more directional discrimination from nearby B/R sites vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1])); @@ -1241,7 +1230,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else - for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) { + for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) { //first ask if one gets more directional discrimination from nearby B/R sites float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2); @@ -1250,20 +1239,20 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, pmwt[indx1] = pmwtalt; //a better result was obtained from the neighbours } - rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.0f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated + rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated } #endif for (int rr = 12; rr < rr1 - 12; rr++) #ifdef __SSE2__ - for (int indx = rr * TS + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 12; indx += 8, indx1 += 4) { + for (int indx = rr * ts + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) { vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1]))); - if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end + if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end //now interpolate G vertically/horizontally using R+B values - //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts - //color ratios for G interpolation + //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts + //colour ratios for G interpolation vfloat rbintv = LVFU(rbint[indx1]); //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction @@ -1312,16 +1301,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else - for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) { + for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) { if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) { continue; } //now interpolate G vertically/horizontally using R+B values - //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts + //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts - //color ratios for G interpolation + //colour ratios for G interpolation float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]); float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]); float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]); @@ -1397,7 +1386,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, //fancy chrominance interpolation //(ey,ex) is location of R site for (int rr = 13 - ey; rr < rr1 - 12; rr += 2) - for (int indx1 = (rr * TS + 13 - ex) >> 1; indx1 < (rr * TS + cc1 - 12) >> 1; indx1++) { //B coset + for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R Dgrb[0][indx1] = 0; } @@ -1410,7 +1399,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int rr = 14; rr < rr1 - 14; rr++) #ifdef __SSE2__ - for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) { + for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) { vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1])); vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1])); vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))); @@ -1426,11 +1415,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #else - for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) { - float wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1])); - float wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1])); - float wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1])); - float wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1])); + for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) { + float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1])); + float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1])); + float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1])); + float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1])); Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) + @@ -1439,15 +1428,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } #endif - //tile vars - //counters for pixel location in the image - int row, col; - //counters for pixel location within the tile - int cc; - //pointer counters within the tile - int indx; - - // end of tile initialization #ifdef __SSE2__ int offset; @@ -1465,12 +1445,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #endif for (int rr = 16; rr < rr1 - 16; rr++) { + int row = rr + top; + int col = left + 16; + int indx = rr * ts + 16; #ifdef __SSE2__ offset = 1 - offset; selmask = vnotm(selmask); - for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 18 - (cc1 & 1); cc += 4, indx += 4) { - col = cc + left; + for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) { vfloat greenv = LVF(rgbgreen[indx]); vfloat temp00v = vdup(LVF(hvwt[(indx - v1) >> 1])); vfloat temp01v = vdup(LVF(hvwt[(indx + v1) >> 1])); @@ -1485,94 +1467,86 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, } if(offset == 0) { - for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { - col = cc + left; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) { + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); indx++; col++; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); } if(cc1 & 1) { // width of tile is odd - col = cc + left; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); } } else { - for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { - col = cc + left; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) { + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); indx++; col++; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); } if(cc1 & 1) { // width of tile is odd - col = cc + left; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); } } #else if((FC(rr, 2) & 1) == 1) { - for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { - col = cc + left; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) { + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); indx++; col++; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); } if(cc1 & 1) { // width of tile is odd - col = cc + left; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); } } else { - for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) { - col = cc + left; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) { + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); indx++; col++; - float temp = 1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); - red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * + float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) * temp); - blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * + blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) * temp); } if(cc1 & 1) { // width of tile is odd - col = cc + left; - red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); - blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); + red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]); + blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]); } } @@ -1586,18 +1560,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #ifdef __SSE2__ for (; cc < cc1 - 19; cc += 4) { - STVFU(green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v); + STVFU(green[row][cc + left], LVF(rgbgreen[rr * ts + cc]) * c65535v); } #endif for (; cc < cc1 - 16; cc++) { - green[row][cc + left] = 65535.0f * rgbgreen[rr * TS + cc]; + green[row][cc + left] = 65535.f * rgbgreen[rr * ts + cc]; } } - //end of main loop - if(plistener) { progresscounter++; @@ -1606,13 +1578,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, #pragma omp critical (amazeprogress) #endif { - progress += (double)32 * ((TS - 32) * (TS - 32)) / (height * width); + progress += (double)32 * ((ts - 32) * (ts - 32)) / (height * width); progress = progress > 1.0 ? 1.0 : progress; plistener->setProgress(progress); } } } } + } //end of main loop // clean up free(buffer); @@ -1622,9 +1595,5 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, plistener->setProgress(1.0); } - // done - -#undef TS - } } diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index 7bc480861..3f2bf6299 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -39,8 +39,15 @@ typedef __m128i vint2; #define STVFU(x,y) _mm_storeu_ps(&x,y) #endif -// Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats -#define LC2VFU(a) _mm_shuffle_ps( LVFU(a), _mm_loadu_ps( (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) ) + +static INLINE vfloat LC2VFU(float &a) +{ + // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats + vfloat a1 = _mm_loadu_ps( &a ); + vfloat a2 = _mm_loadu_ps( (&a) + 4 ); + return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )); +} + // Store a vector of 4 floats in a[0],a[2],a[4] and a[6] #if defined(__x86_64__) && defined(__SSE4_1__) diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h index 951d397bf..44c29fd97 100644 --- a/rtengine/rt_math.h +++ b/rtengine/rt_math.h @@ -83,8 +83,8 @@ template inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) { // calculate a * b + (1 - a) * c // following is valid: - // intp(a, b+x, c+x) = vintpf(a, b, c) + x - // intp(a, b*x, c*x) = vintpf(a, b, c) * x + // intp(a, b+x, c+x) = intp(a, b, c) + x + // intp(a, b*x, c*x) = intp(a, b, c) * x return a * (b-c) + c; } diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index a0300a1cc..6fed6d3d1 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -910,11 +910,20 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { return _mm_blendv_ps(y,x,(vfloat)mask); } + + static INLINE vint vselc(vmask mask, vint x, vint y) { + return _mm_blendv_epi8(y,x,mask); + } + #else // three instructions when using SSE2 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) { return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); } + + static INLINE vint vselc(vmask mask, vint x, vint y) { + return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); + } #endif static INLINE vfloat vselfzero(vmask mask, vfloat x) { @@ -928,6 +937,16 @@ static INLINE vfloat vselfnotzero(vmask mask, vfloat x) { return _mm_andnot_ps((vfloat)mask, x); } +static INLINE vint vselizero(vmask mask, vint x) { + // returns value of x if corresponding mask bits are 1, else returns 0 + // faster than vselc(mask, x, ZEROV) + return _mm_and_si128(mask, x); +} +static INLINE vint vselinotzero(vmask mask, vint x) { + // returns value of x if corresponding mask bits are 0, else returns 0 + // faster than vselc(mask, ZEROV, x) + return _mm_andnot_si128(mask, x); +} static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) { vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1)); @@ -1362,9 +1381,12 @@ static INLINE vfloat vaddc2vfu(float &a) // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] } vfloat a1 = _mm_loadu_ps( &a ); vfloat a2 = _mm_loadu_ps( (&a) + 4 ); - return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 )); + return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 )); } +static INLINE vfloat vadivapb (vfloat a, vfloat b) { + return a / (a+b); +} #endif // __SSE2__ #endif // SLEEFSSEAVX From ded93005d9cb0cd19d5f2c3d6669a0cccf171668 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Tue, 26 Jan 2016 23:27:52 +0100 Subject: [PATCH 3/3] removed streaming code because after adding _mm_mfence() it wasn't faster than the non SSE memset --- rtengine/amaze_demosaic_RT.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc index 2720521aa..5bc9bb54d 100644 --- a/rtengine/amaze_demosaic_RT.cc +++ b/rtengine/amaze_demosaic_RT.cc @@ -181,17 +181,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, for (int top = winy - 16; top < winy + height; top += ts - 32) { for (int left = winx - 16; left < winx + width; left += ts - 32) { -#ifdef __SSE2__ - // Using SSE2 we can zero the memory without cache pollution - vfloat zerov = ZEROV; - - for(int i = 3 * tsh; i < (ts - 6)*tsh; i += 16) { - _mm_stream_ps((float*)&nyquist[i], zerov); - } - -#else memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh); -#endif //location of tile bottom edge int bottom = min(top + ts, winy + height + 16); //location of tile right edge @@ -983,8 +973,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, // refine Nyquist areas using G curvatures if(doNyquist) { for (int rr = nystartrow; rr < nyendrow; rr++) - - // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) { if (nyquist2[indx >> 1]) {