From 2017a0e59223878913f5c1e04ab97c29c4c02d7c Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Sun, 24 Jan 2016 01:44:35 +0100
Subject: [PATCH 1/3] Code review and speedup for Amaze Demosaic

---
 rtengine/amaze_demosaic_RT.cc | 1697 ++++++++++++++++-----------------
 rtengine/helpersse2.h         |    9 +-
 rtengine/rt_math.h            |   10 +
 rtengine/sleefsseavx.c        |   36 +-
 4 files changed, 884 insertions(+), 868 deletions(-)

diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc
index 46c77d7d7..3b367ee2b 100644
--- a/rtengine/amaze_demosaic_RT.cc
+++ b/rtengine/amaze_demosaic_RT.cc
@@ -4,6 +4,7 @@
 // (Aliasing Minimization and Zipper Elimination)
 //
 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
+//  optimized for speed by Ingo Weyrich
 //
 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
 //
@@ -28,9 +29,9 @@
 #include "rawimagesource.h"
 #include "rt_math.h"
 #include "../rtgui/multilangmgr.h"
-#include "procparams.h"
 #include "sleef.c"
 #include "opthelper.h"
+#define BENCHMARK
 #include "StopWatch.h"
 
 namespace rtengine
@@ -39,25 +40,44 @@ namespace rtengine
 SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh)
 {
     BENCHFUN
-#define HCLIP(x) x //is this still necessary???
-    //min(clip_pt,x)
 
-    int width = winw, height = winh;
+    volatile double progress = 0.0;
 
+    if (plistener) {
+        plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::amaze]));
+        plistener->setProgress (0.0);
+    }
 
-    const float clip_pt = 1 / initialGain;
-    const float clip_pt8 = 0.8f / initialGain;
+    const int width = winw, height = winh;
+    const float clip_pt = 1.0 / initialGain;
+    const float clip_pt8 = 0.8 / initialGain;
 
 
 #define TS 160   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
 #define TSH 80   // half of Tile size
 
-    // local variables
-
-
     //offset of R pixel within a Bayer quartet
     int ex, ey;
 
+    //determine GRBG coset; (ey,ex) is the offset of the R subarray
+    if (FC(0, 0) == 1) { //first pixel is G
+        if (FC(0, 1) == 0) {
+            ey = 0;
+            ex = 1;
+        } else {
+            ey = 1;
+            ex = 0;
+        }
+    } else {//first pixel is R or B
+        if (FC(0, 0) == 0) {
+            ey = 0;
+            ex = 0;
+        } else {
+            ey = 1;
+            ex = 1;
+        }
+    }
+
     //shifts of pointer value to access pixels in vertical and diagonal directions
     static const int v1 = TS, v2 = 2 * TS, v3 = 3 * TS, p1 = -TS + 1, p2 = -2 * TS + 2, p3 = -3 * TS + 3, m1 = TS + 1, m2 = 2 * TS + 2, m3 = 3 * TS + 3;
 
@@ -66,514 +86,347 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
     //adaptive ratios threshold
     static const float arthresh = 0.75;
-    //nyquist texture test threshold
-    static const float nyqthresh = 0.5;
 
     //gaussian on 5x5 quincunx, sigma=1.2
     static const float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
-    //gaussian on 5x5, sigma=1.2
-    static const float gaussgrad[6] = {0.07384411893421103f, 0.06207511968171489f, 0.0521818194747806f,
-                                       0.03687419286733595f, 0.03099732204057846f, 0.018413194161458882f
+    //nyquist texture test threshold
+    static const float nyqthresh = 0.5;
+    //gaussian on 5x5, sigma=1.2, pre-multiplied by nyqthresh to save some time later in the loop
+    // FIXME: is this really sigma=1.2? The coefficients look more like sigma = 1.672
+    static const float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
+                                       nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
                                       };
     //gaussian on 5x5 alt quincunx, sigma=1.5
     static const float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
     //guassian on quincunx grid
     static const float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
 
-    volatile double progress = 0.0;
-
-    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-// Issue 1676
-// Moved from inside the parallel section
-    if (plistener) {
-        plistener->setProgressStr (Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), RAWParams::BayerSensor::methodstring[RAWParams::BayerSensor::amaze]));
-        plistener->setProgress (0.0);
-    }
-
-    struct s_hv {
+    typedef struct {
         float h;
         float v;
-    };
+    } s_hv;
 
+#ifdef _OPENMP
     #pragma omp parallel
+#endif
     {
         int progresscounter = 0;
-        //position of top/left corner of the tile
-        int top, left;
-        // beginning of storage block for tile
-        char  *buffer;
-        // green values
-        float (*rgbgreen);
-
-        // sum of square of horizontal gradient and square of vertical gradient
-        float (*delhvsqsum);
-        // gradient based directional weights for interpolation
-        float (*dirwts0);
-        float (*dirwts1);
-
-        // vertically interpolated color differences G-R, G-B
-        float (*vcd);
-        // horizontally interpolated color differences
-        float (*hcd);
-        // alternative vertical interpolation
-        float (*vcdalt);
-        // alternative horizontal interpolation
-        float (*hcdalt);
-        // square of average color difference
-        float (*cddiffsq);
-        // weight to give horizontal vs vertical interpolation
-        float (*hvwt);
-        // final interpolated color difference
-        float (*Dgrb)[TS * TSH];
-//  float (*Dgrb)[2];
-        // gradient in plus (NE/SW) direction
-        float (*delp);
-        // gradient in minus (NW/SE) direction
-        float (*delm);
-        // diagonal interpolation of R+B
-        float (*rbint);
-        // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
-        s_hv  (*Dgrb2);
-        // difference between up/down interpolations of G
-        float (*dgintv);
-        // difference between left/right interpolations of G
-        float (*dginth);
-        // diagonal (plus) color difference R-B or G1-G2
-//  float (*Dgrbp1);
-        // diagonal (minus) color difference R-B or G1-G2
-//  float (*Dgrbm1);
-        float (*Dgrbsq1m);
-        float (*Dgrbsq1p);
-//  s_mp  (*Dgrbsq1);
-        // square of diagonal color difference
-//  float (*Dgrbpsq1);
-        // square of diagonal color difference
-//  float (*Dgrbmsq1);
-        // tile raw data
-        float (*cfa);
-        // relative weight for combining plus and minus diagonal interpolations
-        float (*pmwt);
-        // interpolated color difference R-B in minus and plus direction
-        float (*rbm);
-        float (*rbp);
-
-        // nyquist texture flag 1=nyquist, 0=not nyquist
-        char   (*nyquist);
 
 #define CLF 1
         // assign working space
-        buffer = (char *) calloc(22 * sizeof(float) * TS * TS + sizeof(char) * TS * TSH + 23 * CLF * 64 + 63, 1);
-        char    *data;
-        data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
+        char *buffer = (char *) calloc(13 * sizeof(float) * TS * TS + sizeof(float) * TS * TSH + sizeof(char) * TS * TSH + 18 * CLF * 64 + 63, 1);
+        // round the start of the buffer up to the next 64 byte boundary
+        char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
 
-        //merror(buffer,"amaze_interpolate()");
-        rgbgreen   = (float (*))         data; //pointers to array
-        delhvsqsum = (float (*))         ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64);
-        dirwts0    = (float (*))         ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64);
-        dirwts1    = (float (*))         ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64);
-        vcd        = (float (*))         ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64);
-        hcd        = (float (*))         ((char*)vcd + sizeof(float) * TS * TS + CLF * 64);
-        vcdalt     = (float (*))         ((char*)hcd + sizeof(float) * TS * TS + CLF * 64);
-        hcdalt     = (float (*))         ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64);
-        cddiffsq   = (float (*))         ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64);
-        hvwt       = (float (*))         ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64);
-        Dgrb       = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64);
-        delp       = (float (*))         ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64);
-        delm       = (float (*))         ((char*)delp + sizeof(float) * TS * TSH + CLF * 64);
-        rbint      = (float (*))         ((char*)delm + sizeof(float) * TS * TSH + CLF * 64);
-        Dgrb2      = (s_hv  (*))         ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64);
-        dgintv     = (float (*))         ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64);
-        dginth     = (float (*))         ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64);
-        Dgrbsq1m   = (float (*))         ((char*)dginth + sizeof(float) * TS * TS + CLF * 64);
-        Dgrbsq1p   = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64);
-        cfa        = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64);
-        pmwt       = (float (*))         ((char*)cfa + sizeof(float) * TS * TS + CLF * 64);
-        rbm        = (float (*))         ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64);
-        rbp        = (float (*))         ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64);
+        // green values
+        float *rgbgreen         = (float (*))         data;
+        // sum of square of horizontal gradient and square of vertical gradient
+        float *delhvsqsum       = (float (*))         ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64);
+        // gradient based directional weights for interpolation
+        float *dirwts0          = (float (*))         ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64);
+        float *dirwts1          = (float (*))         ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64);
+        // vertically interpolated color differences G-R, G-B
+        float *vcd              = (float (*))         ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64);
+        // horizontally interpolated color differences
+        float *hcd              = (float (*))         ((char*)vcd + sizeof(float) * TS * TS + CLF * 64);
+        // alternative vertical interpolation
+        float *vcdalt           = (float (*))         ((char*)hcd + sizeof(float) * TS * TS + CLF * 64);
+        // alternative horizontal interpolation
+        float *hcdalt           = (float (*))         ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64);
+        // square of average color difference
+        float *cddiffsq         = (float (*))         ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64);
+        // weight to give horizontal vs vertical interpolation
+        float *hvwt             = (float (*))         ((char*)cddiffsq + sizeof(float) * TS * TS + 2 * CLF * 64);
+        // final interpolated color difference
+        float (*Dgrb)[TS * TSH] = (float (*)[TS * TSH])vcdalt; // there is no overlap in buffer usage => share
+        // gradient in plus (NE/SW) direction
+        float *delp             = (float (*))cddiffsq; // there is no overlap in buffer usage => share
+        // gradient in minus (NW/SE) direction
+        float *delm             = (float (*))         ((char*)delp + sizeof(float) * TS * TSH + CLF * 64);
+        // diagonal interpolation of R+B
+        float *rbint            = (float (*))delm; // there is no overlap in buffer usage => share
+        // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
+        s_hv  *Dgrb2            = (s_hv  (*))         ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64);
+        // difference between up/down interpolations of G
+        float *dgintv           = (float (*))Dgrb2;   // there is no overlap in buffer usage => share
+        // difference between left/right interpolations of G
+        float *dginth           = (float (*))         ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64);
+        // square of diagonal colour differences
+        float *Dgrbsq1m         = (float (*))         ((char*)dginth + sizeof(float) * TS * TS + CLF * 64);
+        float *Dgrbsq1p         = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64);
+        // tile raw data
+        float *cfa              = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64);
+        // relative weight for combining plus and minus diagonal interpolations
+        float *pmwt             = (float (*))delhvsqsum;  // there is no overlap in buffer usage => share
+        // interpolated color difference R-B in minus and plus direction
+        float *rbm              = (float (*))vcd;  // there is no overlap in buffer usage => share
+        float *rbp              = (float (*))         ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64);
+        // nyquist texture flag 1=nyquist, 0=not nyquist
+        unsigned char *nyquist  = (unsigned char (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64);
+        /*
+                rgbgreen   = (float (*))         data; //pointers to array
+                delhvsqsum = (float (*))         ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64);
+                dirwts0    = (float (*))         ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64);
+                dirwts1    = (float (*))         ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64);
+                vcd        = (float (*))         ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64);
+                hcd        = (float (*))         ((char*)vcd + sizeof(float) * TS * TS + CLF * 64);
+                vcdalt     = (float (*))         ((char*)hcd + sizeof(float) * TS * TS + CLF * 64);
+                hcdalt     = (float (*))         ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64);
+                cddiffsq   = (float (*))         ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64);
+                hvwt       = (float (*))         ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64);
+                Dgrb       = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64);
+                delp       = (float (*))         ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64);
+                delm       = (float (*))         ((char*)delp + sizeof(float) * TS * TSH + CLF * 64);
+                rbint      = (float (*))         ((char*)delm + sizeof(float) * TS * TSH + CLF * 64);
+                Dgrb2      = (s_hv  (*))         ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64);
+                dgintv     = (float (*))         ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64);
+                dginth     = (float (*))         ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64);
+                Dgrbsq1m   = (float (*))         ((char*)dginth + sizeof(float) * TS * TS + CLF * 64);
+                Dgrbsq1p   = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64);
+                cfa        = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64);
+                pmwt       = (float (*))         ((char*)cfa + sizeof(float) * TS * TS + CLF * 64);
+                rbm        = (float (*))         ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64);
+                rbp        = (float (*))         ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64);
 
-        nyquist    = (char (*))          ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64);
+                nyquist    = (char (*))          ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64);
+        */
 #undef CLF
-        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-        //determine GRBG coset; (ey,ex) is the offset of the R subarray
-        if (FC(0, 0) == 1) { //first pixel is G
-            if (FC(0, 1) == 0) {
-                ey = 0;
-                ex = 1;
-            } else {
-                ey = 1;
-                ex = 0;
-            }
-        } else {//first pixel is R or B
-            if (FC(0, 0) == 0) {
-                ey = 0;
-                ex = 0;
-            } else {
-                ey = 1;
-                ex = 1;
-            }
-        }
 
         // Main algorithm: Tile loop
-        //#pragma omp parallel for shared(rawData,height,width,red,green,blue) private(top,left) schedule(dynamic)
-        //code is openmp ready; just have to pull local tile variable declarations inside the tile loop
 
-// Issue 1676
-// use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
+        // Issue 1676
+        // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
+#ifdef _OPENMP
         #pragma omp for schedule(dynamic) collapse(2) nowait
+#endif
 
-        for (top = winy - 16; top < winy + height; top += TS - 32)
-            for (left = winx - 16; left < winx + width; left += TS - 32) {
-                memset(nyquist, 0, sizeof(char)*TS * TSH);
-                memset(rbint, 0, sizeof(float)*TS * TSH);
+        for (int top = winy - 16; top < winy + height; top += TS - 32)
+            for (int left = winx - 16; left < winx + width; left += TS - 32) {
+#ifdef __SSE2__
+                // Using SSE2 we can zero the memory without cache pollution
+                vfloat zerov = ZEROV;
+
+                for(int i = 3 * TSH; i < (TS - 6)*TSH; i += 16) {
+                    _mm_stream_ps((float*)&nyquist[i], zerov);
+                }
+
+#else
+                memset(&nyquist[3 * TSH], 0, sizeof(unsigned char) * (TS - 6) * TSH);
+#endif
                 //location of tile bottom edge
-                int bottom = min(top + TS, winy + height + 16);
+                const int bottom = min(top + TS, winy + height + 16);
                 //location of tile right edge
-                int right  = min(left + TS, winx + width + 16);
+                const int right  = min(left + TS, winx + width + 16);
                 //tile width  (=TS except for right edge of image)
-                int rr1 = bottom - top;
+                const int rr1 = bottom - top;
                 //tile height (=TS except for bottom edge of image)
-                int cc1 = right - left;
-
-                //tile vars
-                //counters for pixel location in the image
-                int row, col;
-                //min and max row/column in the tile
-                int rrmin, rrmax, ccmin, ccmax;
-                //counters for pixel location within the tile
-                int rr, cc;
-                //color index 0=R, 1=G, 2=B
-                int c;
-                //pointer counters within the tile
-                int indx, indx1;
-                //dummy indices
-                int i, j;
-
-                //color ratios in up/down/left/right directions
-                float cru, crd, crl, crr;
-                //adaptive weights for vertical/horizontal/plus/minus directions
-                float vwt, hwt, pwt, mwt;
-                //vertical and horizontal G interpolations
-                float Gintv, Ginth;
-                //G interpolated in vert/hor directions using adaptive ratios
-                float guar, gdar, glar, grar;
-                //G interpolated in vert/hor directions using Hamilton-Adams method
-                float guha, gdha, glha, grha;
-                //interpolated G from fusing left/right or up/down
-                float Ginthar, Ginthha, Gintvar, Gintvha;
-                //color difference (G-R or G-B) variance in up/down/left/right directions
-                float Dgrbvvaru, Dgrbvvard, Dgrbhvarl, Dgrbhvarr;
-
-                float uave, dave, lave, rave;
-
-                //color difference variances in vertical and horizontal directions
-                float vcdvar, hcdvar, vcdvar1, hcdvar1, hcdaltvar, vcdaltvar;
-                //adaptive interpolation weight using variance of color differences
-                float varwt;                                                                                                        // 639 - 644
-                //adaptive interpolation weight using difference of left-right and up-down G interpolations
-                float diffwt;                                                                                                       // 640 - 644
-                //alternative adaptive weight for combining horizontal/vertical interpolations
-                float hvwtalt;                                                                                                      // 745 - 748
-                //interpolation of G in four directions
-                float gu, gd, gl, gr;
-                //variance of G in vertical/horizontal directions
-                float gvarh, gvarv;
-
-                //Nyquist texture test
-                float nyqtest;                                                                                                      // 658 - 681
-                //accumulators for Nyquist texture interpolation
-                float sumh, sumv, sumsqh, sumsqv, areawt;
-
-                //color ratios in diagonal directions
-                float crse, crnw, crne, crsw;
-                //color differences in diagonal directions
-                float rbse, rbnw, rbne, rbsw;
-                //adaptive weights for combining diagonal interpolations
-                float wtse, wtnw, wtsw, wtne;
-                //alternate weight for combining diagonal interpolations
-                float pmwtalt;                                                                                                      // 885 - 888
-                //variance of R-B in plus/minus directions
-                float rbvarm;                                                                                                       // 843 - 848
-
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                const int cc1 = right - left;
+                // bookkeeping for borders
+                // min and max row/column in the tile
+                int rrmin = top < winy ? 16 : 0;
+                int ccmin = left < winx ? 16 : 0;
+                int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
+                int ccmax = right > (winx + width) ? winx + width - left : cc1;
 
                 // rgb from input CFA data
                 // rgb values should be floating point number between 0 and 1
                 // after white balance multipliers are applied
                 // a 16 pixel border is added to each side of the image
-
-                // bookkeeping for borders
-                if (top < winy) {
-                    rrmin = 16;
-                } else {
-                    rrmin = 0;
-                }
-
-                if (left < winx) {
-                    ccmin = 16;
-                } else {
-                    ccmin = 0;
-                }
-
-                if (bottom > (winy + height)) {
-                    rrmax = winy + height - top;
-                } else {
-                    rrmax = rr1;
-                }
-
-                if (right > (winx + width)) {
-                    ccmax = winx + width - left;
-                } else {
-                    ccmax = cc1;
-                }
-
 #ifdef __SSE2__
-                const __m128 c65535v = _mm_set1_ps( 65535.0f );
-                __m128  tempv;
+                const vfloat c65535v = F2V( 65535.0f );
 
-                for (rr = rrmin; rr < rrmax; rr++) {
-                    for (row = rr + top, cc = ccmin; cc < ccmax - 3; cc += 4) {
-                        indx1 = rr * TS + cc;
-                        tempv = LVFU(rawData[row][cc + left]) / c65535v;
-                        _mm_store_ps( &cfa[indx1], tempv );
-                        _mm_store_ps( &rgbgreen[indx1], tempv );
+                //fill upper border
+                if (rrmin > 0) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
+                            cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f;
+                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
+                        }
+                }
+
+                // fill inner part
+                for (int rr = rrmin; rr < rrmax; rr++) {
+                    int row = rr + top;
+                    int cc = ccmin;
+
+                    for (; cc < ccmax - 3; cc += 4) {
+                        int indx1 = rr * TS + cc;
+                        vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
+                        STVF(cfa[indx1], tempv );
+                        STVF(rgbgreen[indx1], tempv );
                     }
 
                     for (; cc < ccmax; cc++) {
-                        indx1 = rr * TS + cc;
+                        int indx1 = rr * TS + cc;
                         cfa[indx1] = (rawData[row][cc + left]) / 65535.0f;
-
-                        if(FC(rr, cc) == 1) {
-                            rgbgreen[indx1] = cfa[indx1];
-                        }
-
+                        rgbgreen[indx1] = cfa[indx1];
                     }
-
-                }
-
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                //fill borders
-                if (rrmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
-                            cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                            }
-                        }
                 }
 
+                //fill lower border
                 if (rrmax < rr1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = ccmin; cc < ccmax; cc += 4) {
-                            indx1 = (rrmax + rr) * TS + cc;
-                            tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v;
-                            _mm_store_ps( &cfa[indx1], tempv );
-                            _mm_store_ps( &rgbgreen[indx1], tempv );
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin; cc < ccmax; cc += 4) {
+                            int indx1 = (rrmax + rr) * TS + cc;
+                            vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v;
+                            STVF(cfa[indx1], tempv );
+                            STVF(rgbgreen[indx1], tempv );
                         }
                 }
 
+                //fill left border
                 if (ccmin > 0) {
-                    for (rr = rrmin; rr < rrmax; rr++)
-                        for (cc = 0, row = rr + top; cc < 16; cc++) {
+                    for (int rr = rrmin; rr < rrmax; rr++)
+                        for (int cc = 0, row = rr + top; cc < 16; cc++) {
                             cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                            }
+                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
                         }
                 }
 
+                //fill right border
                 if (ccmax < cc1) {
-                    for (rr = rrmin; rr < rrmax; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = rrmin; rr < rrmax; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
-                            }
+                            rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
                         }
                 }
 
                 //also, fill the image corners
                 if (rrmin > 0 && ccmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc += 4) {
-                            indx1 = (rr) * TS + cc;
-                            tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v;
-                            _mm_store_ps( &cfa[indx1], tempv );
-                            _mm_store_ps( &rgbgreen[indx1], tempv );
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc += 4) {
+                            int indx1 = (rr) * TS + cc;
+                            vfloat tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v;
+                            STVF(cfa[indx1], tempv );
+                            STVF(rgbgreen[indx1], tempv );
                         }
                 }
 
                 if (rrmax < rr1 && ccmax < cc1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc += 4) {
-                            indx1 = (rrmax + rr) * TS + ccmax + cc;
-                            tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v;
-                            _mm_storeu_ps( &cfa[indx1], tempv );
-                            _mm_storeu_ps( &rgbgreen[indx1], tempv );
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc += 4) {
+                            int indx1 = (rrmax + rr) * TS + ccmax + cc;
+                            vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v;
+                            STVFU(cfa[indx1], tempv );
+                            STVFU(rgbgreen[indx1], tempv );
                         }
                 }
 
                 if (rrmin > 0 && ccmax < cc1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
-
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
-                            }
+                            rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
                         }
                 }
 
                 if (rrmax < rr1 && ccmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
-                            }
+                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
                         }
                 }
 
 #else
 
-                for (rr = rrmin; rr < rrmax; rr++)
-                    for (row = rr + top, cc = ccmin; cc < ccmax; cc++) {
-                        indx1 = rr * TS + cc;
+                for (int rr = rrmin; rr < rrmax; rr++)
+                    for (int row = rr + top, cc = ccmin; cc < ccmax; cc++) {
+                        int indx1 = rr * TS + cc;
                         cfa[indx1] = (rawData[row][cc + left]) / 65535.0f;
-
-                        if(FC(rr, cc) == 1) {
-                            rgbgreen[indx1] = cfa[indx1];
-                        }
-
+                        rgbgreen[indx1] = cfa[indx1];
                     }
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                 //fill borders
                 if (rrmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
                             cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                            }
+                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
                         }
                 }
 
                 if (rrmax < rr1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = ccmin; cc < ccmax; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin; cc < ccmax; cc++) {
                             cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
-                            }
+                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
                         }
                 }
 
                 if (ccmin > 0) {
-                    for (rr = rrmin; rr < rrmax; rr++)
-                        for (cc = 0, row = rr + top; cc < 16; cc++) {
+                    for (int rr = rrmin; rr < rrmax; rr++)
+                        for (int cc = 0, row = rr + top; cc < 16; cc++) {
                             cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                            }
+                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
                         }
                 }
 
                 if (ccmax < cc1) {
-                    for (rr = rrmin; rr < rrmax; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = rrmin; rr < rrmax; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
-                            }
+                            rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
                         }
                 }
 
                 //also, fill the image corners
                 if (rrmin > 0 && ccmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rr)*TS + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc];
-                            }
+                            rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc];
                         }
                 }
 
                 if (rrmax < rr1 && ccmax < cc1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rrmax + rr)*TS + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc];
-                            }
+                            rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc];
                         }
                 }
 
                 if (rrmin > 0 && ccmax < cc1) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
-                            }
+                            rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
                         }
                 }
 
                 if (rrmax < rr1 && ccmin > 0) {
-                    for (rr = 0; rr < 16; rr++)
-                        for (cc = 0; cc < 16; cc++) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = 0; cc < 16; cc++) {
                             cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f;
-
-                            if(FC(rr, cc) == 1) {
-                                rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
-                            }
+                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
                         }
                 }
 
 #endif
 
                 //end of border fill
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 #ifdef __SSE2__
-                __m128 delhv, delvv;
-                const __m128 epsv = _mm_set1_ps( eps );
+                const vfloat epsv = F2V( eps );
 
-                for (rr = 2; rr < rr1 - 2; rr++) {
-                    for (cc = 0, indx = (rr) * TS + cc; cc < cc1; cc += 4, indx += 4) {
-                        delhv = vabsf( LVFU( cfa[indx + 1] ) -  LVFU( cfa[indx - 1] ) );
-                        delvv = vabsf( LVF( cfa[indx + v1] ) -  LVF( cfa[indx - v1] ) );
-                        _mm_store_ps( &dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
-                        delhv = delhv * delhv;
-                        _mm_store_ps( &dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
-                        delvv = delvv * delvv;
-                        _mm_store_ps( &delhvsqsum[indx], delhv + delvv);
+                for (int rr = 2; rr < rr1 - 2; rr++) {
+                    for (int indx = rr * TS; indx < rr * TS + cc1; indx += 4) {
+                        vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) -  LVFU( cfa[indx - 1] ) );
+                        vfloat delvv = vabsf( LVF( cfa[indx + v1] ) -  LVF( cfa[indx - v1] ) );
+                        STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
+                        STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
+                        STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
                     }
                 }
 
 #else
-                // horizontal and vedrtical gradient
-                float delh, delv;
 
-                for (rr = 2; rr < rr1 - 2; rr++)
-                    for (cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) {
-                        delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
-                        delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
+                for (int rr = 2; rr < rr1 - 2; rr++)
+                    for (int cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) {
+                        // horizontal and vertical gradient
+                        float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
+                        float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
                         dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
                         dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh; //+fabsf(cfa[indx+2]-cfa[indx-2]);
                         delhvsqsum[indx] = SQR(delh) + SQR(delv);
@@ -581,63 +434,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #endif
 
-#ifdef __SSE2__
-                __m128  Dgrbsq1pv, Dgrbsq1mv, temp2v;
-
-                for (rr = 6; rr < rr1 - 6; rr++) {
-                    if((FC(rr, 2) & 1) == 0) {
-                        for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
-                            tempv = LC2VFU(cfa[indx + 1]);
-                            Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
-                            _mm_storeu_ps( &delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
-                            _mm_storeu_ps( &delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
-                            Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
-                            _mm_storeu_ps( &Dgrbsq1m[indx >> 1], Dgrbsq1mv );
-                            _mm_storeu_ps( &Dgrbsq1p[indx >> 1], Dgrbsq1pv );
-                        }
-                    } else {
-                        for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
-                            tempv = LC2VFU(cfa[indx]);
-                            Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
-                            _mm_storeu_ps( &delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
-                            _mm_storeu_ps( &delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
-                            Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
-                            _mm_storeu_ps( &Dgrbsq1m[indx >> 1], Dgrbsq1mv );
-                            _mm_storeu_ps( &Dgrbsq1p[indx >> 1], Dgrbsq1pv );
-                        }
-                    }
-                }
-
-#else
-
-                for (rr = 6; rr < rr1 - 6; rr++) {
-                    if((FC(rr, 2) & 1) == 0) {
-                        for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
-                            delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
-                            delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
-                            Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
-                            Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
-                        }
-                    } else {
-                        for (cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
-                            Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
-                            Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
-                            delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
-                            delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
-                        }
-                    }
-                }
-
-#endif
-
-                // end of tile initialization
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
                 //interpolate vertical and horizontal color differences
-
 #ifdef __SSE2__
-                __m128  sgnv, cruv, crdv, crlv, crrv, guhav, gdhav, glhav, grhav, hwtv, vwtv, Gintvhav, Ginthhav, guarv, gdarv, glarv, grarv;
-                vmask   clipmask;
+                vfloat sgnv;
 
                 if( !(FC(4, 4) & 1) ) {
                     sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f );
@@ -645,73 +444,81 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f );
                 }
 
-                __m128  zd5v = _mm_set1_ps( 0.5f );
-                __m128  onev = _mm_set1_ps( 1.0f );
-                __m128  arthreshv = _mm_set1_ps( arthresh );
-                __m128  clip_pt8v = _mm_set1_ps( clip_pt8 );
+                vfloat  zd5v = F2V( 0.5f );
+                vfloat  onev = F2V( 1.0f );
+                vfloat  arthreshv = F2V( arthresh );
+                vfloat  clip_pt8v = F2V( clip_pt8 );
 
-                for (rr = 4; rr < rr1 - 4; rr++) {
+                for (int rr = 4; rr < rr1 - 4; rr++) {
                     sgnv = -sgnv;
 
-                    for (cc = 4, indx = rr * TS + cc; cc < cc1 - 7; cc += 4, indx += 4) {
+                    for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 7; indx += 4) {
                         //color ratios in each cardinal direction
-                        cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
-                        crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
-                        crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
-                        crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + LVF(cfa[indx])) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
+                        vfloat cfav = LVF(cfa[indx]);
+                        vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
+                        vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
+                        vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
+                        vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
 
-                        guhav = LVF(cfa[indx - v1]) + zd5v * (LVF(cfa[indx]) - LVF(cfa[indx - v2]));
-                        gdhav = LVF(cfa[indx + v1]) + zd5v * (LVF(cfa[indx]) - LVF(cfa[indx + v2]));
-                        glhav = LVFU(cfa[indx - 1]) + zd5v * (LVF(cfa[indx]) - LVFU(cfa[indx - 2]));
-                        grhav = LVFU(cfa[indx + 1]) + zd5v * (LVF(cfa[indx]) - LVFU(cfa[indx + 2]));
+                        vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
+                        vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
+                        vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
+                        vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
 
-                        guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), LVF(cfa[indx]) * cruv, guhav);
-                        gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), LVF(cfa[indx]) * crdv, gdhav);
-                        glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), LVF(cfa[indx]) * crlv, glhav);
-                        grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), LVF(cfa[indx]) * crrv, grhav);
+                        vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
+                        vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
+                        vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
+                        vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
 
-                        hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
-                        vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
+                        vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
+                        vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
 
                         //interpolated G via adaptive weights of cardinal evaluations
-                        Ginthhav = hwtv * grhav + (onev - hwtv) * glhav;
-                        Gintvhav = vwtv * gdhav + (onev - vwtv) * guhav;
+                        vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
+                        vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
+
                         //interpolated color differences
+                        vfloat hcdaltv = sgnv * (Ginthhav - cfav);
+                        vfloat vcdaltv = sgnv * (Gintvhav - cfav);
+                        STVF(hcdalt[indx], hcdaltv);
+                        STVF(vcdalt[indx], vcdaltv);
 
-                        _mm_store_ps( &hcdalt[indx], sgnv * (Ginthhav - LVF(cfa[indx])));
-                        _mm_store_ps( &vcdalt[indx], sgnv * (Gintvhav - LVF(cfa[indx])));
-
-                        clipmask = vorm( vorm( vmaskf_gt( LVF(cfa[indx]), clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
+                        vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
                         guarv = vself( clipmask, guhav, guarv);
                         gdarv = vself( clipmask, gdhav, gdarv);
                         glarv = vself( clipmask, glhav, glarv);
                         grarv = vself( clipmask, grhav, grarv);
-                        _mm_store_ps( &vcd[indx], vself( clipmask, LVF(vcdalt[indx]), sgnv * ((vwtv * gdarv + (onev - vwtv)*guarv) - LVF(cfa[indx]))));
-                        _mm_store_ps( &hcd[indx], vself( clipmask, LVF(hcdalt[indx]), sgnv * ((hwtv * grarv + (onev - hwtv)*glarv) - LVF(cfa[indx]))));
+                        STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
+                        STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
                         //differences of interpolations in opposite directions
 
-                        _mm_store_ps(&dgintv[indx], _mm_min_ps(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
-                        _mm_store_ps(&dginth[indx], _mm_min_ps(SQRV(glhav - grhav), SQRV(glarv - grarv)));
+                        STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
+                        STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
 
                     }
                 }
 
 #else
-                bool    fcswitch;
 
-                for (rr = 4; rr < rr1 - 4; rr++) {
-                    for (cc = 4, indx = rr * TS + cc, fcswitch = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
+                for (int rr = 4; rr < rr1 - 4; rr++) {
+                    bool fcswitch = FC(rr, 4) & 1;
+
+                    for (int cc = 4, indx = rr * TS + cc; cc < cc1 - 4; cc++, indx++) {
 
                         //color ratios in each cardinal direction
-                        cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
-                        crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
-                        crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
-                        crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
+                        float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
+                        float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
+                        float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
+                        float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
 
-                        guha = HCLIP(cfa[indx - v1]) + xdiv2f(cfa[indx] - cfa[indx - v2]);
-                        gdha = HCLIP(cfa[indx + v1]) + xdiv2f(cfa[indx] - cfa[indx + v2]);
-                        glha = HCLIP(cfa[indx - 1]) + xdiv2f(cfa[indx] - cfa[indx - 2]);
-                        grha = HCLIP(cfa[indx + 1]) + xdiv2f(cfa[indx] - cfa[indx + 2]);
+                        //G interpolated in vert/hor directions using Hamilton-Adams method
+                        float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
+                        float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
+                        float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
+                        float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
+
+                        //G interpolated in vert/hor directions using adaptive ratios
+                        float guar, gdar, glar, grar;
 
                         if (fabsf(1.0f - cru) < arthresh) {
                             guar = cfa[indx] * cru;
@@ -737,12 +544,13 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             grar = grha;
                         }
 
-                        hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
-                        vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
+                        //adaptive weights for vertical/horizontal directions
+                        float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
+                        float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 
                         //interpolated G via adaptive weights of cardinal evaluations
-                        Gintvha = vwt * gdha + (1.0f - vwt) * guha;
-                        Ginthha = hwt * grha + (1.0f - hwt) * glha;
+                        float Gintvha = vwt * gdha + (1.0f - vwt) * guha;
+                        float Ginthha = hwt * grha + (1.0f - hwt) * glha;
 
                         //interpolated color differences
                         if (fcswitch) {
@@ -781,13 +589,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #endif
 
+
+
 #ifdef __SSE2__
-                __m128  hcdvarv, vcdvarv;
-                __m128  hcdaltvarv, vcdaltvarv, hcdv, vcdv, hcdaltv, vcdaltv, sgn3v, Ginthv, Gintvv, hcdoldv, vcdoldv;
-                __m128  threev = _mm_set1_ps( 3.0f );
-                __m128  clip_ptv = _mm_set1_ps( clip_pt );
-                __m128  nsgnv;
-                vmask   hcdmask, vcdmask, tempmask;
+                vfloat  clip_ptv = F2V( clip_pt );
+                vfloat  sgn3v;
 
                 if( !(FC(4, 4) & 1) ) {
                     sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f );
@@ -795,61 +601,61 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f );
                 }
 
-                sgn3v = threev * sgnv;
+                sgn3v = sgnv + sgnv + sgnv;
 
-                for (rr = 4; rr < rr1 - 4; rr++) {
-                    nsgnv = sgnv;
+                for (int rr = 4; rr < rr1 - 4; rr++) {
+                    vfloat nsgnv = sgnv;
                     sgnv = -sgnv;
                     sgn3v = -sgn3v;
 
-                    for (cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc += 4, indx += 4) {
-                        hcdv = LVF( hcd[indx] );
-                        hcdvarv = threev * (SQRV(LVFU(hcd[indx - 2])) + SQRV(hcdv) + SQRV(LVFU(hcd[indx + 2]))) - SQRV(LVFU(hcd[indx - 2]) + hcdv + LVFU(hcd[indx + 2]));
-                        hcdaltv = LVF( hcdalt[indx] );
-                        hcdaltvarv = threev * (SQRV(LVFU(hcdalt[indx - 2])) + SQRV(hcdaltv) + SQRV(LVFU(hcdalt[indx + 2]))) - SQRV(LVFU(hcdalt[indx - 2]) + hcdaltv + LVFU(hcdalt[indx + 2]));
-                        vcdv = LVF( vcd[indx] );
-                        vcdvarv = threev * (SQRV(LVF(vcd[indx - v2])) + SQRV(vcdv) + SQRV(LVF(vcd[indx + v2]))) - SQRV(LVF(vcd[indx - v2]) + vcdv + LVF(vcd[indx + v2]));
-                        vcdaltv = LVF( vcdalt[indx] );
-                        vcdaltvarv = threev * (SQRV(LVF(vcdalt[indx - v2])) + SQRV(vcdaltv) + SQRV(LVF(vcdalt[indx + v2]))) - SQRV(LVF(vcdalt[indx - v2]) + vcdaltv + LVF(vcdalt[indx + v2]));
+                    for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 4; indx += 4) {
+                        vfloat hcdv = LVF( hcd[indx] );
+                        vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
+                        vfloat hcdaltv = LVF( hcdalt[indx] );
+                        vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
+                        vfloat vcdv = LVF( vcd[indx] );
+                        vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2]));
+                        vfloat vcdaltv = LVF( vcdalt[indx] );
+                        vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
+
                         //choose the smallest variance; this yields a smoother interpolation
                         hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
                         vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
 
-                        Ginthv = sgnv * hcdv + LVF( cfa[indx] );
-                        temp2v = sgn3v * hcdv;
-                        hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
-                        hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
-                        hcdoldv = hcdv;
-                        tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
-                        hcdv = vself( vmaskf_lt( (temp2v), -(LVF(cfa[indx]) + Ginthv)), tempv, hwtv * hcdv + (onev - hwtv) * tempv);
+                        vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
+                        vfloat temp2v = sgn3v * hcdv;
+                        vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
+                        vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
+                        vfloat hcdoldv = hcdv;
+                        vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
+                        hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
                         hcdv = vself( hcdmask, hcdv, hcdoldv );
                         hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv);
-                        _mm_store_ps( &hcd[indx], hcdv);
+                        STVF(hcd[indx], hcdv);
 
-                        Gintvv = sgnv * vcdv + LVF( cfa[indx] );
+                        vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] );
                         temp2v = sgn3v * vcdv;
-                        vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
-                        vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
-                        vcdoldv = vcdv;
+                        vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
+                        vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
+                        vfloat vcdoldv = vcdv;
                         tempv = nsgnv * (LVF(cfa[indx]) - ULIMV( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) ));
-                        vcdv = vself( vmaskf_lt( (temp2v), -(LVF(cfa[indx]) + Gintvv)), tempv, vwtv * vcdv + (onev - vwtv) * tempv);
+                        vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
                         vcdv = vself( vcdmask, vcdv, vcdoldv );
                         vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv);
-                        _mm_store_ps( &vcd[indx], vcdv);
-                        _mm_storeu_ps(&cddiffsq[indx], SQRV(vcdv - hcdv));
+                        STVF(vcd[indx], vcdv);
+                        STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
                     }
 
                 }
 
 #else
 
-                for (rr = 4; rr < rr1 - 4; rr++) {
-                    //for (cc=4+(FC(rr,2)&1),indx=rr*TS+cc,c=FC(rr,cc); cc<cc1-4; cc+=2,indx+=2) {
-                    for (cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
-                        hcdvar = 3.0f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
-                        hcdaltvar = 3.0f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
-                        vcdvar = 3.0f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
-                        vcdaltvar = 3.0f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
+                for (int rr = 4; rr < rr1 - 4; rr++) {
+                    for (int cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
+                        float hcdvar = 3.0f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
+                        float hcdaltvar = 3.0f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
+                        float vcdvar = 3.0f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
+                        float vcdaltvar = 3.0f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
 
                         //choose the smallest variance; this yields a smoother interpolation
                         if (hcdaltvar < hcdvar) {
@@ -861,6 +667,10 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         }
 
                         //bound the interpolation in regions of high saturation
+
+                        //vertical and horizontal G interpolations
+                        float Gintv, Ginth;
+
                         if (c) {//G site
                             Ginth = -hcd[indx] + cfa[indx]; //R or B
                             Gintv = -vcd[indx] + cfa[indx]; //B or R
@@ -869,7 +679,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 if (3.0f * hcd[indx] > (Ginth + cfa[indx])) {
                                     hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
                                 } else {
-                                    hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
+                                    float hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
                                     hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
                                 }
                             }
@@ -878,7 +688,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 if (3.0f * vcd[indx] > (Gintv + cfa[indx])) {
                                     vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
                                 } else {
-                                    vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
+                                    float vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
                                     vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
                                 }
                             }
@@ -903,7 +713,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 if (3.0f * hcd[indx] < -(Ginth + cfa[indx])) {
                                     hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
                                 } else {
-                                    hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
+                                    float hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
                                     hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
                                 }
                             }
@@ -912,7 +722,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 if (3.0f * vcd[indx] < -(Gintv + cfa[indx])) {
                                     vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
                                 } else {
-                                    vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
+                                    float vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
                                     vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
                                 }
                             }
@@ -925,8 +735,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
                             }
 
-                            //if (Ginth > pre_mul[c]) hcd[indx]=ULIM(Ginth,cfa[indx-1],cfa[indx+1])-cfa[indx];//for dcraw implementation
-                            //if (Gintv > pre_mul[c]) vcd[indx]=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1])-cfa[indx];
                             cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
                         }
 
@@ -936,75 +744,78 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #endif
 
+
+
 #ifdef __SSE2__
-                __m128  uavev, davev, lavev, ravev, Dgrbvvaruv, Dgrbvvardv, Dgrbhvarlv, Dgrbhvarrv, varwtv, diffwtv, vcdvar1v, hcdvar1v;
-                __m128  epssqv = _mm_set1_ps( epssq );
-                vmask   decmask;
+                vfloat  epssqv = F2V( epssq );
 
-                for (rr = 6; rr < rr1 - 6; rr++) {
-                    for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
-                        //compute color difference variances in cardinal directions
-                        tempv = LC2VFU(vcd[indx]);
-                        uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
-                        davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
-                        Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
-                        Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    for (int indx = rr * TS + 6 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 6; indx += 8) {
+                        //compute colour difference variances in cardinal directions
+                        vfloat tempv = LC2VFU(vcd[indx]);
+                        vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
+                        vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
+                        vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
+                        vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
 
-                        hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
-                        vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
+                        vfloat hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
+                        vfloat vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
 
                         tempv = LC2VFU(hcd[indx]);
-                        lavev = tempv + LC2VFU(hcd[indx - 1]) + LC2VFU(hcd[indx - 2]) + LC2VFU(hcd[indx - 3]);
-                        ravev = tempv + LC2VFU(hcd[indx + 1]) + LC2VFU(hcd[indx + 2]) + LC2VFU(hcd[indx + 3]);
-                        Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
-                        Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
+                        vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
+                        vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
+
+                        vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
+                        vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
 
 
-                        vcdvarv = epssqv + vwtv * Dgrbvvardv + (onev - vwtv) * Dgrbvvaruv;
-                        hcdvarv = epssqv + hwtv * Dgrbhvarrv + (onev - hwtv) * Dgrbhvarlv;
+                        vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
+                        vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 
                         //compute fluctuations in up/down and left/right interpolations of colors
-                        Dgrbvvaruv = (LC2VFU(dgintv[indx])) + (LC2VFU(dgintv[indx - v1])) + (LC2VFU(dgintv[indx - v2]));
-                        Dgrbvvardv = (LC2VFU(dgintv[indx])) + (LC2VFU(dgintv[indx + v1])) + (LC2VFU(dgintv[indx + v2]));
-                        Dgrbhvarlv = (LC2VFU(dginth[indx])) + (LC2VFU(dginth[indx - 1])) + (LC2VFU(dginth[indx - 2]));
-                        Dgrbhvarrv = (LC2VFU(dginth[indx])) + (LC2VFU(dginth[indx + 1])) + (LC2VFU(dginth[indx + 2]));
+                        Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
+                        Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
 
-                        vcdvar1v = epssqv + vwtv * Dgrbvvardv + (onev - vwtv) * Dgrbvvaruv;
-                        hcdvar1v = epssqv + hwtv * Dgrbhvarrv + (onev - hwtv) * Dgrbhvarlv;
+                        Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
+                        Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
+
+                        vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
+                        vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 
                         //determine adaptive weights for G interpolation
-                        varwtv = hcdvarv / (vcdvarv + hcdvarv);
-                        diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
+                        vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
+                        vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
 
                         //if both agree on interpolation direction, choose the one with strongest directional discrimination;
                         //otherwise, choose the u/d and l/r difference fluctuation weights
-                        decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
-                        _mm_storeu_ps( &hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
+                        vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
+                        STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
                     }
                 }
 
 #else
 
-                for (rr = 6; rr < rr1 - 6; rr++) {
-                    for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
 
                         //compute color difference variances in cardinal directions
 
-                        uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
-                        dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
-                        lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
-                        rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
+                        float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
+                        float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
+                        float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
+                        float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
 
-                        Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
-                        Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
-                        Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
-                        Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
+                        //color difference (G-R or G-B) variance in up/down/left/right directions
+                        float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
+                        float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
+                        float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
+                        float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
 
-                        hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
-                        vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
+                        float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
+                        float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 
-                        vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
-                        hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
+                        float vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
+                        float hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
 
                         //compute fluctuations in up/down and left/right interpolations of colors
                         Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
@@ -1012,12 +823,12 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
                         Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
 
-                        vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
-                        hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
+                        float vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
+                        float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
 
                         //determine adaptive weights for G interpolation
-                        varwt = hcdvar / (vcdvar + hcdvar);
-                        diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
+                        float varwt = hcdvar / (vcdvar + hcdvar);
+                        float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
 
                         //if both agree on interpolation direction, choose the one with strongest directional discrimination;
                         //otherwise, choose the u/d and l/r difference fluctuation weights
@@ -1027,255 +838,316 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             hvwt[indx >> 1] = diffwt;
                         }
 
-                        //hvwt[indx]=varwt;
                     }
                 }
 
 #endif
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
                 // Nyquist test
-                for (rr = 6; rr < rr1 - 6; rr++)
-                    for (cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                int nystartrow = 0;
+                int nyendrow = 0;
+                int nystartcol = TS + 1;
+                int nyendcol = 0;
+
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
 
                         //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
-                        nyqtest = (gaussodd[0] * cddiffsq[indx] +
-                                   gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
-                                                  cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
-                                   gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
-                                                  cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
-                                   gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
-                                                  cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)]));
-
-                        nyqtest -= nyqthresh * (gaussgrad[0] * (delhvsqsum[indx]) +
-                                                gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
-                                                                delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
-                                                gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
-                                                                delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
-                                                gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
-                                                                delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
-                                                gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] +
-                                                                delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] +
-                                                                delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] +
-                                                                delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) +
-                                                gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
-                                                                delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
+                        // TODO_INGO: currently this part needs 10 float mults, 36 float adds, 4 int mults and 44 int adds for every second pixel
+                        // it reads 304 bytes for every second pixel and writes <= 1 byte for every second pixel
+                        // a precalculated vectorized version could do this with 1/4 of the operations
+                        // but it would read 304 bytes for every second pixel and write 8 bytes for every second pixel for the precalculation
+                        // (though the vectorized read should be faster than the scalar version)
+                        // and read 8 bytes for every second pixel and write 1 byte for every second pixel for final calculation (maybe this last step can be avoided too)
+                        float nyqtest1 = gaussodd[0] * cddiffsq[indx] +
+                                         gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
+                                                        cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
+                                         gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
+                                                        cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
+                                         gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
+                                                        cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)]);
+                        float nyqtest2 = gaussgrad[0] * delhvsqsum[indx] +
+                                         gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
+                                                         delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
+                                         gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
+                                                         delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
+                                         gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
+                                                         delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
+                                         gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] +
+                                                         delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] +
+                                                         delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] +
+                                                         delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) +
+                                         gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
+                                                         delhvsqsum[indx - p2] + delhvsqsum[indx + m2]);
 
 
-                        if (nyqtest > 0) {
+                        if(nyqtest1 > nyqtest2) {
                             nyquist[indx >> 1] = 1;    //nyquist=1 for nyquist region
-                        }
-                    }
-
-                unsigned int nyquisttemp;
-
-                for (rr = 8; rr < rr1 - 8; rr++) {
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) {
-
-                        nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
-                                       nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] +
-                                       nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
-
-                        //if most of your neighbors are named Nyquist, it's likely that you're one too
-                        if (nyquisttemp > 4) {
-                            nyquist[indx >> 1] = 1;
-                        }
-
-                        //or not
-                        if (nyquisttemp < 4) {
-                            nyquist[indx >> 1] = 0;
+                            nystartrow = nystartrow ? nystartrow : rr;
+                            nyendrow = rr;
+                            nystartcol = nystartcol > cc ? cc : nystartcol;
+                            nyendcol = nyendcol < cc ? cc : nyendcol;
                         }
                     }
                 }
 
-                // end of Nyquist test
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
 
-                // in areas of Nyquist texture, do area interpolation
-                for (rr = 8; rr < rr1 - 8; rr++)
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+                if(doNyquist) {
+                    nyendrow ++; // because of < condition
+                    nyendcol ++; // because of < condition
+                    nystartcol -= (nystartcol & 1);
+                    nystartrow = std::max(8, nystartrow);
+                    nyendrow = std::min(rr1 - 8, nyendrow);
+                    nystartcol = std::max(8, nystartcol);
+                    nyendcol = std::min(cc1 - 8, nyendcol);
 
-                        if (nyquist[indx >> 1]) {
-                            // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                            // area interpolation
+                    for (int rr = nystartrow; rr < nyendrow; rr++) {
+                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
+                            // TODO_INGO: judging by the comments below, it does not seem correct to include nyquist[indx >> 1] in the summation.
+                            // Also, this implementation has loop dependencies (nyquist is read and written in place within the same loop), which are not correct IMHO.
+                            // An implementation which uses a second buffer could avoid these dependencies and could also be vectorized by a factor of 16 (we're working with single bytes here).
+                            // That would lead to differences in output compared to the current code, but would also give more consistent output when changing TS.
+                            unsigned int nyquistneighbours = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
+                                                              nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] +
+                                                              nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
 
-                            sumh = sumv = sumsqh = sumsqv = areawt = 0;
+                            //if most of your neighbours are named Nyquist, it's likely that you're one too
+                            if (nyquistneighbours > 4) {
+                                nyquist[indx >> 1] = 1;
+                            }
 
-                            for (i = -6; i < 7; i += 2)
-                                for (j = -6; j < 7; j += 2) {
-                                    indx1 = (rr + i) * TS + cc + j;
+                            //or not
+                            if (nyquistneighbours < 4) {
+                                nyquist[indx >> 1] = 0;
+                            }
+                        }
+                    }
 
-                                    if (nyquist[indx1 >> 1]) {
-                                        sumh += cfa[indx1] - xdiv2f(cfa[indx1 - 1] + cfa[indx1 + 1]);
-                                        sumv += cfa[indx1] - xdiv2f(cfa[indx1 - v1] + cfa[indx1 + v1]);
-                                        sumsqh += xdiv2f(SQR(cfa[indx1] - cfa[indx1 - 1]) + SQR(cfa[indx1] - cfa[indx1 + 1]));
-                                        sumsqv += xdiv2f(SQR(cfa[indx1] - cfa[indx1 - v1]) + SQR(cfa[indx1] - cfa[indx1 + v1]));
-                                        areawt += 1;
+                    // end of Nyquist test
+
+                    // in areas of Nyquist texture, do area interpolation
+                    for (int rr = nystartrow; rr < nyendrow; rr++)
+                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
+
+                            if (nyquist[indx >> 1]) {
+                                // area interpolation
+
+                                float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
+
+                                for (int i = -6; i < 7; i += 2) {
+                                    int indx1 = indx + (i * TS) - 6;
+
+                                    for (int j = -6; j < 7; j += 2, indx1 += 2) {
+
+                                        if (nyquist[indx1 >> 1]) {
+                                            float cfatemp = cfa[indx1];
+                                            sumcfa += cfatemp;
+                                            sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
+                                            sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
+                                            sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
+                                            sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
+                                            areawt += 1;
+                                        }
                                     }
                                 }
 
-                            //horizontal and vertical color differences, and adaptive weight
-                            hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
-                            vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
-                            hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
+                                //horizontal and vertical color differences, and adaptive weight
+                                sumh = sumcfa - xdiv2f(sumh);
+                                sumv = sumcfa - xdiv2f(sumv);
+                                sumsqh = xdiv2f(sumsqh);
+                                sumsqv = xdiv2f(sumsqv);
+                                float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
+                                float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
+                                hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
 
-                            // end of area interpolation
-                            // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+                                // end of area interpolation
 
+                            }
                         }
-                    }
+                }
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
                 //populate G at R/B sites
-                for (rr = 8; rr < rr1 - 8; rr++)
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+                for (int rr = 8; rr < rr1 - 8; rr++)
+                    for (int indx = rr * TS + 8 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 8; indx += 2) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
-                        hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
+                        float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
 
-//                  hvwtalt = 0.25*(hvwt[(indx-m1)>>1]+hvwt[(indx+p1)>>1]+hvwt[(indx-p1)>>1]+hvwt[(indx+m1)>>1]);
-//                  vo=fabsf(0.5-hvwt[indx>>1]);
-//                  ve=fabsf(0.5-hvwtalt);
-                        if (fabsf(0.5 - hvwt[indx >> 1]) < fabsf(0.5 - hvwtalt)) {
-                            hvwt[indx >> 1] = hvwtalt;   //a better result was obtained from the neighbors
-                        }
+                        hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
+                        //a better result was obtained from the neighbours
 
-//                  if (vo<ve) {hvwt[indx>>1]=hvwtalt;}//a better result was obtained from the neighbors
+                        Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate color differences
 
-
-
-                        Dgrb[0][indx >> 1] = (hcd[indx] * (1.0f - hvwt[indx >> 1]) + vcd[indx] * hvwt[indx >> 1]); //evaluate color differences
-                        //if (hvwt[indx]<0.5) Dgrb[indx][0]=hcd[indx];
-                        //if (hvwt[indx]>0.5) Dgrb[indx][0]=vcd[indx];
                         rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
 
                         //local curvature in G (preparation for nyquist refinement step)
-                        if (nyquist[indx >> 1]) {
-                            Dgrb2[indx >> 1].h = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1]));
-                            Dgrb2[indx >> 1].v = SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1]));
-                        } else {
-                            Dgrb2[indx >> 1].h = Dgrb2[indx >> 1].v = 0;
-                        }
+                        Dgrb2[indx >> 1].h = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
+                        Dgrb2[indx >> 1].v = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
                     }
 
+
                 //end of standard interpolation
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
-
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                 // refine Nyquist areas using G curvatures
+                if(doNyquist) {
+                    for (int rr = nystartrow; rr < nyendrow; rr++)
+                        // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps
+                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
 
-                for (rr = 8; rr < rr1 - 8; rr++)
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 8; cc += 2, indx += 2) {
+                            if (nyquist[indx >> 1]) {
+                                //local averages (over Nyquist pixels only) of G curvature squared
+                                float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
+                                                       gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
+                                                       gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
+                                                       gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
+                                float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
+                                                       gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
+                                                       gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
+                                                       gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
+                                //use the results as weights for refined G interpolation
+                                Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
+                                rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
+                            }
+                        }
+                }
 
-                        if (nyquist[indx >> 1]) {
-                            //local averages (over Nyquist pixels only) of G curvature squared
-                            gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
-                                             gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
-                                             gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
-                                             gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
-                            gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
-                                             gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
-                                             gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
-                                             gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
-                            //use the results as weights for refined G interpolation
-                            Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
-                            rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
+
+#ifdef __SSE2__
+
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    if((FC(rr, 2) & 1) == 0) {
+                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
+                            vfloat tempv = LC2VFU(cfa[indx + 1]);
+                            vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
+                            STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
+                            STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
+                            vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
+                            STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
+                            STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
+                        }
+                    } else {
+                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
+                            vfloat tempv = LC2VFU(cfa[indx]);
+                            vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
+                            STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
+                            STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
+                            vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
+                            STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
+                            STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
                         }
                     }
+                }
 
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+#else
+
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    if((FC(rr, 2) & 1) == 0) {
+                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                            delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
+                            delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
+                            Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
+                            Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
+                        }
+                    } else {
+                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                            Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
+                            Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
+                            delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
+                            delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
+                        }
+                    }
+                }
+
+#endif
 
                 // diagonal interpolation correction
 
 #ifdef __SSE2__
-                __m128 rbsev, rbnwv, rbnev, rbswv, cfav, rbmv, rbpv, temp1v, wtv;
-                __m128 wtsev, wtnwv, wtnev, wtswv, rbvarmv;
-                __m128 gausseven0v = _mm_set1_ps(gausseven[0]);
-                __m128 gausseven1v = _mm_set1_ps(gausseven[1]);
-                __m128 twov = _mm_set1_ps(2.0f);
+                vfloat gausseven0v = F2V(gausseven[0]);
+                vfloat gausseven1v = F2V(gausseven[1]);
 #endif
 
-                for (rr = 8; rr < rr1 - 8; rr++) {
+                for (int rr = 8; rr < rr1 - 8; rr++) {
 #ifdef __SSE2__
 
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 8, indx += 8, indx1 += 4) {
+                    for (int indx = rr * TS + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 8; indx += 8, indx1 += 4) {
 
                         //diagonal color ratios
-                        cfav = LC2VFU(cfa[indx]);
+                        vfloat cfav = LC2VFU(cfa[indx]);
 
-                        temp1v = LC2VFU(cfa[indx + m1]);
-                        temp2v = LC2VFU(cfa[indx + m2]);
-                        rbsev = (temp1v + temp1v) / (epsv + cfav + temp2v );
+                        vfloat temp1v = LC2VFU(cfa[indx + m1]);
+                        vfloat temp2v = LC2VFU(cfa[indx + m2]);
+                        vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v );
                         rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v));
 
                         temp1v = LC2VFU(cfa[indx - m1]);
                         temp2v = LC2VFU(cfa[indx - m2]);
-                        rbnwv = (temp1v + temp1v) / (epsv + cfav + temp2v );
+                        vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v );
                         rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v));
 
                         temp1v = epsv + LVFU(delm[indx1]);
-                        wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
-                        wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
+                        vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
+                        vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
 
-                        rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
+                        vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
 
                         temp1v = ULIMV(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
-                        wtv = twov * (cfav - rbmv) / (epsv + rbmv + cfav);
-                        temp2v = wtv * rbmv + (onev - wtv) * temp1v;
+                        vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
+                        temp2v = vintpf(wtv, rbmv, temp1v);
 
                         temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
                         temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
-                        _mm_storeu_ps(&rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
+                        STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
 
 
                         temp1v = LC2VFU(cfa[indx + p1]);
                         temp2v = LC2VFU(cfa[indx + p2]);
-                        rbnev = (temp1v + temp1v) / (epsv + cfav + temp2v );
+                        vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v );
                         rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v));
 
                         temp1v = LC2VFU(cfa[indx - p1]);
                         temp2v = LC2VFU(cfa[indx - p2]);
-                        rbswv = (temp1v + temp1v) / (epsv + cfav + temp2v );
+                        vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v );
                         rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v));
 
                         temp1v = epsv + LVFU(delp[indx1]);
-                        wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
-                        wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
+                        vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
+                        vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
 
-                        rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
+                        vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
 
                         temp1v = ULIMV(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
-                        wtv = twov * (cfav - rbpv) / (epsv + rbpv + cfav);
-                        temp2v = wtv * rbpv + (onev - wtv) * temp1v;
+                        wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
+                        temp2v = vintpf(wtv, rbpv, temp1v);
 
                         temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
                         temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
-                        _mm_storeu_ps(&rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
+                        STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), ULIMV(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
 
-
-
-                        rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
-                                            gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
+                        vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
+                                                   gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
                                                            LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
-                        _mm_storeu_ps(&pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
-                                                                gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
-                                                                        LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
+                        STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
+                                                        gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
+                                                                LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
 
                     }
 
 #else
 
-                    for (cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
+                    for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
 
                         //diagonal color ratios
-                        crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
-                        crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
-                        crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
-                        crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
+                        float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
+                        float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
+                        float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
+                        float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
+                        //estimates of the opposite colour (B at R sites, R at B sites) from the four diagonal neighbours
+                        float rbse, rbnw, rbne, rbsw;
 
                         //assign B/R at R/B sites
                         if (fabsf(1.0f - crse) < arthresh) {
@@ -1302,33 +1174,30 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
                         }
 
-                        wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
-                        wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
-                        wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
-                        wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
+                        float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
+                        float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
+                        float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
+                        float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
 
 
                         rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
                         rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
-                        /*
-                                            rbvarp = epssq + (gausseven[0]*(Dgrbsq1[indx-v1].p+Dgrbsq1[indx-1].p+Dgrbsq1[indx+1].p+Dgrbsq1[indx+v1].p) +
-                                                            gausseven[1]*(Dgrbsq1[indx-v2-1].p+Dgrbsq1[indx-v2+1].p+Dgrbsq1[indx-2-v1].p+Dgrbsq1[indx+2-v1].p+
-                                                                          Dgrbsq1[indx-2+v1].p+Dgrbsq1[indx+2+v1].p+Dgrbsq1[indx+v2-1].p+Dgrbsq1[indx+v2+1].p));
-                        */
-                        rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
-                                          gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
-                                                          Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
+
+                        //variance of R-B in plus/minus directions
+                        float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
+                                                gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
+                                                                Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
                         pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) +
                                                           gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] +
                                                                   Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm);
 
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                         //bound the interpolation in regions of high saturation
+
                         if (rbp[indx1] < cfa[indx]) {
                             if (xmul2f(rbp[indx1]) < cfa[indx]) {
                                 rbp[indx1] = ULIM(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
                             } else {
-                                pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
+                                float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
                                 rbp[indx1] = pwt * rbp[indx1] + (1.0f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
                             }
                         }
@@ -1337,56 +1206,48 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             if (xmul2f(rbm[indx1]) < cfa[indx]) {
                                 rbm[indx1] = ULIM(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
                             } else {
-                                mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
+                                float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
                                 rbm[indx1] = mwt * rbm[indx1] + (1.0f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
                             }
                         }
 
                         if (rbp[indx1] > clip_pt) {
-                            rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);    //for RT implementation
+                            rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
                         }
 
                         if (rbm[indx1] > clip_pt) {
                             rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
                         }
-
-                        //c=2-FC(rr,cc);//for dcraw implementation
-                        //if (rbp[indx] > pre_mul[c]) rbp[indx]=ULIM(rbp[indx],cfa[indx-p1],cfa[indx+p1]);
-                        //if (rbm[indx] > pre_mul[c]) rbm[indx]=ULIM(rbm[indx],cfa[indx-m1],cfa[indx+m1]);
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-                        //rbint[indx] = 0.5*(cfa[indx] + (rbp*rbvarm+rbm*rbvarp)/(rbvarp+rbvarm));//this is R+B, interpolated
                     }
 
 #endif
                 }
 
 #ifdef __SSE2__
-                __m128 pmwtaltv;
-                __m128 zd25v = _mm_set1_ps(0.25f);
+                vfloat zd25v = F2V(0.25f);
 #endif
 
-                for (rr = 10; rr < rr1 - 10; rr++)
+                for (int rr = 10; rr < rr1 - 10; rr++)
 #ifdef __SSE2__
-                    for (cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 8, indx += 8, indx1 += 4) {
+                    for (int indx = rr * TS + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 10; indx += 8, indx1 += 4) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
-                        pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
-                        tempv = LVFU(pmwt[indx1]);
+                        vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
+                        vfloat tempv = LVFU(pmwt[indx1]);
                         tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
-                        _mm_storeu_ps( &pmwt[indx1], tempv);
-                        _mm_storeu_ps( &rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + LVFU(rbm[indx1]) * (onev - tempv) + LVFU(rbp[indx1]) * tempv));
+                        STVFU(pmwt[indx1], tempv);
+                        STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
                     }
 
 #else
 
-                    for (cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
+                    for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
-                        pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
+                        float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
 
                         if (fabsf(0.5 - pmwt[indx1]) < fabsf(0.5 - pmwtalt)) {
-                            pmwt[indx1] = pmwtalt;   //a better result was obtained from the neighbors
+                            pmwt[indx1] = pmwtalt;   //a better result was obtained from the neighbours
                         }
 
                         rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.0f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
@@ -1394,8 +1255,64 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #endif
 
-                for (rr = 12; rr < rr1 - 12; rr++)
-                    for (cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
+                for (int rr = 12; rr < rr1 - 12; rr++)
+#ifdef __SSE2__
+                    for (int indx = rr * TS + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 12; indx += 8, indx1 += 4) {
+                        vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
+
+                        if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
+                            //now interpolate G vertically/horizontally using R+B values
+                            //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts
+                            //color ratios for G interpolation
+                            vfloat rbintv = LVFU(rbint[indx1]);
+
+                            //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
+                            vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
+                            vfloat guv = rbintv * cruv;
+                            vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
+                            guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
+
+                            vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
+                            vfloat gdv = rbintv * crdv;
+                            vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
+                            gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
+
+                            vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
+                            vfloat Gint1v = ULIMV(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
+                            vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
+                            vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
+                            Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
+                            Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
+                            Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
+
+                            vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
+                            vfloat glv = rbintv * crlv;
+                            vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
+                            glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
+
+                            vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
+                            vfloat grv = rbintv * crrv;
+                            vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
+                            grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
+
+                            vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
+                            vfloat Gint1h = ULIMV(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
+                            vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
+                            vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
+                            Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
+                            Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
+                            Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
+
+                            vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
+                            STC2VFU(rgbgreen[indx], greenv);
+
+                            STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
+                        }
+                    }
+
+#else
+
+                    for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
 
                         if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) {
                             continue;
@@ -1403,55 +1320,52 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                         //now interpolate G vertically/horizontally using R+B values
                         //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts
-                        //color ratios for G interpolation
 
-                        cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
-                        crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
-                        crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
-                        crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
+                        //color ratios for G interpolation
+                        float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
+                        float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
+                        float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
+                        float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
+
+                        //interpolation of G in four directions
+                        float gu, gd, gl, gr;
 
                         //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
-                        if (fabsf(1.0f - cru) < arthresh) {
+                        if (fabsf(1.f - cru) < arthresh) {
                             gu = rbint[indx1] * cru;
                         } else {
                             gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
                         }
 
-                        if (fabsf(1.0f - crd) < arthresh) {
+                        if (fabsf(1.f - crd) < arthresh) {
                             gd = rbint[indx1] * crd;
                         } else {
                             gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
                         }
 
-                        if (fabsf(1.0f - crl) < arthresh) {
+                        if (fabsf(1.f - crl) < arthresh) {
                             gl = rbint[indx1] * crl;
                         } else {
                             gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
                         }
 
-                        if (fabsf(1.0f - crr) < arthresh) {
+                        if (fabsf(1.f - crr) < arthresh) {
                             gr = rbint[indx1] * crr;
                         } else {
                             gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
                         }
 
-                        //gu=rbint[indx]*cru;
-                        //gd=rbint[indx]*crd;
-                        //gl=rbint[indx]*crl;
-                        //gr=rbint[indx]*crr;
-
                         //interpolated G via adaptive weights of cardinal evaluations
-                        Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
-                        Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
+                        float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
+                        float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
 
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                         //bound the interpolation in regions of high saturation
                         if (Gintv < rbint[indx1]) {
                             if (2 * Gintv < rbint[indx1]) {
                                 Gintv = ULIM(Gintv , cfa[indx - v1], cfa[indx + v1]);
                             } else {
-                                vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
-                                Gintv = vwt * Gintv + (1.0f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
+                                float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
+                                Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
                             }
                         }
 
@@ -1459,74 +1373,64 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             if (2 * Ginth < rbint[indx1]) {
                                 Ginth = ULIM(Ginth , cfa[indx - 1], cfa[indx + 1]);
                             } else {
-                                hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
-                                Ginth = hwt * Ginth + (1.0f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
+                                float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
+                                Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
                             }
                         }
 
                         if (Ginth > clip_pt) {
-                            Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);    //for RT implementation
+                            Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
                         }
 
                         if (Gintv > clip_pt) {
                             Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
                         }
 
-                        //c=FC(rr,cc);//for dcraw implementation
-                        //if (Ginth > pre_mul[c]) Ginth=ULIM(Ginth,cfa[indx-1],cfa[indx+1]);
-                        //if (Gintv > pre_mul[c]) Gintv=ULIM(Gintv,cfa[indx-v1],cfa[indx+v1]);
-                        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-                        rgbgreen[indx] = Ginth * (1.0f - hvwt[indx1]) + Gintv * hvwt[indx1];
-                        //rgb[indx][1] = 0.5*(rgb[indx][1]+0.25*(rgb[indx-v1][1]+rgb[indx+v1][1]+rgb[indx-1][1]+rgb[indx+1][1]));
+                        rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
                         Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
-
-                        //rgb[indx][2-FC(rr,cc)]=2*rbint[indx]-cfa[indx];
                     }
 
+#endif
+
                 //end of diagonal interpolation correction
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
                 //fancy chrominance interpolation
                 //(ey,ex) is location of R site
-                for (rr = 13 - ey; rr < rr1 - 12; rr += 2)
-                    for (cc = 13 - ex, indx1 = (rr * TS + cc) >> 1; cc < cc1 - 12; cc += 2, indx1++) { //B coset
+                for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
+                    for (int indx1 = (rr * TS + 13 - ex) >> 1; indx1 < (rr * TS + cc1 - 12) >> 1; indx1++) { //B coset
                         Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
                         Dgrb[0][indx1] = 0;
                     }
 
 #ifdef __SSE2__
-//          __m128 wtnwv,wtnev,wtswv,wtsev;
-                __m128 oned325v = _mm_set1_ps( 1.325f );
-                __m128 zd175v = _mm_set1_ps( 0.175f );
-                __m128 zd075v = _mm_set1_ps( 0.075f );
+                vfloat oned325v = F2V( 1.325f );
+                vfloat zd175v = F2V( 0.175f );
+                vfloat zd075v = F2V( 0.075f );
 #endif
 
-                for (rr = 14; rr < rr1 - 14; rr++)
+                for (int rr = 14; rr < rr1 - 14; rr++)
 #ifdef __SSE2__
-                    for (cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
-                        wtnwv = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
-                        wtnev = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
-                        wtswv = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
-                        wtsev = onev / (epsv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m1) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
+                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
+                        vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
+                        vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
+                        vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
+                        vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
+                        vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
+                        vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
 
-                        //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse);
-
-                        _mm_storeu_ps(&Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - m1 - v2) >> 1]) ) +
-                                                            wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + p1 + v2) >> 1]) ) +
-                                                            wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx - p1 - v2) >> 1]) ) +
-                                                            wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) - zd075v * LVFU(Dgrb[c][(indx + m1 + v2) >> 1]) )) / (wtnwv + wtnev + wtswv + wtsev));
+                        STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) +
+                                                   wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) +
+                                                   wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) +
+                                                   wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev));
                     }
 
 #else
 
-                    for (cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
-                        wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
-                        wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
-                        wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
-                        wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
-
-                        //Dgrb[indx][c]=(wtnw*Dgrb[indx-m1][c]+wtne*Dgrb[indx+p1][c]+wtsw*Dgrb[indx-p1][c]+wtse*Dgrb[indx+m1][c])/(wtnw+wtne+wtsw+wtse);
+                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
+                        float wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
+                        float wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
+                        float wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
+                        float wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
 
                         Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
                                               wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
@@ -1535,13 +1439,55 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     }
 
 #endif
-                float   temp;
+                //tile vars
+                //counters for pixel location in the image
+                int row, col;
+                //counters for pixel location within the tile
+                int cc;
+                //pointer counters within the tile
+                int indx;
 
-                for (rr = 16; rr < rr1 - 16; rr++) {
-                    if((FC(rr, 2) & 1) == 1) {
-                        for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
+                // end of tile initialization
+
+#ifdef __SSE2__
+                int offset;
+                vfloat twov = F2V(2.f);
+                vmask selmask;
+
+                if((FC(16, 2) & 1) == 1) {
+                    selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
+                    offset = 1;
+                } else {
+                    selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+                    offset = 0;
+                }
+
+#endif
+
+                for (int rr = 16; rr < rr1 - 16; rr++) {
+#ifdef __SSE2__
+                    offset = 1 - offset;
+                    selmask = vnotm(selmask);
+
+                    for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 18 - (cc1 & 1); cc += 4, indx += 4) {
+                        col = cc + left;
+                        vfloat greenv = LVF(rgbgreen[indx]);
+                        vfloat temp00v = vdup(LVF(hvwt[(indx - v1) >> 1]));
+                        vfloat temp01v = vdup(LVF(hvwt[(indx + v1) >> 1]));
+                        vfloat tempv =  onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
+
+                        vfloat redv1  = greenv - (temp00v * vdup(LVF(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVF(Dgrb[0][(indx + v1) >> 1]))) * tempv;
+                        vfloat bluev1 = greenv - (temp00v * vdup(LVF(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVF(Dgrb[1][(indx + v1) >> 1]))) * tempv;
+                        vfloat redv2  = greenv - vdup(LVF(Dgrb[0][indx >> 1]));
+                        vfloat bluev2 = greenv - vdup(LVF(Dgrb[1][indx >> 1]));
+                        STVFU(red[row][col], c65535v * vself(selmask, redv1, redv2));
+                        STVFU(blue[row][col], c65535v * vself(selmask, bluev1, bluev2));
+                    }
+
+                    if(offset == 0) {
+                        for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
                             col = cc + left;
-                            temp =  1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1]));
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
                             red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
                             blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
@@ -1555,7 +1501,54 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                         if(cc1 & 1) { // width of tile is odd
                             col = cc + left;
-                            temp =  1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1]));
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                                                        temp);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                                                         temp);
+                        }
+                    } else {
+                        for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
+                            col = cc + left;
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+
+                            indx++;
+                            col++;
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                                                        temp);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                                                         temp);
+                        }
+
+                        if(cc1 & 1) { // width of tile is odd
+                            col = cc + left;
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                        }
+                    }
+
+#else
+
+                    if((FC(rr, 2) & 1) == 1) {
+                        for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
+                            col = cc + left;
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                                                        temp);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                                                         temp);
+
+                            indx++;
+                            col++;
+                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                        }
+
+                        if(cc1 & 1) { // width of tile is odd
+                            col = cc + left;
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
                             red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
                             blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
@@ -1569,7 +1562,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                             indx++;
                             col++;
-                            temp =  1.0f / ((hvwt[(indx - v1) >> 1]) + (1.0f - hvwt[(indx + 1) >> 1]) + (1.0f - hvwt[(indx - 1) >> 1]) + (hvwt[(indx + v1) >> 1]));
+                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
                             red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
                             blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
@@ -1582,33 +1575,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
                         }
                     }
+
+#endif
                 }
 
-
-                // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
                 // copy smoothed results back to image matrix
-                for (rr = 16; rr < rr1 - 16; rr++) {
+                for (int rr = 16; rr < rr1 - 16; rr++) {
+                    int row = rr + top;
+                    int cc = 16;
 #ifdef __SSE2__
 
-                    for (row = rr + top, cc = 16; cc < cc1 - 19; cc += 4) {
-                        _mm_storeu_ps(&green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v);
-                    }
-
-#else
-
-                    for (row = rr + top, cc = 16; cc < cc1 - 16; cc++) {
-                        col = cc + left;
-                        indx = rr * TS + cc;
-                        green[row][col] = ((65535.0f * rgbgreen[indx]));
-
-                        //for dcraw implementation
-                        //for (c=0; c<3; c++){
-                        //  image[indx][c] = CLIP((int)(65535.0f*rgb[rr*TS+cc][c] + 0.5f));
-                        //}
+                    for (; cc < cc1 - 19; cc += 4) {
+                        STVFU(green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v);
                     }
 
 #endif
+
+                    for (; cc < cc1 - 16; cc++) {
+                        green[row][cc + left] = 65535.0f * rgbgreen[rr * TS + cc];
+                    }
                 }
 
                 //end of main loop
@@ -1616,26 +1601,19 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 if(plistener) {
                     progresscounter++;
 
-                    if(progresscounter % 16 == 0) {
-                        #pragma omp critical
+                    if(progresscounter % 32 == 0) {
+#ifdef _OPENMP
+                        #pragma omp critical (amazeprogress)
+#endif
                         {
-                            progress += (double)16 * ((TS - 32) * (TS - 32)) / (height * width);
-
-                            if (progress > 1.0)
-                            {
-                                progress = 1.0;
-                            }
-
+                            progress += (double)32 * ((TS - 32) * (TS - 32)) / (height * width);
+                            progress = progress > 1.0 ? 1.0 : progress;
                             plistener->setProgress(progress);
                         }
                     }
                 }
             }
 
-        // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-
         // clean up
         free(buffer);
     }
@@ -1644,7 +1622,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
         plistener->setProgress(1.0);
     }
 
-
     // done
 
 #undef TS
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index b60b5c9bc..7bc480861 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -4,7 +4,6 @@
 
 #ifdef __GNUC__
 #define INLINE __inline
-//#define INLINE __attribute__((always_inline))
 #else
 #define INLINE inline
 #endif
@@ -48,20 +47,20 @@ typedef __m128i vint2;
 // SSE4.1 => use _mm_blend_ps instead of _mm_set_epi32 and vself
 #define STC2VFU(a,v) {\
                          __m128 TST1V = _mm_loadu_ps(&a);\
-                         __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\
+                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                          _mm_storeu_ps(&a, _mm_blend_ps(TST1V,TST2V,5));\
                          TST1V = _mm_loadu_ps((&a)+4);\
-                         TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\
+                         TST2V = _mm_unpackhi_ps(v,v);\
                          _mm_storeu_ps((&a)+4, _mm_blend_ps(TST1V,TST2V,5));\
                      }
 #else
 #define STC2VFU(a,v) {\
                          __m128 TST1V = _mm_loadu_ps(&a);\
-                         __m128 TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 1,1,0,0 ));\
+                         __m128 TST2V = _mm_unpacklo_ps(v,v);\
                          vmask cmask = _mm_set_epi32(0xffffffff,0,0xffffffff,0);\
                          _mm_storeu_ps(&a, vself(cmask,TST1V,TST2V));\
                          TST1V = _mm_loadu_ps((&a)+4);\
-                         TST2V = _mm_shuffle_ps(v,v,_MM_SHUFFLE( 3,3,2,2 ));\
+                         TST2V = _mm_unpackhi_ps(v,v);\
                          _mm_storeu_ps((&a)+4, vself(cmask,TST1V,TST2V));\
                      }
 #endif
diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index 060f0c4ff..951d397bf 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -78,5 +78,15 @@ inline const _Tp& max(const _Tp& a, const _Tp& b, const _Tp& c, const _Tp& d)
 {
     return std::max(d, std::max(c, std::max(a, b)));
 }
+
+template<typename _Tp>
+inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
+    // calculate a * b + (1 - a) * c
+    // following is valid:
+    // intp(a, b+x, c+x) = vintpf(a, b, c) + x
+    // intp(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c;
+}
+
 }
 #endif
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index 453025fd6..a0300a1cc 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1316,7 +1316,9 @@ return vmaxf( b, vminf(a,c));
 }
 
 static INLINE vfloat ULIMV( vfloat a, vfloat b, vfloat c  ){
-	return vself( vmaskf_lt(b,c), LIMV(a,b,c), LIMV(a,c,b));
+    // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is independent of the order of the arguments
+    // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
+	return vmaxf(vminf(a,b), vminf(vmaxf(a,b),c));
 }
 
 static INLINE vfloat SQRV(vfloat a){
@@ -1324,17 +1326,45 @@ static INLINE vfloat SQRV(vfloat a){
 }
 
 static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
+    // conditional swap the elements of two vfloats
     vfloat temp = vself(condition, a, b); // the values which fit to condition
     condition = vnotm(condition); // invert the condition
     a = vself(condition, a, b); // the values which fit to inverted condition
     b = temp;
 }
 
-static inline float vhadd( vfloat a )
-{
+static inline float vhadd( vfloat a ) {
+    // returns a[0] + a[1] + a[2] + a[3]
     a += _mm_movehl_ps(a, a);
     return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
 }
 
+static INLINE vfloat vmul2f(vfloat a){
+    // fastest way to multiply by 2
+	return a + a;
+}
+
+static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
+    // calculate a * b + (1 - a) * c (interpolate two values)
+    // following is valid:
+    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
+    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c;
+}
+
+static INLINE vfloat vdup(vfloat a){
+    // returns { a[0],a[0],a[1],a[1] }
+    return _mm_unpacklo_ps( a, a );
+}
+
+static INLINE vfloat vaddc2vfu(float &a)
+{
+    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return  _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
+}
+
+
 #endif // __SSE2__
 #endif // SLEEFSSEAVX

From ee665d67908eb9b3652aaf9d42a1923b5aab838f Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 26 Jan 2016 13:10:38 +0100
Subject: [PATCH 2/3] Amaze Demosaic: Speedup, cleaned code, changed nyquist
 code

---
 rtengine/amaze_demosaic_RT.cc | 799 ++++++++++++++++------------------
 rtengine/helpersse2.h         |  11 +-
 rtengine/rt_math.h            |   4 +-
 rtengine/sleefsseavx.c        |  24 +-
 4 files changed, 418 insertions(+), 420 deletions(-)

diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc
index 3b367ee2b..2720521aa 100644
--- a/rtengine/amaze_demosaic_RT.cc
+++ b/rtengine/amaze_demosaic_RT.cc
@@ -9,6 +9,7 @@
 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
 //
 // code dated: May 27, 2010
+// latest modification: Ingo Weyrich, January 25, 2016
 //
 //  amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
 //  it under the terms of the GNU General Public License as published by
@@ -52,9 +53,15 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
     const float clip_pt = 1.0 / initialGain;
     const float clip_pt8 = 0.8 / initialGain;
 
-
-#define TS 160   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
-#define TSH 80   // half of Tile size
+// AMAZETS may be defined externally to override the tile size. On some machines a larger AMAZETS is faster.
+// If AMAZETS is undefined it is set to 160, which is the fastest on modern x86/64 machines
+#ifndef AMAZETS
+#define AMAZETS 160
+#endif
+    // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
+    // We ensure that the tile size is a multiple of 32 in the range [96;992]
+    constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
+    constexpr int tsh = ts / 2; // half of Tile size
 
     //offset of R pixel within a Bayer quartet
     int ex, ey;
@@ -79,27 +86,27 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
     }
 
     //shifts of pointer value to access pixels in vertical and diagonal directions
-    static const int v1 = TS, v2 = 2 * TS, v3 = 3 * TS, p1 = -TS + 1, p2 = -2 * TS + 2, p3 = -3 * TS + 3, m1 = TS + 1, m2 = 2 * TS + 2, m3 = 3 * TS + 3;
+    constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
 
     //tolerance to avoid dividing by zero
-    static const float eps = 1e-5, epssq = 1e-10;       //tolerance to avoid dividing by zero
+    constexpr float eps = 1e-5, epssq = 1e-10;       //tolerance to avoid dividing by zero
 
     //adaptive ratios threshold
-    static const float arthresh = 0.75;
+    constexpr float arthresh = 0.75;
 
     //gaussian on 5x5 quincunx, sigma=1.2
-    static const float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
+    constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
     //nyquist texture test threshold
-    static const float nyqthresh = 0.5;
+    constexpr float nyqthresh = 0.5;
     //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
     // Is this really sigma=1.2????, seems more like sigma = 1.672
-    static const float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
-                                       nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
-                                      };
+    constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
+                                    nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
+                                   };
     //gaussian on 5x5 alt quincunx, sigma=1.5
-    static const float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
+    constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
     //guassian on quincunx grid
-    static const float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
+    constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
 
     typedef struct {
         float h;
@@ -112,115 +119,87 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
     {
         int progresscounter = 0;
 
-#define CLF 1
+        constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
         // assign working space
-        char *buffer = (char *) calloc(13 * sizeof(float) * TS * TS + sizeof(float) * TS * TSH + sizeof(char) * TS * TSH + 18 * CLF * 64 + 63, 1);
+        char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
         // aligned to 64 byte boundary
         char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
 
         // green values
         float *rgbgreen         = (float (*))         data;
         // sum of square of horizontal gradient and square of vertical gradient
-        float *delhvsqsum       = (float (*))         ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64);
+        float *delhvsqsum       = (float (*))         ((char*)rgbgreen + sizeof(float) * ts * ts + cldf * 64);       // 1
         // gradient based directional weights for interpolation
-        float *dirwts0          = (float (*))         ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64);
-        float *dirwts1          = (float (*))         ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64);
-        // vertically interpolated color differences G-R, G-B
-        float *vcd              = (float (*))         ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64);
-        // horizontally interpolated color differences
-        float *hcd              = (float (*))         ((char*)vcd + sizeof(float) * TS * TS + CLF * 64);
+        float *dirwts0          = (float (*))         ((char*)delhvsqsum + sizeof(float) * ts * ts + cldf * 64);     // 1
+        float *dirwts1          = (float (*))         ((char*)dirwts0 + sizeof(float) * ts * ts + cldf * 64);        // 1
+        // vertically interpolated colour differences G-R, G-B
+        float *vcd              = (float (*))         ((char*)dirwts1 + sizeof(float) * ts * ts + cldf * 64);        // 1
+        // horizontally interpolated colour differences
+        float *hcd              = (float (*))         ((char*)vcd + sizeof(float) * ts * ts + cldf * 64);            // 1
         // alternative vertical interpolation
-        float *vcdalt           = (float (*))         ((char*)hcd + sizeof(float) * TS * TS + CLF * 64);
+        float *vcdalt           = (float (*))         ((char*)hcd + sizeof(float) * ts * ts + cldf * 64);            // 1
         // alternative horizontal interpolation
-        float *hcdalt           = (float (*))         ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64);
-        // square of average color difference
-        float *cddiffsq         = (float (*))         ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64);
+        float *hcdalt           = (float (*))         ((char*)vcdalt + sizeof(float) * ts * ts + cldf * 64);         // 1
+        // square of average colour difference
+        float *cddiffsq         = (float (*))         ((char*)hcdalt + sizeof(float) * ts * ts + cldf * 64);         // 1
         // weight to give horizontal vs vertical interpolation
-        float *hvwt             = (float (*))         ((char*)cddiffsq + sizeof(float) * TS * TS + 2 * CLF * 64);
-        // final interpolated color difference
-        float (*Dgrb)[TS * TSH] = (float (*)[TS * TSH])vcdalt; // there is no overlap in buffer usage => share
+        float *hvwt             = (float (*))         ((char*)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64);   // 1
+        // final interpolated colour difference
+        float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
         // gradient in plus (NE/SW) direction
         float *delp             = (float (*))cddiffsq; // there is no overlap in buffer usage => share
         // gradient in minus (NW/SE) direction
-        float *delm             = (float (*))         ((char*)delp + sizeof(float) * TS * TSH + CLF * 64);
+        float *delm             = (float (*))         ((char*)delp + sizeof(float) * ts * tsh + cldf * 64);
         // diagonal interpolation of R+B
         float *rbint            = (float (*))delm; // there is no overlap in buffer usage => share
         // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
-        s_hv  *Dgrb2            = (s_hv  (*))         ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64);
+        s_hv  *Dgrb2            = (s_hv  (*))         ((char*)hvwt + sizeof(float) * ts * tsh + cldf * 64);          // 1
         // difference between up/down interpolations of G
         float *dgintv           = (float (*))Dgrb2;   // there is no overlap in buffer usage => share
         // difference between left/right interpolations of G
-        float *dginth           = (float (*))         ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64);
+        float *dginth           = (float (*))         ((char*)dgintv + sizeof(float) * ts * ts + cldf * 64);         // 1
         // square of diagonal colour differences
-        float *Dgrbsq1m         = (float (*))         ((char*)dginth + sizeof(float) * TS * TS + CLF * 64);
-        float *Dgrbsq1p         = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64);
+        float *Dgrbsq1m         = (float (*))         ((char*)dginth + sizeof(float) * ts * ts + cldf * 64);         // 1
+        float *Dgrbsq1p         = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64);      // 1
         // tile raw data
-        float *cfa              = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64);
+        float *cfa              = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64);      // 1
         // relative weight for combining plus and minus diagonal interpolations
         float *pmwt             = (float (*))delhvsqsum;  // there is no overlap in buffer usage => share
-        // interpolated color difference R-B in minus and plus direction
+        // interpolated colour difference R-B in minus and plus direction
         float *rbm              = (float (*))vcd;  // there is no overlap in buffer usage => share
-        float *rbp              = (float (*))         ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64);
-        // nyquist texture flag 1=nyquist, 0=not nyquist
-        unsigned char *nyquist  = (unsigned char (*)) ((char*)cfa + sizeof(float) * TS * TS + CLF * 64);
-        /*
-                rgbgreen   = (float (*))         data; //pointers to array
-                delhvsqsum = (float (*))         ((char*)rgbgreen + sizeof(float) * TS * TS + CLF * 64);
-                dirwts0    = (float (*))         ((char*)delhvsqsum + sizeof(float) * TS * TS + CLF * 64);
-                dirwts1    = (float (*))         ((char*)dirwts0 + sizeof(float) * TS * TS + CLF * 64);
-                vcd        = (float (*))         ((char*)dirwts1 + sizeof(float) * TS * TS + CLF * 64);
-                hcd        = (float (*))         ((char*)vcd + sizeof(float) * TS * TS + CLF * 64);
-                vcdalt     = (float (*))         ((char*)hcd + sizeof(float) * TS * TS + CLF * 64);
-                hcdalt     = (float (*))         ((char*)vcdalt + sizeof(float) * TS * TS + CLF * 64);
-                cddiffsq   = (float (*))         ((char*)hcdalt + sizeof(float) * TS * TS + CLF * 64);
-                hvwt       = (float (*))         ((char*)cddiffsq + sizeof(float) * TS * TS + CLF * 64);
-                Dgrb       = (float (*)[TS * TSH]) ((char*)hvwt + sizeof(float) * TS * TSH + CLF * 64);
-                delp       = (float (*))         ((char*)Dgrb + sizeof(float) * TS * TS + CLF * 64);
-                delm       = (float (*))         ((char*)delp + sizeof(float) * TS * TSH + CLF * 64);
-                rbint      = (float (*))         ((char*)delm + sizeof(float) * TS * TSH + CLF * 64);
-                Dgrb2      = (s_hv  (*))         ((char*)rbint + sizeof(float) * TS * TSH + CLF * 64);
-                dgintv     = (float (*))         ((char*)Dgrb2 + sizeof(float) * TS * TS + CLF * 64);
-                dginth     = (float (*))         ((char*)dgintv + sizeof(float) * TS * TS + CLF * 64);
-                Dgrbsq1m   = (float (*))         ((char*)dginth + sizeof(float) * TS * TS + CLF * 64);
-                Dgrbsq1p   = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * TS * TSH + CLF * 64);
-                cfa        = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * TS * TSH + CLF * 64);
-                pmwt       = (float (*))         ((char*)cfa + sizeof(float) * TS * TS + CLF * 64);
-                rbm        = (float (*))         ((char*)pmwt + sizeof(float) * TS * TSH + CLF * 64);
-                rbp        = (float (*))         ((char*)rbm + sizeof(float) * TS * TSH + CLF * 64);
-
-                nyquist    = (char (*))          ((char*)rbp + sizeof(float) * TS * TSH + CLF * 64);
-        */
-#undef CLF
+        float *rbp              = (float (*))         ((char*)rbm + sizeof(float) * ts * tsh + cldf * 64);
+        // nyquist texture flags 1=nyquist, 0=not nyquist
+        unsigned char *nyquist  = (unsigned char (*)) ((char*)cfa + sizeof(float) * ts * ts + cldf * 64);            // 1
+        unsigned char *nyquist2 = (unsigned char (*))cddiffsq;
+        float *nyqutest = (float(*)) ((char*)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64);                // 1
 
         // Main algorithm: Tile loop
-
-        // Issue 1676
         // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
 #ifdef _OPENMP
         #pragma omp for schedule(dynamic) collapse(2) nowait
 #endif
 
-        for (int top = winy - 16; top < winy + height; top += TS - 32)
-            for (int left = winx - 16; left < winx + width; left += TS - 32) {
+        for (int top = winy - 16; top < winy + height; top += ts - 32) {
+            for (int left = winx - 16; left < winx + width; left += ts - 32) {
 #ifdef __SSE2__
                 // Using SSE2 we can zero the memory without cache pollution
                 vfloat zerov = ZEROV;
 
-                for(int i = 3 * TSH; i < (TS - 6)*TSH; i += 16) {
+                for(int i = 3 * tsh; i < (ts - 6)*tsh; i += 16) {
                     _mm_stream_ps((float*)&nyquist[i], zerov);
                 }
 
 #else
-                memset(&nyquist[3 * TSH], 0, sizeof(unsigned char) * (TS - 6) * TSH);
+                memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
 #endif
                 //location of tile bottom edge
-                const int bottom = min(top + TS, winy + height + 16);
+                int bottom = min(top + ts, winy + height + 16);
                 //location of tile right edge
-                const int right  = min(left + TS, winx + width + 16);
-                //tile width  (=TS except for right edge of image)
-                const int rr1 = bottom - top;
-                //tile height (=TS except for bottom edge of image)
-                const int cc1 = right - left;
+                int right  = min(left + ts, winx + width + 16);
+                //tile height (=ts except for bottom edge of image)
+                int rr1 = bottom - top;
+                //tile width  (=ts except for right edge of image)
+                int cc1 = right - left;
                 // bookkeeping for borders
                 // min and max row/column in the tile
                 int rrmin = top < winy ? 16 : 0;
@@ -232,54 +211,87 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 // rgb values should be floating point number between 0 and 1
                 // after white balance multipliers are applied
                 // a 16 pixel border is added to each side of the image
+
+                // begin of tile initialization
 #ifdef __SSE2__
-                const vfloat c65535v = F2V( 65535.0f );
+                vfloat c65535v = F2V( 65535.f );
 
                 //fill upper border
                 if (rrmin > 0) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
-                            cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f;
-                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
+                    for (int rr = 0; rr < 16; rr++) {
+                        int row = 32 - rr + top;
+
+                        for (int cc = ccmin; cc < ccmax; cc += 4) {
+                            int indx1 = rr * ts + cc;
+                            vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
+                            STVF(cfa[indx1], tempv);
+                            STVF(rgbgreen[indx1], tempv );
                         }
+                    }
                 }
 
                 // fill inner part
                 for (int rr = rrmin; rr < rrmax; rr++) {
                     int row = rr + top;
-                    int cc = ccmin;
 
-                    for (; cc < ccmax - 3; cc += 4) {
-                        int indx1 = rr * TS + cc;
+                    for (int cc = ccmin; cc < ccmax; cc += 4) {
+                        int indx1 = rr * ts + cc;
                         vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
                         STVF(cfa[indx1], tempv );
                         STVF(rgbgreen[indx1], tempv );
                     }
-
-                    for (; cc < ccmax; cc++) {
-                        int indx1 = rr * TS + cc;
-                        cfa[indx1] = (rawData[row][cc + left]) / 65535.0f;
-                        rgbgreen[indx1] = cfa[indx1];
-                    }
                 }
 
                 //fill lower border
                 if (rrmax < rr1) {
                     for (int rr = 0; rr < 16; rr++)
                         for (int cc = ccmin; cc < ccmax; cc += 4) {
-                            int indx1 = (rrmax + rr) * TS + cc;
+                            int indx1 = (rrmax + rr) * ts + cc;
                             vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v;
                             STVF(cfa[indx1], tempv );
                             STVF(rgbgreen[indx1], tempv );
                         }
                 }
 
+#else
+
+                //fill upper border
+                if (rrmin > 0) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
+                            cfa[rr * ts + cc] = (rawData[row][cc + left]) / 65535.f;
+                            rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
+                        }
+                }
+
+                // fill inner part
+                for (int rr = rrmin; rr < rrmax; rr++) {
+                    int row = rr + top;
+
+                    for (int cc = ccmin; cc < ccmax; cc++) {
+                        int indx1 = rr * ts + cc;
+                        cfa[indx1] = (rawData[row][cc + left]) / 65535.f;
+                        rgbgreen[indx1] = cfa[indx1];
+                    }
+                }
+
+                //fill lower border
+                if (rrmax < rr1) {
+                    for (int rr = 0; rr < 16; rr++)
+                        for (int cc = ccmin; cc < ccmax; cc++) {
+                            cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.f;
+                            rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
+                        }
+                }
+
+#endif
+
                 //fill left border
                 if (ccmin > 0) {
                     for (int rr = rrmin; rr < rrmax; rr++)
                         for (int cc = 0, row = rr + top; cc < 16; cc++) {
-                            cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f;
-                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
+                            cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / 65535.f;
+                            rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
                         }
                 }
 
@@ -287,87 +299,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 if (ccmax < cc1) {
                     for (int rr = rrmin; rr < rrmax; rr++)
                         for (int cc = 0; cc < 16; cc++) {
-                            cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f;
-                            rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
-                        }
-                }
-
-                //also, fill the image corners
-                if (rrmin > 0 && ccmin > 0) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = 0; cc < 16; cc += 4) {
-                            int indx1 = (rr) * TS + cc;
-                            vfloat tempv = LVFU(rawData[winy + 32 - rr][winx + 32 - cc]) / c65535v;
-                            STVF(cfa[indx1], tempv );
-                            STVF(rgbgreen[indx1], tempv );
-                        }
-                }
-
-                if (rrmax < rr1 && ccmax < cc1) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = 0; cc < 16; cc += 4) {
-                            int indx1 = (rrmax + rr) * TS + ccmax + cc;
-                            vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / c65535v;
-                            STVFU(cfa[indx1], tempv );
-                            STVFU(rgbgreen[indx1], tempv );
-                        }
-                }
-
-                if (rrmin > 0 && ccmax < cc1) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f;
-                            rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
-                        }
-                }
-
-                if (rrmax < rr1 && ccmin > 0) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f;
-                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
-                        }
-                }
-
-#else
-
-                for (int rr = rrmin; rr < rrmax; rr++)
-                    for (int row = rr + top, cc = ccmin; cc < ccmax; cc++) {
-                        int indx1 = rr * TS + cc;
-                        cfa[indx1] = (rawData[row][cc + left]) / 65535.0f;
-                        rgbgreen[indx1] = cfa[indx1];
-                    }
-
-                //fill borders
-                if (rrmin > 0) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
-                            cfa[rr * TS + cc] = (rawData[row][cc + left]) / 65535.0f;
-                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                        }
-                }
-
-                if (rrmax < rr1) {
-                    for (int rr = 0; rr < 16; rr++)
-                        for (int cc = ccmin; cc < ccmax; cc++) {
-                            cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.0f;
-                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
-                        }
-                }
-
-                if (ccmin > 0) {
-                    for (int rr = rrmin; rr < rrmax; rr++)
-                        for (int cc = 0, row = rr + top; cc < 16; cc++) {
-                            cfa[rr * TS + cc] = (rawData[row][32 - cc + left]) / 65535.0f;
-                            rgbgreen[rr * TS + cc] = cfa[rr * TS + cc];
-                        }
-                }
-
-                if (ccmax < cc1) {
-                    for (int rr = rrmin; rr < rrmax; rr++)
-                        for (int cc = 0; cc < 16; cc++) {
-                            cfa[rr * TS + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.0f;
-                            rgbgreen[rr * TS + ccmax + cc] = cfa[rr * TS + ccmax + cc];
+                            cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.f;
+                            rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
                         }
                 }
 
@@ -375,43 +308,43 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 if (rrmin > 0 && ccmin > 0) {
                     for (int rr = 0; rr < 16; rr++)
                         for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rr)*TS + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.0f;
-                            rgbgreen[(rr)*TS + cc] = cfa[(rr) * TS + cc];
+                            cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.f;
+                            rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc];
                         }
                 }
 
                 if (rrmax < rr1 && ccmax < cc1) {
                     for (int rr = 0; rr < 16; rr++)
                         for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rrmax + rr)*TS + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.0f;
-                            rgbgreen[(rrmax + rr)*TS + ccmax + cc] = cfa[(rrmax + rr) * TS + ccmax + cc];
+                            cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.f;
+                            rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
                         }
                 }
 
                 if (rrmin > 0 && ccmax < cc1) {
                     for (int rr = 0; rr < 16; rr++)
                         for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rr)*TS + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.0f;
-                            rgbgreen[(rr)*TS + ccmax + cc] = cfa[(rr) * TS + ccmax + cc];
+                            cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.f;
+                            rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc];
                         }
                 }
 
                 if (rrmax < rr1 && ccmin > 0) {
                     for (int rr = 0; rr < 16; rr++)
                         for (int cc = 0; cc < 16; cc++) {
-                            cfa[(rrmax + rr)*TS + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.0f;
-                            rgbgreen[(rrmax + rr)*TS + cc] = cfa[(rrmax + rr) * TS + cc];
+                            cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.f;
+                            rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
                         }
                 }
 
-#endif
+                // end of tile initialization
 
-                //end of border fill
+                // horizontal and vertical gradients
 #ifdef __SSE2__
-                const vfloat epsv = F2V( eps );
+                vfloat epsv = F2V( eps );
 
                 for (int rr = 2; rr < rr1 - 2; rr++) {
-                    for (int indx = rr * TS; indx < rr * TS + cc1; indx += 4) {
+                    for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) {
                         vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) -  LVFU( cfa[indx - 1] ) );
                         vfloat delvv = vabsf( LVF( cfa[indx + v1] ) -  LVF( cfa[indx - v1] ) );
                         STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
@@ -423,53 +356,55 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 #else
 
                 for (int rr = 2; rr < rr1 - 2; rr++)
-                    for (int cc = 2, indx = (rr) * TS + cc; cc < cc1 - 2; cc++, indx++) {
-                        // horizontal and vedrtical gradient
+                    for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) {
                         float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
                         float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
                         dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
-                        dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh; //+fabsf(cfa[indx+2]-cfa[indx-2]);
+                        dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
                         delhvsqsum[indx] = SQR(delh) + SQR(delv);
                     }
 
 #endif
 
-                //interpolate vertical and horizontal color differences
+                //interpolate vertical and horizontal colour differences
 #ifdef __SSE2__
                 vfloat sgnv;
 
                 if( !(FC(4, 4) & 1) ) {
-                    sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f );
+                    sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
                 } else {
-                    sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f );
+                    sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
                 }
 
-                vfloat  zd5v = F2V( 0.5f );
-                vfloat  onev = F2V( 1.0f );
-                vfloat  arthreshv = F2V( arthresh );
-                vfloat  clip_pt8v = F2V( clip_pt8 );
+                vfloat zd5v = F2V( 0.5f );
+                vfloat onev = F2V( 1.f );
+                vfloat arthreshv = F2V( arthresh );
+                vfloat clip_pt8v = F2V( clip_pt8 );
 
                 for (int rr = 4; rr < rr1 - 4; rr++) {
                     sgnv = -sgnv;
 
-                    for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 7; indx += 4) {
-                        //color ratios in each cardinal direction
+                    for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) {
+                        //colour ratios in each cardinal direction
                         vfloat cfav = LVF(cfa[indx]);
                         vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
                         vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
                         vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
                         vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
 
+                        //G interpolated in vert/hor directions using Hamilton-Adams method
                         vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
                         vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
                         vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
                         vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
 
+                        //G interpolated in vert/hor directions using adaptive ratios
                         vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
                         vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
                         vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
                         vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
 
+                        //adaptive weights for vertical/horizontal directions
                         vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
                         vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
 
@@ -477,7 +412,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
                         vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
 
-                        //interpolated color differences
+                        //interpolated colour differences
                         vfloat hcdaltv = sgnv * (Ginthhav - cfav);
                         vfloat vcdaltv = sgnv * (Gintvhav - cfav);
                         STVF(hcdalt[indx], hcdaltv);
@@ -488,13 +423,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         gdarv = vself( clipmask, gdhav, gdarv);
                         glarv = vself( clipmask, glhav, glarv);
                         grarv = vself( clipmask, grhav, grarv);
+
+                        //use HA if highlights are (nearly) clipped
                         STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
                         STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
-                        //differences of interpolations in opposite directions
 
+                        //differences of interpolations in opposite directions
                         STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
                         STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
-
                     }
                 }
 
@@ -503,9 +439,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 for (int rr = 4; rr < rr1 - 4; rr++) {
                     bool fcswitch = FC(rr, 4) & 1;
 
-                    for (int cc = 4, indx = rr * TS + cc; cc < cc1 - 4; cc++, indx++) {
+                    for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) {
 
-                        //color ratios in each cardinal direction
+                        //colour ratios in each cardinal direction
                         float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
                         float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
                         float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
@@ -520,25 +456,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         //G interpolated in vert/hor directions using adaptive ratios
                         float guar, gdar, glar, grar;
 
-                        if (fabsf(1.0f - cru) < arthresh) {
+                        if (fabsf(1.f - cru) < arthresh) {
                             guar = cfa[indx] * cru;
                         } else {
                             guar = guha;
                         }
 
-                        if (fabsf(1.0f - crd) < arthresh) {
+                        if (fabsf(1.f - crd) < arthresh) {
                             gdar = cfa[indx] * crd;
                         } else {
                             gdar = gdha;
                         }
 
-                        if (fabsf(1.0f - crl) < arthresh) {
+                        if (fabsf(1.f - crl) < arthresh) {
                             glar = cfa[indx] * crl;
                         } else {
                             glar = glha;
                         }
 
-                        if (fabsf(1.0f - crr) < arthresh) {
+                        if (fabsf(1.f - crr) < arthresh) {
                             grar = cfa[indx] * crr;
                         } else {
                             grar = grha;
@@ -549,19 +485,19 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 
                         //interpolated G via adaptive weights of cardinal evaluations
-                        float Gintvha = vwt * gdha + (1.0f - vwt) * guha;
-                        float Ginthha = hwt * grha + (1.0f - hwt) * glha;
+                        float Gintvha = vwt * gdha + (1.f - vwt) * guha;
+                        float Ginthha = hwt * grha + (1.f - hwt) * glha;
 
-                        //interpolated color differences
+                        //interpolated colour differences
                         if (fcswitch) {
-                            vcd[indx] = cfa[indx] - (vwt * gdar + (1.0f - vwt) * guar);
-                            hcd[indx] = cfa[indx] - (hwt * grar + (1.0f - hwt) * glar);
+                            vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
+                            hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
                             vcdalt[indx] = cfa[indx] - Gintvha;
                             hcdalt[indx] = cfa[indx] - Ginthha;
                         } else {
-                            //interpolated color differences
-                            vcd[indx] = (vwt * gdar + (1.0f - vwt) * guar) - cfa[indx];
-                            hcd[indx] = (hwt * grar + (1.0f - hwt) * glar) - cfa[indx];
+                            //interpolated colour differences
+                            vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
+                            hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
                             vcdalt[indx] = Gintvha - cfa[indx];
                             hcdalt[indx] = Ginthha - cfa[indx];
                         }
@@ -583,8 +519,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         dginth[indx] = min(SQR(glha - grha), SQR(glar - grar));
 
                     }
-
-
                 }
 
 #endif
@@ -596,9 +530,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 vfloat  sgn3v;
 
                 if( !(FC(4, 4) & 1) ) {
-                    sgnv = _mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f );
+                    sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
                 } else {
-                    sgnv = _mm_set_ps( -1.0f, 1.0f, -1.0f, 1.0f );
+                    sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
                 }
 
                 sgn3v = sgnv + sgnv + sgnv;
@@ -608,7 +542,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     sgnv = -sgnv;
                     sgn3v = -sgn3v;
 
-                    for (int indx = rr * TS + 4; indx < rr * TS + cc1 - 4; indx += 4) {
+                    for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) {
                         vfloat hcdv = LVF( hcd[indx] );
                         vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
                         vfloat hcdaltv = LVF( hcdalt[indx] );
@@ -622,6 +556,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
                         vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
 
+                        //bound the interpolation in regions of high saturation
+                        //vertical and horizontal G interpolations
                         vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
                         vfloat temp2v = sgn3v * hcdv;
                         vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
@@ -651,11 +587,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 #else
 
                 for (int rr = 4; rr < rr1 - 4; rr++) {
-                    for (int cc = 4, indx = rr * TS + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
-                        float hcdvar = 3.0f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
-                        float hcdaltvar = 3.0f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
-                        float vcdvar = 3.0f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
-                        float vcdaltvar = 3.0f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
+                    for (int cc = 4, indx = rr * ts + cc, c = FC(rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
+                        float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
+                        float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
+                        float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
+                        float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
 
                         //choose the smallest variance; this yields a smoother interpolation
                         if (hcdaltvar < hcdvar) {
@@ -667,7 +603,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         }
 
                         //bound the interpolation in regions of high saturation
-
                         //vertical and horizontal G interpolations
                         float Gintv, Ginth;
 
@@ -676,33 +611,31 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             Gintv = -vcd[indx] + cfa[indx]; //B or R
 
                             if (hcd[indx] > 0) {
-                                if (3.0f * hcd[indx] > (Ginth + cfa[indx])) {
+                                if (3.f * hcd[indx] > (Ginth + cfa[indx])) {
                                     hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
                                 } else {
-                                    float hwt = 1.0f - 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
-                                    hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
+                                    float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
+                                    hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
                                 }
                             }
 
                             if (vcd[indx] > 0) {
-                                if (3.0f * vcd[indx] > (Gintv + cfa[indx])) {
+                                if (3.f * vcd[indx] > (Gintv + cfa[indx])) {
                                     vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
                                 } else {
-                                    float vwt = 1.0f - 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
-                                    vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
+                                    float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
+                                    vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
                                 }
                             }
 
                             if (Ginth > clip_pt) {
-                                hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];    //for RT implementation
+                                hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
                             }
 
                             if (Gintv > clip_pt) {
                                 vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
                             }
 
-                            //if (Ginth > pre_mul[c]) hcd[indx]=-ULIM(Ginth,cfa[indx-1],cfa[indx+1])+cfa[indx];//for dcraw implementation
-                            //if (Gintv > pre_mul[c]) vcd[indx]=-ULIM(Gintv,cfa[indx-v1],cfa[indx+v1])+cfa[indx];
 
                         } else {//R or B site
 
@@ -710,25 +643,25 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             Gintv = vcd[indx] + cfa[indx];
 
                             if (hcd[indx] < 0) {
-                                if (3.0f * hcd[indx] < -(Ginth + cfa[indx])) {
+                                if (3.f * hcd[indx] < -(Ginth + cfa[indx])) {
                                     hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
                                 } else {
-                                    float hwt = 1.0f + 3.0f * hcd[indx] / (eps + Ginth + cfa[indx]);
-                                    hcd[indx] = hwt * hcd[indx] + (1.0f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
+                                    float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
+                                    hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
                                 }
                             }
 
                             if (vcd[indx] < 0) {
-                                if (3.0f * vcd[indx] < -(Gintv + cfa[indx])) {
+                                if (3.f * vcd[indx] < -(Gintv + cfa[indx])) {
                                     vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
                                 } else {
-                                    float vwt = 1.0f + 3.0f * vcd[indx] / (eps + Gintv + cfa[indx]);
-                                    vcd[indx] = vwt * vcd[indx] + (1.0f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
+                                    float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
+                                    vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
                                 }
                             }
 
                             if (Ginth > clip_pt) {
-                                hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];    //for RT implementation
+                                hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
                             }
 
                             if (Gintv > clip_pt) {
@@ -750,7 +683,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 vfloat  epssqv = F2V( epssq );
 
                 for (int rr = 6; rr < rr1 - 6; rr++) {
-                    for (int indx = rr * TS + 6 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 6; indx += 8) {
+                    for (int indx = rr * ts + 6 + (FC(rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) {
                         //compute colour difference variances in cardinal directions
                         vfloat tempv = LC2VFU(vcd[indx]);
                         vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
@@ -758,8 +691,8 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
                         vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
 
-                        vfloat hwtv = LC2VFU(dirwts1[indx - 1]) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
-                        vfloat vwtv = LC2VFU(dirwts0[indx - v1]) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
+                        vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
+                        vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
 
                         tempv = LC2VFU(hcd[indx]);
                         vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
@@ -772,7 +705,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
                         vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
 
-                        //compute fluctuations in up/down and left/right interpolations of colors
+                        //compute fluctuations in up/down and left/right interpolations of colours
                         Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
                         Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
 
@@ -796,16 +729,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 #else
 
                 for (int rr = 6; rr < rr1 - 6; rr++) {
-                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
 
-                        //compute color difference variances in cardinal directions
+                        //compute colour difference variances in cardinal directions
 
                         float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
                         float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
                         float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
                         float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
 
-                        //color difference (G-R or G-B) variance in up/down/left/right directions
+                        //colour difference (G-R or G-B) variance in up/down/left/right directions
                         float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
                         float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
                         float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
@@ -814,17 +747,17 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
                         float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
 
-                        float vcdvar = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
-                        float hcdvar = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
+                        float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
+                        float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 
-                        //compute fluctuations in up/down and left/right interpolations of colors
+                        //compute fluctuations in up/down and left/right interpolations of colours
                         Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
                         Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
                         Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
                         Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
 
-                        float vcdvar1 = epssq + vwt * Dgrbvvard + (1.0f - vwt) * Dgrbvvaru;
-                        float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.0f - hwt) * Dgrbhvarl;
+                        float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
+                        float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
 
                         //determine adaptive weights for G interpolation
                         float varwt = hcdvar / (vcdvar + hcdvar);
@@ -837,52 +770,94 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         } else {
                             hvwt[indx >> 1] = diffwt;
                         }
-
                     }
                 }
 
 #endif
 
+#ifdef __SSE2__
+                vfloat gaussg0 = F2V(gaussgrad[0]);
+                vfloat gaussg1 = F2V(gaussgrad[1]);
+                vfloat gaussg2 = F2V(gaussgrad[2]);
+                vfloat gaussg3 = F2V(gaussgrad[3]);
+                vfloat gaussg4 = F2V(gaussgrad[4]);
+                vfloat gaussg5 = F2V(gaussgrad[5]);
+                vfloat gausso0 = F2V(gaussodd[0]);
+                vfloat gausso1 = F2V(gaussodd[1]);
+                vfloat gausso2 = F2V(gaussodd[2]);
+                vfloat gausso3 = F2V(gaussodd[3]);
+
+#endif
+
+                // precompute nyquist
+                for (int rr = 6; rr < rr1 - 6; rr++) {
+                    int cc = 6 + (FC(rr, 2) & 1);
+                    int indx = rr * ts + cc;
+
+#ifdef __SSE2__
+
+                    for (; cc < cc1 - 7; cc += 8, indx += 8) {
+                        vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) +
+                                       gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) +
+                                                  LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) +
+                                       gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) +
+                                                  LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) +
+                                       gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) +
+                                                  LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) -
+                                      (gaussg0 * LC2VFU(delhvsqsum[indx]) +
+                                       gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) +
+                                                  LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) +
+                                       gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) +
+                                                  LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) +
+                                       gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) +
+                                                  LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) +
+                                       gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) +
+                                                  LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) +
+                                                  LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) +
+                                                  LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) +
+                                       gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) +
+                                                  LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
+                        STVFU(nyqutest[indx >> 1], valv);
+
+                    }
+
+#endif
+
+                    for (; cc < cc1 - 6; cc += 2, indx += 2) {
+                        nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] +
+                                               gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
+                                                              cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
+                                               gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
+                                                              cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
+                                               gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
+                                                              cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) -
+                                              (gaussgrad[0] *  delhvsqsum[indx] +
+                                               gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
+                                                               delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
+                                               gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
+                                                               delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
+                                               gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
+                                                               delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
+                                               gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] +
+                                                               delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] +
+                                                               delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] +
+                                                               delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) +
+                                               gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
+                                                               delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
+                    }
+                }
 
                 // Nyquist test
                 int nystartrow = 0;
                 int nyendrow = 0;
-                int nystartcol = TS + 1;
+                int nystartcol = ts + 1;
                 int nyendcol = 0;
 
                 for (int rr = 6; rr < rr1 - 6; rr++) {
-                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                    for (int cc = 6 + (FC(rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
 
                         //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
-                        // TODO_INGO: currently this part needs 10 float mults, 36 float adds, 4 int mults and 44 int adds for every second pixel
-                        // it reads 304 bytes for every second pixel and writes <= 1 byte for every second pixel
-                        // a precalculated vectorized version could do this with 1/4 of the operations
-                        // but it would read 304 bytes for every second pixel and write 8 bytes for every second pixel for the precalculation
-                        // (though the vectorized read should be faster than the scalar version)
-                        // and read 8 bytes for every second pixel and write 1 byte for every second pixel for final calculation (maybe this last step can be avoided too)
-                        float nyqtest1 = gaussodd[0] * cddiffsq[indx] +
-                                         gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
-                                                        cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
-                                         gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
-                                                        cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
-                                         gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
-                                                        cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)]);
-                        float nyqtest2 = gaussgrad[0] * delhvsqsum[indx] +
-                                         gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
-                                                         delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
-                                         gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
-                                                         delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
-                                         gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
-                                                         delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
-                                         gaussgrad[4] * (delhvsqsum[indx - 2 * TS - 1] + delhvsqsum[indx - 2 * TS + 1] +
-                                                         delhvsqsum[indx - TS - 2] + delhvsqsum[indx - TS + 2] +
-                                                         delhvsqsum[indx + TS - 2] + delhvsqsum[indx + TS + 2] +
-                                                         delhvsqsum[indx + 2 * TS - 1] + delhvsqsum[indx + 2 * TS + 1]) +
-                                         gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
-                                                         delhvsqsum[indx - p2] + delhvsqsum[indx + m2]);
-
-
-                        if(nyqtest1 > nyqtest2) {
+                        if(nyqutest[indx >> 1] > 0.f) {
                             nyquist[indx >> 1] = 1;    //nyquist=1 for nyquist region
                             nystartrow = nystartrow ? nystartrow : rr;
                             nyendrow = rr;
@@ -903,46 +878,60 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     nyendrow = std::min(rr1 - 8, nyendrow);
                     nystartcol = std::max(8, nystartcol);
                     nyendcol = std::min(cc1 - 8, nyendcol);
+                    memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
+
+#ifdef __SSE2__
+                    vint fourvb = _mm_set1_epi8(4);
+                    vint onevb = _mm_set1_epi8(1);
+
+#endif
 
                     for (int rr = nystartrow; rr < nyendrow; rr++) {
-                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
-                            // TODO_INGO: if you look at the comments below, it does not seem to be correct to include nyquist[indx >> 1] into the summation
-                            // Also this implementation has loop dependencies, which are not correct IMHO
-                            // An implementation which uses a second buffer could avoid this dependencies and could be vectorized by factor 16 too (we're working with single bytes here)
-                            // That would lead to differences in output compared to current code, but also would lead to more consistent output when changing TS
-                            unsigned int nyquistneighbours = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
-                                                              nyquist[(indx - 2) >> 1] + nyquist[indx >> 1] + nyquist[(indx + 2) >> 1] +
-                                                              nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
+#ifdef __SSE2__
 
-                            //if most of your neighbours are named Nyquist, it's likely that you're one too
-                            if (nyquistneighbours > 4) {
-                                nyquist[indx >> 1] = 1;
-                            }
-
-                            //or not
-                            if (nyquistneighbours < 4) {
-                                nyquist[indx >> 1] = 0;
-                            }
+                        for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) { // one pass = 16 nyquist bytes (32 tile columns); neighbours shifted by +/-2, +/-m1, +/-p1 are not guaranteed to sit on a 16-byte boundary, so they use unaligned loads
+                            vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - m1) >> 1]));
+                            vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - 2) >> 1])); // (indx - 2) >> 1 is one byte before the aligned centre
+                            vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx +  2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - p1) >> 1])); // (indx + 2) >> 1 is one byte after the aligned centre
+                            vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]); // centre flags; aligned load kept, matching the aligned store below
+                            vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1]));
+                            nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
+                            nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
+                            nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v); // per-byte sum of the 8 neighbour flags (centre not included)
+                            valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv); // sum > 4: flag becomes 1, as in the scalar fallback
+                            valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv); // sum < 4: presumably clears the flag like the scalar fallback; sum == 4 keeps the old flag
+                            _mm_store_si128((vint*)&nyquist2[indx >> 1], valv); // result goes to the second buffer so later reads of nyquist see unmodified input
                         }
+
+#else
+
+                        for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
+                            unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
+                                                        nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] +
+                                                        nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
+                            //if most of your neighbours are named Nyquist, it's likely that you're one too, or not
+                            nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
+                        }
+
+#endif
                     }
 
                     // end of Nyquist test
 
                     // in areas of Nyquist texture, do area interpolation
                     for (int rr = nystartrow; rr < nyendrow; rr++)
-                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
+                        for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
 
-                            if (nyquist[indx >> 1]) {
+                            if (nyquist2[indx >> 1]) {
                                 // area interpolation
 
                                 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
 
                                 for (int i = -6; i < 7; i += 2) {
-                                    int indx1 = indx + (i * TS) - 6;
+                                    int indx1 = indx + (i * ts) - 6;
 
                                     for (int j = -6; j < 7; j += 2, indx1 += 2) {
-
-                                        if (nyquist[indx1 >> 1]) {
+                                        if (nyquist2[indx1 >> 1]) {
                                             float cfatemp = cfa[indx1];
                                             sumcfa += cfatemp;
                                             sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
@@ -954,11 +943,10 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                     }
                                 }
 
-                                //horizontal and vertical color differences, and adaptive weight
+                                //horizontal and vertical colour differences, and adaptive weight
                                 sumh = sumcfa - xdiv2f(sumh);
                                 sumv = sumcfa - xdiv2f(sumv);
-                                sumsqh = xdiv2f(sumsqh);
-                                sumsqv = xdiv2f(sumsqv);
+                                areawt = xdiv2f(areawt);
                                 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
                                 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
                                 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
@@ -972,7 +960,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                 //populate G at R/B sites
                 for (int rr = 8; rr < rr1 - 8; rr++)
-                    for (int indx = rr * TS + 8 + (FC(rr, 2) & 1); indx < rr * TS + cc1 - 8; indx += 2) {
+                    for (int indx = rr * ts + 8 + (FC(rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
                         float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
@@ -980,13 +968,13 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
                         //a better result was obtained from the neighbours
 
-                        Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate color differences
+                        Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences
 
                         rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
 
                         //local curvature in G (preparation for nyquist refinement step)
-                        Dgrb2[indx >> 1].h = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
-                        Dgrb2[indx >> 1].v = nyquist[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
+                        Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
+                        Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
                     }
 
 
@@ -995,10 +983,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 // refine Nyquist areas using G curvatures
                 if(doNyquist) {
                     for (int rr = nystartrow; rr < nyendrow; rr++)
-                        // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps
-                        for (int indx = rr * TS + nystartcol + (FC(rr, 2) & 1); indx < rr * TS + nyendcol; indx += 2) {
 
-                            if (nyquist[indx >> 1]) {
+                        // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps
+                        for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
+
+                            if (nyquist2[indx >> 1]) {
                                 //local averages (over Nyquist pixels only) of G curvature squared
                                 float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
                                                        gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
@@ -1020,7 +1009,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                 for (int rr = 6; rr < rr1 - 6; rr++) {
                     if((FC(rr, 2) & 1) == 0) {
-                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
+                        for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
                             vfloat tempv = LC2VFU(cfa[indx + 1]);
                             vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
                             STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
@@ -1030,7 +1019,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
                         }
                     } else {
-                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 8, indx += 8) {
+                        for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
                             vfloat tempv = LC2VFU(cfa[indx]);
                             vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
                             STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
@@ -1046,14 +1035,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                 for (int rr = 6; rr < rr1 - 6; rr++) {
                     if((FC(rr, 2) & 1) == 0) {
-                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                        for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
                             delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
                             delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
                             Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
                             Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
                         }
                     } else {
-                        for (int cc = 6, indx = (rr) * TS + cc; cc < cc1 - 6; cc += 2, indx += 2) {
+                        for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
                             Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
                             Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
                             delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
@@ -1074,9 +1063,9 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 for (int rr = 8; rr < rr1 - 8; rr++) {
 #ifdef __SSE2__
 
-                    for (int indx = rr * TS + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 8; indx += 8, indx1 += 4) {
+                    for (int indx = rr * ts + 8 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) {
 
-                        //diagonal color ratios
+                        //diagonal colour ratios
                         vfloat cfav = LC2VFU(cfa[indx]);
 
                         vfloat temp1v = LC2VFU(cfa[indx + m1]);
@@ -1139,36 +1128,36 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #else
 
-                    for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
+                    for (int cc = 8 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
 
-                        //diagonal color ratios
+                        //diagonal colour ratios
                         float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
                         float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
                         float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
                         float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
-                        //color differences in diagonal directions
+                        //colour differences in diagonal directions
                         float rbse, rbnw, rbne, rbsw;
 
                         //assign B/R at R/B sites
-                        if (fabsf(1.0f - crse) < arthresh) {
+                        if (fabsf(1.f - crse) < arthresh) {
                             rbse = cfa[indx] * crse;    //use this if more precise diag interp is necessary
                         } else {
                             rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
                         }
 
-                        if (fabsf(1.0f - crnw) < arthresh) {
+                        if (fabsf(1.f - crnw) < arthresh) {
                             rbnw = cfa[indx] * crnw;
                         } else {
                             rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
                         }
 
-                        if (fabsf(1.0f - crne) < arthresh) {
+                        if (fabsf(1.f - crne) < arthresh) {
                             rbne = cfa[indx] * crne;
                         } else {
                             rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
                         }
 
-                        if (fabsf(1.0f - crsw) < arthresh) {
+                        if (fabsf(1.f - crsw) < arthresh) {
                             rbsw = cfa[indx] * crsw;
                         } else {
                             rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
@@ -1198,7 +1187,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 rbp[indx1] = ULIM(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
                             } else {
                                 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
-                                rbp[indx1] = pwt * rbp[indx1] + (1.0f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
+                                rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
                             }
                         }
 
@@ -1207,7 +1196,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                                 rbm[indx1] = ULIM(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
                             } else {
                                 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
-                                rbm[indx1] = mwt * rbm[indx1] + (1.0f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
+                                rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
                             }
                         }
 
@@ -1229,7 +1218,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                 for (int rr = 10; rr < rr1 - 10; rr++)
 #ifdef __SSE2__
-                    for (int indx = rr * TS + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 10; indx += 8, indx1 += 4) {
+                    for (int indx = rr * ts + 10 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
                         vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
@@ -1241,7 +1230,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #else
 
-                    for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
+                    for (int cc = 10 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
 
                         //first ask if one gets more directional discrimination from nearby B/R sites
                         float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
@@ -1250,20 +1239,20 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                             pmwt[indx1] = pmwtalt;   //a better result was obtained from the neighbours
                         }
 
-                        rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.0f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
+                        rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
                     }
 
 #endif
 
                 for (int rr = 12; rr < rr1 - 12; rr++)
 #ifdef __SSE2__
-                    for (int indx = rr * TS + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * TS + cc1 - 12; indx += 8, indx1 += 4) {
+                    for (int indx = rr * ts + 12 + (FC(rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) {
                         vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
 
-                        if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the math for all 4 pixels and mask the unused out at the end
+                        if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end
                             //now interpolate G vertically/horizontally using R+B values
-                            //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts
-                            //color ratios for G interpolation
+                            //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
+                            //colour ratios for G interpolation
                             vfloat rbintv = LVFU(rbint[indx1]);
 
                             //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
@@ -1312,16 +1301,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #else
 
-                    for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * TS + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
+                    for (int cc = 12 + (FC(rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
 
                         if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) {
                             continue;
                         }
 
                         //now interpolate G vertically/horizontally using R+B values
-                        //unfortunately, since G interpolation cannot be done diagonally this may lead to color shifts
+                        //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
 
-                        //color ratios for G interpolation
+                        //colour ratios for G interpolation
                         float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
                         float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
                         float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
@@ -1397,7 +1386,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 //fancy chrominance interpolation
                 //(ey,ex) is location of R site
                 for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
-                    for (int indx1 = (rr * TS + 13 - ex) >> 1; indx1 < (rr * TS + cc1 - 12) >> 1; indx1++) { //B coset
+                    for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset
                         Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
                         Dgrb[0][indx1] = 0;
                     }
@@ -1410,7 +1399,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
                 for (int rr = 14; rr < rr1 - 14; rr++)
 #ifdef __SSE2__
-                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
+                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
                         vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
                         vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
                         vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
@@ -1426,11 +1415,11 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
 #else
 
-                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * TS + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
-                        float wtnw = 1.0f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
-                        float wtne = 1.0f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
-                        float wtsw = 1.0f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
-                        float wtse = 1.0f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
+                    for (int cc = 14 + (FC(rr, 2) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
+                        float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
+                        float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
+                        float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
+                        float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
 
                         Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
                                               wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
@@ -1439,15 +1428,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     }
 
 #endif
-                //tile vars
-                //counters for pixel location in the image
-                int row, col;
-                //counters for pixel location within the tile
-                int cc;
-                //pointer counters within the tile
-                int indx;
-
-                // end of tile initialization
 
 #ifdef __SSE2__
                 int offset;
@@ -1465,12 +1445,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 #endif
 
                 for (int rr = 16; rr < rr1 - 16; rr++) {
+                    int row = rr + top;
+                    int col = left + 16;
+                    int indx = rr * ts + 16;
 #ifdef __SSE2__
                     offset = 1 - offset;
                     selmask = vnotm(selmask);
 
-                    for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 18 - (cc1 & 1); cc += 4, indx += 4) {
-                        col = cc + left;
+                    for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) {
                         vfloat greenv = LVF(rgbgreen[indx]);
                         vfloat temp00v = vdup(LVF(hvwt[(indx - v1) >> 1]));
                         vfloat temp01v = vdup(LVF(hvwt[(indx + v1) >> 1]));
@@ -1485,94 +1467,86 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                     }
 
                     if(offset == 0) {
-                        for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
-                            col = cc + left;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                        for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
 
                             indx++;
                             col++;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
                         }
 
                         if(cc1 & 1) { // width of tile is odd
-                            col = cc + left;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
                         }
                     } else {
-                        for (indx = rr * TS + cc; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
-                            col = cc + left;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                        for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
 
                             indx++;
                             col++;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
                         }
 
                         if(cc1 & 1) { // width of tile is odd
-                            col = cc + left;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
                         }
                     }
 
 #else
 
                     if((FC(rr, 2) & 1) == 1) {
-                        for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
-                            col = cc + left;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                        for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
 
                             indx++;
                             col++;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
                         }
 
                         if(cc1 & 1) { // width of tile is odd
-                            col = cc + left;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
                         }
                     } else {
-                        for (cc = 16, indx = rr * TS + cc, row = rr + top; cc < cc1 - 16 - (cc1 & 1); cc += 2, indx++) {
-                            col = cc + left;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                        for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
 
                             indx++;
                             col++;
-                            float temp =  1.0f / (hvwt[(indx - v1) >> 1] + 2.0f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
+                            float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
                                                         temp);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
                                                          temp);
                         }
 
                         if(cc1 & 1) { // width of tile is odd
-                            col = cc + left;
-                            red[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
-                            blue[row][col] = 65535.0f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
+                            red[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
+                            blue[row][col] = 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
                         }
                     }
 
@@ -1586,18 +1560,16 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 #ifdef __SSE2__
 
                     for (; cc < cc1 - 19; cc += 4) {
-                        STVFU(green[row][cc + left], LVF(rgbgreen[rr * TS + cc]) * c65535v);
+                        STVFU(green[row][cc + left], LVF(rgbgreen[rr * ts + cc]) * c65535v);
                     }
 
 #endif
 
                     for (; cc < cc1 - 16; cc++) {
-                        green[row][cc + left] = 65535.0f * rgbgreen[rr * TS + cc];
+                        green[row][cc + left] = 65535.f * rgbgreen[rr * ts + cc];
                     }
                 }
 
-                //end of main loop
-
                 if(plistener) {
                     progresscounter++;
 
@@ -1606,13 +1578,14 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                         #pragma omp critical (amazeprogress)
 #endif
                         {
-                            progress += (double)32 * ((TS - 32) * (TS - 32)) / (height * width);
+                            progress += (double)32 * ((ts - 32) * (ts - 32)) / (height * width);
                             progress = progress > 1.0 ? 1.0 : progress;
                             plistener->setProgress(progress);
                         }
                     }
                 }
             }
+        }  //end of main loop
 
         // clean up
         free(buffer);
@@ -1622,9 +1595,5 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
         plistener->setProgress(1.0);
     }
 
-    // done
-
-#undef TS
-
 }
 }
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 7bc480861..3f2bf6299 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -39,8 +39,15 @@ typedef __m128i vint2;
 #define STVFU(x,y) _mm_storeu_ps(&x,y)
 #endif
 
-// Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
-#define LC2VFU(a) _mm_shuffle_ps( LVFU(a),  _mm_loadu_ps(  (&a) + 4 ), _MM_SHUFFLE( 2,0,2,0 ) )
+
+static INLINE vfloat LC2VFU(float &a) // takes an lvalue; &a is used as the base address of 8 consecutive floats
+{
+    // Load 8 floats a[0]..a[7] with two unaligned loads and return { a[0], a[2], a[4], a[6] } as a vector of 4 floats
+    vfloat a1 = _mm_loadu_ps( &a ); // a[0]..a[3]
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 ); // a[4]..a[7]
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )); // lanes 0 and 2 of a1, then lanes 0 and 2 of a2
+}
+
 
 // Store a vector of 4 floats in a[0],a[2],a[4] and a[6]
 #if defined(__x86_64__) && defined(__SSE4_1__)
diff --git a/rtengine/rt_math.h b/rtengine/rt_math.h
index 951d397bf..44c29fd97 100644
--- a/rtengine/rt_math.h
+++ b/rtengine/rt_math.h
@@ -83,8 +83,8 @@ template<typename _Tp>
 inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c) {
     // calculate a * b + (1 - a) * c
     // following is valid:
-    // intp(a, b+x, c+x) = vintpf(a, b, c) + x
-    // intp(a, b*x, c*x) = vintpf(a, b, c) * x
+    // intp(a, b+x, c+x) = intp(a, b, c) + x
+    // intp(a, b*x, c*x) = intp(a, b, c) * x
     return a * (b-c) + c;
 }
 
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index a0300a1cc..6fed6d3d1 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -910,11 +910,20 @@ static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vca
 	static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
 		return _mm_blendv_ps(y,x,(vfloat)mask);
 	}
+
+	static INLINE vint vselc(vmask mask, vint x, vint y) { // integer counterpart of vself: picks x where mask is set, else y
+		return _mm_blendv_epi8(y,x,mask); // decided per byte by the top bit of each mask byte
+	}
+
 #else
 	// three instructions when using SSE2
 	static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
 		return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
 	}
+
+	static INLINE vint vselc(vmask mask, vint x, vint y) { // SSE2 fallback, same and/andnot/or pattern as vself above
+	    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y)); // x where mask bits are set, else y; bitwise, whereas the _mm_blendv_epi8 variant decides per byte
+	}
 #endif
 
 static INLINE vfloat vselfzero(vmask mask, vfloat x) {
@@ -928,6 +937,16 @@ static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
     return _mm_andnot_ps((vfloat)mask, x);
 }
 
+static INLINE vint vselizero(vmask mask, vint x) {
+     // returns value of x if corresponding mask bits are 1, else returns 0 (integer counterpart of vselfzero)
+     // faster than vselc(mask, x, zero): a single bitwise AND instead of a full select
+    return _mm_and_si128(mask, x); // mask & x
+}
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0 (integer counterpart of vselfnotzero)
+    // faster than vselc(mask, zero, x): a single bitwise AND-NOT instead of a full select
+    return _mm_andnot_si128(mask, x); // (~mask) & x
+}
 
 static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
   vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
@@ -1362,9 +1381,12 @@ static INLINE vfloat vaddc2vfu(float &a)
     // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
     vfloat a1 = _mm_loadu_ps( &a );
     vfloat a2 = _mm_loadu_ps( (&a) + 4 );
-    return  _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
 }
 
+static INLINE vfloat vadivapb (vfloat a, vfloat b) { // name reads as "a div (a plus b)"
+    return a / (a+b); // per-lane a / (a + b); there is no guard against a + b == 0 here
+}
 
 #endif // __SSE2__
 #endif // SLEEFSSEAVX

From ded93005d9cb0cd19d5f2c3d6669a0cccf171668 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Tue, 26 Jan 2016 23:27:52 +0100
Subject: [PATCH 3/3] removed streaming code because after adding _mm_mfence()
 it wasn't faster than the non SSE memset

---
 rtengine/amaze_demosaic_RT.cc | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/rtengine/amaze_demosaic_RT.cc b/rtengine/amaze_demosaic_RT.cc
index 2720521aa..5bc9bb54d 100644
--- a/rtengine/amaze_demosaic_RT.cc
+++ b/rtengine/amaze_demosaic_RT.cc
@@ -181,17 +181,7 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
 
         for (int top = winy - 16; top < winy + height; top += ts - 32) {
             for (int left = winx - 16; left < winx + width; left += ts - 32) {
-#ifdef __SSE2__
-                // Using SSE2 we can zero the memory without cache pollution
-                vfloat zerov = ZEROV;
-
-                for(int i = 3 * tsh; i < (ts - 6)*tsh; i += 16) {
-                    _mm_stream_ps((float*)&nyquist[i], zerov);
-                }
-
-#else
                 memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
-#endif
                 //location of tile bottom edge
                 int bottom = min(top + ts, winy + height + 16);
                 //location of tile right edge
@@ -983,8 +973,6 @@ SSEFUNCTION void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw,
                 // refine Nyquist areas using G curvatures
                 if(doNyquist) {
                     for (int rr = nystartrow; rr < nyendrow; rr++)
-
-                        // TODO_INGO: maybe this part is also worth vectorizing using _mm_movemask_ps
                         for (int indx = rr * ts + nystartcol + (FC(rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
 
                             if (nyquist2[indx >> 1]) {