From dc1d1a561b893ed75ea43a499e7a299c5752ed4d Mon Sep 17 00:00:00 2001
From: Desmis <jdesmis@gmail.com>
Date: Sat, 29 Jun 2019 10:51:09 +0200
Subject: [PATCH] Optimization for FFTW Retinex and Local Contrast

---
 rtdata/languages/default |  3 +-
 rtengine/iplocallab.cc   | 84 ++++++++++++++++++++++++++++++++++++----
 rtengine/ipretinex.cc    | 10 +----
 3 files changed, 81 insertions(+), 16 deletions(-)
diff --git a/rtdata/languages/default b/rtdata/languages/default
index b656e3f8c..438f2005d 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -942,6 +942,7 @@ HISTORY_MSG_695;Local - Soft method
 HISTORY_MSG_696;Local - Retinex Normalize
 HISTORY_MSG_697;Local - TM Normalize
 HISTORY_MSG_698;Local - Local contrast Fast Fourier
+HISTORY_MSG_699;Local - Retinex Fast Fourier
 HISTORY_MSG_CLAMPOOG;Clip out-of-gamut colors
 HISTORY_MSG_COLORTONING_LABGRID_VALUE;CT - Color correction
 HISTORY_MSG_COLORTONING_LABREGION_AB;CT - Color correction
@@ -2030,7 +2031,7 @@ TP_LOCALLAB_LIGHTRETI;Lightness
 TP_LOCALLAB_THRESRETI;Threshold
 TP_LOCALLAB_DENOIS;Denoise
 TP_LOCALLAB_DEHAZ;Dehaze
-TP_LOCALLAB_FFTW;Use Fast Fourier
+TP_LOCALLAB_FFTW;Use Fast Fourier Transform
 TP_LOCALLAB_GRIDONE;Color Toning
 TP_LOCALLAB_GRIDTWO;Direct
 TP_LOCALLAB_LUM;Curves LC
diff --git a/rtengine/iplocallab.cc b/rtengine/iplocallab.cc
index 008669b97..4b9ee24f2 100644
--- a/rtengine/iplocallab.cc
+++ b/rtengine/iplocallab.cc
@@ -6363,7 +6363,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
 
 
 
-// soft light and 
+// soft light and retinex_pde
         if (lp.strng > 0.f && call <= 3 && lp.sfena) {
             int ystart = std::max(static_cast<int>(lp.yc - lp.lyT) - cy, 0);
             int yend = std::min(static_cast<int>(lp.yc + lp.ly) - cy, original->H);
@@ -6534,8 +6534,8 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                             break;
                         }
                     }
-                //printf("FTsizeH =%i FTsizeW=%i \n", ftsizeH, ftsizeW);
-                //optimize with size fftw
+                    //printf("FTsizeH =%i FTsizeW=%i \n", ftsizeH, ftsizeW);
+                    //optimize with size fftw
                     if(ystart == 0 && yend < original->H) lp.ly -= (bfh - ftsizeH);
                     else if (ystart != 0 && yend == original->H) lp.lyT -= (bfh - ftsizeH);
                     else if(ystart != 0 && yend != original->H) {
@@ -6557,7 +6557,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                         bfwr = ftsizeW;
                         reduW = true;
                     }
-                //new values optimized
+                    //new values optimized
                     ystart = std::max(static_cast<int>(lp.yc - lp.lyT) - cy, 0);
                     yend = std::min(static_cast<int>(lp.yc + lp.ly) - cy, original->H);
                     xstart = std::max(static_cast<int>(lp.xc - lp.lxL) - cx, 0);
@@ -6600,10 +6600,35 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                     localContrastParams.amount = params->locallab.spots.at(sp).lcamount;
                     localContrastParams.darkness = params->locallab.spots.at(sp).lcdarkness;
                     localContrastParams.lightness = params->locallab.spots.at(sp).lightness;
-                    bool fftwlc = false;
-                    if(params->locallab.spots.at(sp).fftwlc) fftwlc = true;
+                    bool fftwlc = false;                   
+                    if(!lp.ftwlc){
                         ImProcFunctions::localContrast(tmp1.get(), tmp1->L, localContrastParams, fftwlc, sk);
-
+                    } else { 
+                        std::unique_ptr<LabImage> tmpfftw(new LabImage(bfwr, bfhr));
+#ifdef _OPENMP
+                    #pragma omp parallel for schedule(dynamic,16)
+#endif
+                        for (int y = 0; y < bfhr; y++) {
+                            for (int x = 0; x < bfwr; x++) {
+                            tmpfftw->L[y][x] = tmp1->L[y][x];
+                            tmpfftw->a[y][x] = tmp1->a[y][x];
+                            tmpfftw->b[y][x] = tmp1->b[y][x];
+                            }
+                        }
+                        fftwlc = true;
+                        ImProcFunctions::localContrast(tmpfftw.get(), tmpfftw->L, localContrastParams, fftwlc, sk);
+#ifdef _OPENMP
+                    #pragma omp parallel for schedule(dynamic,16)
+#endif
+                        for (int y = 0; y < bfhr; y++) {
+                            for (int x = 0; x < bfwr; x++) {
+                            tmp1->L[y][x] = tmpfftw->L[y][x];
+                            tmp1->a[y][x] = tmpfftw->a[y][x];
+                            tmp1->b[y][x] = tmpfftw->b[y][x];
+                            }
+                        }
+                      
+                    }
                     float minL =  tmp1->L[0][0] - bufgb->L[0][0];
                     float maxL = minL;
 #ifdef _OPENMP
@@ -6694,6 +6719,51 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
             LabImage *buforigmas = nullptr;
             int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
             int bfw = int (lp.lx + lp.lxL) + del;
+          //  printf("before bfh=%i bfw=%i\n", bfh, bfw);
+            
+            if(lp.ftwreti) {
+                int ftsizeH = 1;
+                int ftsizeW = 1;
+
+                for (int ft=0; ft < N_fftwsize; ft++) {//find best values for FFTW
+                    if(fftw_size[ft] <= bfh) {
+                       ftsizeH = fftw_size[ft];
+                       break;
+                    }
+                }
+                
+                for (int ft=0; ft < N_fftwsize; ft++) {
+                    if(fftw_size[ft] <= bfw) {
+                        ftsizeW = fftw_size[ft];
+                        break;
+                    }
+                }
+                
+                int ystart = std::max(static_cast<int>(lp.yc - lp.lyT) - cy, 0);
+                int xstart = std::max(static_cast<int>(lp.xc - lp.lxL) - cx, 0);
+                int yend = std::min(static_cast<int>(lp.yc + lp.ly) - cy, original->H);
+                int xend = std::min(static_cast<int>(lp.xc + lp.lx) - cx, original->W);
+
+                if(ystart == 0 && yend < original->H) lp.ly -= (bfh - ftsizeH);
+                else if (ystart != 0 && yend == original->H) lp.lyT -= (bfh - ftsizeH);
+                else if(ystart != 0 && yend != original->H) {
+                    if(lp.ly <= lp.lyT) lp.lyT -= (bfh - ftsizeH);
+                    else lp.ly -= (bfh - ftsizeH);
+                }
+
+                if(xstart == 0 && xend < original->W) lp.lx -= (bfw - ftsizeW);
+                else if(xstart != 0 && xend == original->W) lp.lxL -= (bfw - ftsizeW);
+                else if(xstart != 0 && xend != original->W) {
+                    if(lp.lx <= lp.lxL) lp.lxL -= (bfw - ftsizeW);
+                    else lp.lx -= (bfw - ftsizeW);
+                }
+            //new bfw, bfh size is not optimized if spot H > height or spot W > width ==> TODO
+            bfh = int (lp.ly + lp.lyT) + del;
+            bfw = int (lp.lx + lp.lxL) + del;
+            //printf("after bfh=%i bfw=%i  fftwH=%i fftww=%i\n", bfh, bfw, ftsizeH, ftsizeW);
+                
+            }
+
             array2D<float> buflight(bfw, bfh);
             JaggedArray<float> bufchro(bfw, bfh);
 
diff --git a/rtengine/ipretinex.cc b/rtengine/ipretinex.cc
index 1d34aa13d..eae962483 100644
--- a/rtengine/ipretinex.cc
+++ b/rtengine/ipretinex.cc
@@ -961,9 +961,9 @@ void ImProcFunctions::MSRLocal(int sp, bool fftw, int lum, LabImage * bufreti, L
         }
 
         float *buffer = new float[W_L * H_L];
-        float mulradiusfftw = 20.f;
+        float mulradiusfftw = 40.f;
         for (int scale = scal - 1; scale >= 0; scale--) {
-                printf("retscale=%f scale=%i \n", RetinexScales[scale], scale);
+            //    printf("retscale=%f scale=%i \n", mulradiusfftw * RetinexScales[scale], scale);
         if(!fftw) {
 #ifdef _OPENMP
             #pragma omp parallel  //disabled with FFTW
@@ -973,13 +973,9 @@ void ImProcFunctions::MSRLocal(int sp, bool fftw, int lum, LabImage * bufreti, L
                 if (scale == scal - 1)
                 {
                     gaussianBlur(src, out, W_L, H_L, RetinexScales[scale], buffer);
-                    //ImProcFunctions::fftw_convol_blur2(src, out, W_L, H_L, RetinexScales[scale], 0);
                 } else   // reuse result of last iteration
                 {
                     // out was modified in last iteration => restore it
- 
-                   // ImProcFunctions::fftw_convol_blur2(out, out, W_L, H_L,sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), 0);
-
                     gaussianBlur(out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer);
                 }
             }
@@ -990,9 +986,7 @@ void ImProcFunctions::MSRLocal(int sp, bool fftw, int lum, LabImage * bufreti, L
                 } else   // reuse result of last iteration
                 {
                     // out was modified in last iteration => restore it
- 
                    ImProcFunctions::fftw_convol_blur2(out, out, W_L, H_L,sqrtf(SQR(mulradiusfftw * RetinexScales[scale]) - SQR(mulradiusfftw * RetinexScales[scale + 1])), 0);
-
                 }
             
         }