diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h index e5706fe9f..47a058fe1 100644 --- a/rtengine/improcfun.h +++ b/rtengine/improcfun.h @@ -301,6 +301,7 @@ public: void normalize_mean_dt(float *data, const float *ref, size_t size); void retinex_pde(float *datain, float * dataout, int bfw, int bfh, float thresh, float multy, float *dE); void fftw_convol_blur(float *input, float *output, int bfw, int bfh, float radius, int fftkern); + void fftw_tile_blur(int GW, int GH, int tilssize , int max_numblox_W, int min_numblox_W, float **tmp1, int numThreads, double radius); void MSRLocal(int sp, int lum, LabImage * bufreti, LabImage * bufmask, LabImage * buforig, LabImage * buforigmas, float** luminance, float** templ, const float* const *originalLuminance, const int width, const int height, const procparams::LocallabParams &loc, const int skip, const LocretigainCurve &locRETgainCcurve, const int chrome, const int scall, const float krad, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax, const LocCCmaskretiCurve & locccmasretiCurve, bool &lcmasretiutili, const LocLLmaskretiCurve & locllmasretiCurve, bool &llmasretiutili, const LocHHmaskretiCurve & lochhmasretiCurve, bool & lhmasretiutili, int llretiMask, LabImage * transformed, bool retiMasktmap, bool retiMask); diff --git a/rtengine/iplocallab.cc b/rtengine/iplocallab.cc index 676ab42d1..38a392c8f 100644 --- a/rtengine/iplocallab.cc +++ b/rtengine/iplocallab.cc @@ -45,6 +45,7 @@ #define offset 25 // shift between tiles #define fTS ((TS/2+1)) // second dimension of Fourier tiles #define blkrad 1 // radius of block averaging +#define offset2 25 // shift between tiles #define epsilon 0.001f/(TS*TS) //tolerance #define MAXSCOPE 1.25f @@ -3757,7 +3758,8 @@ static float *retinex_poisson_dct(float *data, size_t nx, size_t ny, double m) * * @author Nicolas Limare */ - + BENCHFUN + double *cosx = NULL, *cosy = NULL; size_t i; double m2; @@ -3800,9 +3802,11 @@ static float *retinex_poisson_dct(float *data, size_t nx, size_t ny, double m) static float *discrete_laplacian_threshold(float *data_out, const float *data_in, size_t nx, size_t ny, float t) { + BENCHFUN + size_t i, j; float *ptr_out; - float diff; + float diff=0.f; /* pointers to the current and neighbour values */ const float *ptr_in, *ptr_in_xm1, *ptr_in_xp1, *ptr_in_ym1, *ptr_in_yp1; @@ -3826,7 +3830,7 @@ static float *discrete_laplacian_threshold(float *data_out, const float *data_in ptr_out = data_out; for (j = 0; j < ny; j++) { for (i = 0; i < nx; i++) { - *ptr_out = 0.; + *ptr_out = 0.f; /* row differences */ if (0 < i) { diff = *ptr_in - *ptr_in_xm1; @@ -3875,6 +3879,7 @@ void ImProcFunctions::retinex_pde(float *datain, float * dataout, int bfw, int b * adapted by Jacques Desmis 2019 */ + BENCHFUN /* #ifdef RT_FFTW3F_OMP @@ -3882,6 +3887,9 @@ void ImProcFunctions::retinex_pde(float *datain, float * dataout, int bfw, int b fftwf_init_threads(); fftwf_plan_with_nthreads ( omp_get_max_threads() ); } + +// #else +// fftwf_plan_with_nthreads( 2 ); #endif */ fftwf_plan dct_fw, dct_fw04, dct_bw; @@ -3974,8 +3982,8 @@ void ImProcFunctions::fftw_convol_blur(float *input, float *output, int bfw, int BENCHFUN float *out; //for FFT datas - float *kern;//for kernel gauss - float *outkern;//for FFT kernel + float *kern = nullptr;//for kernel gauss + float *outkern = nullptr;//for FFT kernel fftwf_plan p, pkern;//plan for FFT int image_size, image_sizechange; @@ -4055,11 +4063,220 @@ void ImProcFunctions::fftw_convol_blur(float *input, float *output, int bfw, int } +void ImProcFunctions::fftw_tile_blur(int GW, int GH, int tilssize, int max_numblox_W, int min_numblox_W, float **tmp1, int numThreads, double radius) +{ + BENCHFUN + float epsil = 0.001f/(tilssize * tilssize); + fftwf_plan plan_forward_blox[2]; + fftwf_plan plan_backward_blox[2]; + + array2D tilemask_in(tilssize, tilssize); + array2D tilemask_out(tilssize, tilssize); + + float *Lbloxtmp = reinterpret_cast(fftwf_malloc(max_numblox_W * tilssize * tilssize * sizeof(float))); + float *fLbloxtmp = reinterpret_cast(fftwf_malloc(max_numblox_W * tilssize * tilssize * sizeof(float))); + + int nfwd[2] = {tilssize, tilssize}; + + //for DCT: + fftw_r2r_kind fwdkind[2] = {FFTW_REDFT10, FFTW_REDFT10}; + fftw_r2r_kind bwdkind[2] = {FFTW_REDFT01, FFTW_REDFT01}; + + // Creating the plans with FFTW_MEASURE instead of FFTW_ESTIMATE speeds up the execute a bit + plan_forward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, Lbloxtmp, nullptr, 1, tilssize * tilssize, fLbloxtmp, nullptr, 1, tilssize * tilssize, fwdkind, FFTW_MEASURE | FFTW_DESTROY_INPUT); + plan_backward_blox[0] = fftwf_plan_many_r2r(2, nfwd, max_numblox_W, fLbloxtmp, nullptr, 1, tilssize * tilssize, Lbloxtmp, nullptr, 1, tilssize * tilssize, bwdkind, FFTW_MEASURE | FFTW_DESTROY_INPUT); + plan_forward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, Lbloxtmp, nullptr, 1, tilssize * tilssize, fLbloxtmp, nullptr, 1, tilssize * tilssize, fwdkind, FFTW_MEASURE | FFTW_DESTROY_INPUT); + plan_backward_blox[1] = fftwf_plan_many_r2r(2, nfwd, min_numblox_W, fLbloxtmp, nullptr, 1, tilssize * tilssize, Lbloxtmp, nullptr, 1, tilssize * tilssize, bwdkind, FFTW_MEASURE | FFTW_DESTROY_INPUT); + fftwf_free(Lbloxtmp); + fftwf_free(fLbloxtmp); + const int border = MAX(2, tilssize / 16); + + for (int i = 0; i < tilssize; ++i) { + float i1 = abs((i > tilssize / 2 ? i - tilssize + 1 : i)); + float vmask = (i1 < border ? SQR(sin((rtengine::RT_PI_F * i1) / (2 * border))) : 1.0f); + float vmask2 = (i1 < 2 * border ? SQR(sin((rtengine::RT_PI_F * i1) / (2 * border))) : 1.0f); + + for (int j = 0; j < tilssize; ++j) { + float j1 = abs((j > tilssize / 2 ? j - tilssize + 1 : j)); + tilemask_in[i][j] = (vmask * (j1 < border ? SQR(sin((rtengine::RT_PI_F * j1) / (2 * border))) : 1.0f)) + epsil; + tilemask_out[i][j] = (vmask2 * (j1 < 2 * border ? SQR(sin((rtengine::RT_PI_F * j1) / (2 * border))) : 1.0f)) + epsil; + + } + } + + float *LbloxArray[numThreads]; + float *fLbloxArray[numThreads]; + + const int numblox_W = ceil((static_cast(GW)) / (offset2)) + 2 * blkrad; + const int numblox_H = ceil((static_cast(GH)) / (offset2)) + 2 * blkrad; + + array2D Lresult(GW, GH, ARRAY2D_CLEAR_DATA); + array2D totwt(GW, GH, ARRAY2D_CLEAR_DATA); //weight for combining DCT blocks + + for (int i = 0; i < numThreads; ++i) { + LbloxArray[i] = reinterpret_cast(fftwf_malloc(max_numblox_W * tilssize * tilssize * sizeof(float))); + fLbloxArray[i] = reinterpret_cast(fftwf_malloc(max_numblox_W * tilssize * tilssize * sizeof(float))); + } + +#ifdef _OPENMP + int masterThread = omp_get_thread_num(); +#endif +#ifdef _OPENMP + #pragma omp parallel +#endif + { +#ifdef _OPENMP + int subThread = masterThread * 1 + omp_get_thread_num(); +#else + int subThread = 0; +#endif + float *Lblox = LbloxArray[subThread]; + float *fLblox = fLbloxArray[subThread]; + float pBuf[GW + tilssize + 2 * blkrad * offset2] ALIGNED16; +#ifdef _OPENMP + #pragma omp for +#endif + + for (int vblk = 0; vblk < numblox_H; ++vblk) { + + int top = (vblk - blkrad) * offset2; + float * datarow = pBuf + blkrad * offset2; + + for (int i = 0; i < tilssize; ++i) { + int row = top + i; + int rr = row; + + if (row < 0) { + rr = MIN(-row, GH - 1); + } else if (row >= GH) { + rr = MAX(0, 2 * GH - 2 - row); + } + + for (int j = 0; j < GW; ++j) { + datarow[j] = (tmp1[rr][j]); + } + + for (int j = -blkrad * offset2; j < 0; ++j) { + datarow[j] = datarow[MIN(-j, GW - 1)]; + } + + for (int j = GW; j < GW + tilssize + blkrad * offset2; ++j) { + datarow[j] = datarow[MAX(0, 2 * GW - 2 - j)]; + }//now we have a padded data row + + for (int hblk = 0; hblk < numblox_W; ++hblk) { + int left = (hblk - blkrad) * offset2; + int indx = (hblk) * tilssize; //index of block in malloc + + if (top + i >= 0 && top + i < GH) { + int j; + + for (j = 0; j < min((-left), tilssize); ++j) { + Lblox[(indx + i)*tilssize + j] = tilemask_in[i][j] * datarow[left + j]; // luma data + } + + for (; j < min(tilssize, GW - left); ++j) { + Lblox[(indx + i)*tilssize + j] = tilemask_in[i][j] * datarow[left + j]; // luma data + totwt[top + i][left + j] += tilemask_in[i][j] * tilemask_out[i][j]; + } + + for (; j < tilssize; ++j) { + Lblox[(indx + i)*tilssize + j] = tilemask_in[i][j] * datarow[left + j]; // luma data + } + } else { + for (int j = 0; j < tilssize; ++j) { + Lblox[(indx + i)*tilssize + j] = tilemask_in[i][j] * datarow[left + j]; // luma data + } + } + + } + + }//end of filling block row + + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + //fftwf_print_plan (plan_forward_blox); + if (numblox_W == max_numblox_W) { + fftwf_execute_r2r(plan_forward_blox[0], Lblox, fLblox); // DCT an entire row of tiles + } else { + fftwf_execute_r2r(plan_forward_blox[1], Lblox, fLblox); // DCT an entire row of tiles + } + + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + double n_x = rtengine::RT_PI/(double) tilssize; + double n_y = rtengine::RT_PI/(double) tilssize; + n_x = n_x * n_x; + n_y = n_y * n_y; + //radius = 30.f; + for (int hblk = 0; hblk < numblox_W; ++hblk) { + int blkstart = hblk * tilssize * tilssize; + for(int j = 0; j < tilssize; j++){ + int index = j * tilssize; + for(int i = 0; i < tilssize; i++) + fLblox[blkstart + index + i] *= exp((float)(-radius)*(n_x * i * i + n_y * j * j)); + } + }//end of horizontal block loop + + //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + //now perform inverse FT of an entire row of blocks + if (numblox_W == max_numblox_W) { + fftwf_execute_r2r(plan_backward_blox[0], fLblox, Lblox); //for DCT + } else { + fftwf_execute_r2r(plan_backward_blox[1], fLblox, Lblox); //for DCT + } + + int topproc = (vblk - blkrad) * offset2; + const int numblox_W = ceil((static_cast(GW)) / (offset2)); + const float DCTnorm = 1.0f / (4 * tilssize * tilssize); //for DCT + + int imin = MAX(0, - topproc); + int bottom = MIN(topproc + tilssize, GH); + int imax = bottom - topproc; + for (int i = imin; i < imax; ++i) { + for (int hblk = 0; hblk < numblox_W; ++hblk) { + int left = (hblk - blkrad) * offset2; + int right = MIN(left + tilssize, GW); + int jmin = MAX(0, -left); + int jmax = right - left; + int indx = hblk * tilssize; + for (int j = jmin; j < jmax; ++j) { + Lresult[topproc + i][left + j] += tilemask_out[i][j] * Lblox[(indx + i) * tilssize + j] * DCTnorm; //for DCT + } + } + } + }//end of vertical block loop + } +//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#ifdef _OPENMP + + #pragma omp parallel for +#endif + + for (int i = 0; i < GH; ++i) { + for (int j = 0; j < GW; ++j) { + tmp1[i][j] = Lresult[i][j] / totwt[i][j]; + tmp1[i][j] = CLIPLOC(tmp1[i][j]); + } + } + + for (int i = 0; i < numThreads; ++i) { + fftwf_free(LbloxArray[i]); + fftwf_free(fLbloxArray[i]); + } + + fftwf_destroy_plan(plan_forward_blox[0]); + fftwf_destroy_plan(plan_backward_blox[0]); + fftwf_destroy_plan(plan_forward_blox[1]); + fftwf_destroy_plan(plan_backward_blox[1]); + fftwf_cleanup(); +} void ImProcFunctions::fftw_denoise(int GW, int GH, int max_numblox_W, int min_numblox_W, float **tmp1, array2D *Lin, int numThreads, const struct local_params & lp, int chrom) { + BENCHFUN fftwf_plan plan_forward_blox[2]; fftwf_plan plan_backward_blox[2]; @@ -6064,6 +6281,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o // soft light if (lp.strng > 0.f && call <= 3 && lp.sfena) { + const int ystart = std::max(static_cast(lp.yc - lp.lyT) - cy, 0); const int yend = std::min(static_cast(lp.yc + lp.ly) - cy, original->H); const int xstart = std::max(static_cast(lp.xc - lp.lxL) - cx, 0); @@ -6095,6 +6313,8 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o if(lp.softmet == 0) { ImProcFunctions::softLight(bufexpfin.get(), softLightParams); } else if(lp.softmet == 1) { + MyMutex::MyLock lock(*fftwMutex); + float *datain = new float[bfw*bfh]; float *dataout = new float[bfw*bfh]; float *dE = new float[bfw*bfh]; @@ -6933,6 +7153,25 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o bool ctoning = (a_scale != 0.f || b_scale != 0.f || a_base != 0.f || b_base != 0.f); if (!lp.inv && (lp.chro != 0 || lp.ligh != 0.f || lp.cont != 0 || ctoning || lp.qualcurvemet != 0 || lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 4 || lp.showmaskcolmet == 5) && lp.colorena) { // || lllocalcurve)) { //interior ellipse renforced lightness and chroma //locallutili +/* +//test for fftw blur with tiles.... + int GW = original->W; + int GH = original->H; + MyMutex::MyLock lock (*fftwMutex); + + double radius = 80.f; + int tilssize = 512; +#ifdef _OPENMP + const int numThreads = omp_get_max_threads(); +#else + const int numThreads = 1; + +#endif + int max_numblox_W = ceil((static_cast(GW)) / (offset2)) + 2 * blkrad; + // calculate min size of numblox_W. + int min_numblox_W = ceil((static_cast(GW)) / (offset2)) + 2 * blkrad; + fftw_tile_blur(GW, GH, tilssize , max_numblox_W, min_numblox_W, original->L, numThreads, radius); +*/ const int ystart = std::max(static_cast(lp.yc - lp.lyT) - cy, 0); const int yend = std::min(static_cast(lp.yc + lp.ly) - cy, original->H); const int xstart = std::max(static_cast(lp.xc - lp.lxL) - cx, 0);