From 0f21a0de6c1339fa7e8364d736af11dc1838b6f5 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Wed, 7 Sep 2016 16:44:16 +0200 Subject: [PATCH 1/5] Speedups for denoise, fixes #3418 --- rtengine/FTblockDN.cc | 97 ++++++++++++++++++----------------- rtengine/color.cc | 61 ++++++++++++++++++++++ rtengine/color.h | 18 +++++-- rtengine/dcrop.cc | 4 +- rtengine/improccoordinator.cc | 1 + rtengine/improccoordinator.h | 8 +++ rtengine/sleefsseavx.c | 37 +++++++++++++ 7 files changed, 174 insertions(+), 52 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 69e846f3c..4ccc63c3a 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -37,7 +37,8 @@ #include "opthelper.h" #include "cplx_wavelet_dec.h" #include "median.h" - +#define BENCHMARK +#include "StopWatch.h" #ifdef _OPENMP #include #endif @@ -426,6 +427,7 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH}; SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi) { +BENCHFUN //#ifdef _DEBUG MyTime t1e, t2e; t1e.set(); @@ -588,19 +590,19 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef } } - float gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; LUTf gamcurve(65536, LUT_CLIP_BELOW); + float gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; + MyTime t1e, t2e; + t1e.set(); if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamma(static_cast(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f; - } + Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamman(static_cast(i) / 65535.0, gam)) * 32768.0f; - } + Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } + t2e.set(); + printf("gamcurve performed in %d usec:\n", t2e.etime(t1e)); // inverse gamma transform for output data float igam = 1.f / gam; @@ -609,15 +611,16 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef LUTf igamcurve(65536, LUT_CLIP_BELOW); + MyTime t11e, t21e; + t11e.set(); + if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - igamcurve[i] = (Color::gamma(static_cast(i) / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - } + Color::gammaf2lut(igamcurve, igam, igamthresh, igamslope, 32768.f, 65535.f); } else { - for (int i = 0; i < 65536; ++i) { - igamcurve[i] = (Color::gamman(static_cast(i) / 32768.0f, igam) * 65535.0f); - } + Color::gammanf2lut(igamcurve, igam, 32768.f, 65535.f); } + t21e.set(); + printf("igamcurve performed in %d usec:\n", t21e.etime(t11e)); const float gain = pow (2.0f, float(expcomp)); float noisevar_Ldetail = SQR(static_cast(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f); @@ -922,9 +925,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef B_ = (*denoiseigamtab)[B_]; //apply gamma noise standard (slider) - R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.0f); - G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.0f); - B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.0f); + R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); + G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f); + B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f); //true conversion xyz=>Lab float X, Y, Z; @@ -966,9 +969,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float Y = gain * src->g(i, j); float Z = gain * src->b(i, j); //conversion colorspace to determine luminance with no gamma - X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); + X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f); //end chroma labdn->L[i1][j1] = Y; labdn->a[i1][j1] = (X - Y); @@ -1009,9 +1012,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float btmp = Color::igammatab_srgb[ src->b(i, j) ]; //modification Jacques feb 2013 // gamma slider different from raw - rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast(rtmp) / 65535.0, gam) * 32768.0f); - gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast(gtmp) / 65535.0, gam) * 32768.0f); - btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast(btmp) / 65535.0, gam) * 32768.0f); + rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f); + gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f); + btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f); float X, Y, Z; Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp); @@ -1650,9 +1653,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float Z = Y - (labdn->b[i1][j1]); - X = X < 32768.0f ? igamcurve[X] : (Color::gamma(X / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - Y = Y < 32768.0f ? igamcurve[Y] : (Color::gamma(Y / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - Z = Z < 32768.0f ? igamcurve[Z] : (Color::gamma(Z / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); + X = X < 32768.f ? igamcurve[X] : (Color::gammaf(X / 32768.f, igam, igamthresh, igamslope) * 65535.f); + Y = Y < 32768.f ? igamcurve[Y] : (Color::gammaf(Y / 32768.f, igam, igamthresh, igamslope) * 65535.f); + Z = Z < 32768.f ? igamcurve[Z] : (Color::gammaf(Z / 32768.f, igam, igamthresh, igamslope) * 65535.f); if (numtiles == 1) { dsttmp->r(i, j) = newGain * X; @@ -1695,9 +1698,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float r_, g_, b_; Color::xyz2rgb(X, Y, Z, r_, g_, b_, wip); //gamma slider is different from Raw - r_ = r_ < 32768.0f ? igamcurve[r_] : (Color::gamman(r_ / 32768.0f, igam) * 65535.0f); - g_ = g_ < 32768.0f ? igamcurve[g_] : (Color::gamman(g_ / 32768.0f, igam) * 65535.0f); - b_ = b_ < 32768.0f ? igamcurve[b_] : (Color::gamman(b_ / 32768.0f, igam) * 65535.0f); + r_ = r_ < 32768.f ? igamcurve[r_] : (Color::gammanf(r_ / 32768.f, igam) * 65535.f); + g_ = g_ < 32768.f ? igamcurve[g_] : (Color::gammanf(g_ / 32768.f, igam) * 65535.f); + b_ = b_ < 32768.f ? igamcurve[b_] : (Color::gammanf(b_ / 32768.f, igam) * 65535.f); if (numtiles == 1) { dsttmp->r(i, j) = newGain * r_; @@ -2982,7 +2985,7 @@ void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition & } } -void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) +SSEFUNCTION void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) { gam = dnparams.gamma; gamthresh = 0.001f; @@ -2995,18 +2998,20 @@ void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoisePa } } - gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; bool denoiseMethodRgb = (dnparams.dmethod == "RGB"); + MyTime t1e, t2e; + t1e.set(); + if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamma(static_cast(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f; - } + gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; + Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamman(static_cast(i) / 65535.0, gam)) * 32768.0f; - } + Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } + t2e.set(); + printf("gamcurve in RGB_denoise_infoGamCurve performed in %d usec:\n", t2e.etime(t1e)); + } void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc) @@ -3424,9 +3429,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat B_ = (*denoiseigamtab)[B_]; //apply gamma noise standard (slider) - R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gamman(static_cast(R_) / 65535.0, gam) * 32768.0f); - G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gamman(static_cast(G_) / 65535.0, gam) * 32768.0f); - B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gamman(static_cast(B_) / 65535.0, gam) * 32768.0f); + R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); + G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f); + B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f); //true conversion xyz=>Lab float X, Y, Z; Color::rgbxyz(R_, G_, B_, X, Y, Z, wp); @@ -3451,9 +3456,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat float Y = gain * src->g(i, j); float Z = gain * src->b(i, j); - X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); + X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f); labdn->a[i1][j1] = (X - Y); labdn->b[i1][j1] = (Y - Z); @@ -3480,9 +3485,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat float btmp = Color::igammatab_srgb[ src->b(i, j) ]; //modification Jacques feb 2013 // gamma slider different from raw - rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast(rtmp) / 65535.0, gam) * 32768.0f); - gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast(gtmp) / 65535.0, gam) * 32768.0f); - btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast(btmp) / 65535.0, gam) * 32768.0f); + rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f); + gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f); + btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f); float X, Y, Z; Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp); diff --git a/rtengine/color.cc b/rtengine/color.cc index 872f47522..fa4a79840 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -1478,6 +1478,67 @@ void Color::calcGamma (double pwr, double ts, int mode, int imax, double &gamma0 return; } } +void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor) +{ +#ifdef __SSE2__ + // SSE2 version is more than 6 times faster than scalar version + vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat fourv = F2V(4.f); + vfloat gammav = F2V(1.f / gamma); + vfloat slopev = F2V((slope / divisor) * factor); + vfloat divisorv = F2V(xlogf(divisor)); + vfloat factorv = F2V(factor); + vfloat comparev = F2V(start * divisor); + int border = start * divisor; + int border1 = border - (border & 3); + int border2 = border1 + 4; + int i = 0; + for(; i < border1; i += 4) { + vfloat resultv = iv * slopev; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } + for(; i < border2; i += 4) { + vfloat result0v = iv * slopev; + vfloat result1v = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v)); + iv += fourv; + } + for(; i < 65536; i += 4) { + vfloat resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } +#else + for (int i = 0; i < 65536; ++i) { + gammacurve[i] = gammaf(static_cast(i) / divisor, gamma, start, slope) * factor; + } +#endif +} + +void Color::gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor) //standard gamma without slope... +{ +#ifdef __SSE2__ + // SSE2 version is more than 6 times faster than scalar version + vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat fourv = F2V(4.f); + vfloat gammav = F2V(1.f / gamma); + vfloat divisorv = F2V(xlogf(divisor)); + vfloat factorv = F2V(factor); + vfloat resultv = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[0], resultv); + iv += fourv; + for(int i=4; i < 65536; i += 4) { + resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } +#else + for (int i = 0; i < 65536; ++i) { + gammacurve[i] = Color::gammanf(static_cast(i) / divisor, gamma) * factor; + } +#endif +} void Color::Lab2XYZ(float L, float a, float b, float &x, float &y, float &z) { diff --git a/rtengine/color.h b/rtengine/color.h index 1ae721e5a..c7699e9c5 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -1104,6 +1104,15 @@ public: { return (x <= start ? x*slope : exp(log(x) / gamma) * mul - add); } + + static inline float gammaf (float x, float gamma, float start, float slope) + { + return x <= start ? x*slope : xexpf(xlogf(x) / gamma); + } + + //fills a LUT of size 65536 using gamma with slope... + static void gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor); + static inline double igamma (double x, double gamma, double start, double slope, double mul, double add) { return (x <= start * slope ? x / slope : exp(log((x + add) / mul) * gamma) ); @@ -1118,7 +1127,7 @@ public: */ static inline double gamman (double x, double gamma) //standard gamma without slope... { - return (x = exp(log(x) / gamma)); + return exp(log(x) / gamma); } /** @@ -1129,9 +1138,10 @@ public: */ static inline float gammanf (float x, float gamma) //standard gamma without slope... { - return (x = xexpf(xlogf(x) / gamma)); + return xexpf(xlogf(x) / gamma); } - + //fills a LUT of size 65536 using gamma without slope... + static void gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor); /** * @brief Very simply inverse gamma @@ -1141,7 +1151,7 @@ public: */ static inline double igamman (double x, double gamma) //standard inverse gamma without slope... { - return (x = exp(log(x) * gamma) ); + return exp(log(x) * gamma); } diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc index 93aca0f6d..6bf3f2273 100644 --- a/rtengine/dcrop.cc +++ b/rtengine/dcrop.cc @@ -380,7 +380,7 @@ void Crop::update (int todo) } } - if(skip == 1 && params.dirpyrDenoise.enabled && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT") || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) { + if(skip == 1 && params.dirpyrDenoise.enabled && !parent->denoiseInfoStore.valid && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT") || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) { MyTime t1aue, t2aue; t1aue.set(); @@ -587,7 +587,7 @@ void Crop::update (int todo) params.dirpyrDenoise.chroma = chM / (autoNR * multip * adjustr); params.dirpyrDenoise.redchro = maxr; params.dirpyrDenoise.bluechro = maxb; - + parent->denoiseInfoStore.valid = true; if(parent->adnListener) { parent->adnListener->chromaChanged(params.dirpyrDenoise.chroma, params.dirpyrDenoise.redchro, params.dirpyrDenoise.bluechro); } diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc index a6354bd56..8925b29e0 100644 --- a/rtengine/improccoordinator.cc +++ b/rtengine/improccoordinator.cc @@ -320,6 +320,7 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall) ipf.setScale (scale); imgsrc->getImage (currWB, tr, orig_prev, pp, params.toneCurve, params.icm, params.raw); + denoiseInfoStore.valid = false; //ColorTemp::CAT02 (orig_prev, ¶ms) ; // printf("orig_prevW=%d\n scale=%d",orig_prev->width, scale); /* Issue 2785, disabled some 1:1 tools diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h index 5cc03cb72..a1124ff01 100644 --- a/rtengine/improccoordinator.h +++ b/rtengine/improccoordinator.h @@ -325,6 +325,14 @@ public: { return imgsrc; } + + class denoiseinfostore { + public: + bool valid; + + denoiseinfostore() : valid(false) {}; + } denoiseInfoStore; + }; } #endif diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index a9f49f143..1b0c9a0ae 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -1275,6 +1275,25 @@ static INLINE vfloat xlogf0(vfloat d) { return x; } +static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table + vfloat x, x2, t, m; + vint2 e; + + e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); + m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); + + x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); + x2 = vmulf(x, x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f)); + t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f)); + t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f)); + t = vmlaf(t, x2, vcast_vf_f(2.0f)); + + return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); + +} static INLINE vfloat xexpf(vfloat d) { vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); @@ -1299,6 +1318,24 @@ static INLINE vfloat xexpf(vfloat d) { return u; } +static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table + vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); + vfloat s, u; + + s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d); + s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f)); + u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f)); + u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f)); + u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f)); + + u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s)); + + return vldexpf(u, q); +} + static INLINE vfloat xcbrtf(vfloat d) { vfloat x, y, q = vcast_vf_f(1.0), t; vint2 e, qu, re; From ef83e6b591a211afe8d4471c6933b3953ae2fd4e Mon Sep 17 00:00:00 2001 From: heckflosse Date: Wed, 7 Sep 2016 23:34:06 +0200 Subject: [PATCH 2/5] Removed StopWatches and corrected some bugs from latest commit --- rtengine/FTblockDN.cc | 19 ------------------- rtengine/dcrop.cc | 26 ++++++++++---------------- rtengine/improccoordinator.h | 10 ++++++---- 3 files changed, 16 insertions(+), 39 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 4ccc63c3a..8229417e4 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -37,8 +37,6 @@ #include "opthelper.h" #include "cplx_wavelet_dec.h" #include "median.h" -#define BENCHMARK -#include "StopWatch.h" #ifdef _OPENMP #include #endif @@ -427,7 +425,6 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH}; SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi) { -BENCHFUN //#ifdef _DEBUG MyTime t1e, t2e; t1e.set(); @@ -594,15 +591,11 @@ BENCHFUN LUTf gamcurve(65536, LUT_CLIP_BELOW); float gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; - MyTime t1e, t2e; - t1e.set(); if (denoiseMethodRgb) { Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } - t2e.set(); - printf("gamcurve performed in %d usec:\n", t2e.etime(t1e)); // inverse gamma transform for output data float igam = 1.f / gam; @@ -611,16 +604,11 @@ BENCHFUN LUTf igamcurve(65536, LUT_CLIP_BELOW); - MyTime t11e, t21e; - t11e.set(); - if (denoiseMethodRgb) { Color::gammaf2lut(igamcurve, igam, igamthresh, igamslope, 32768.f, 65535.f); } else { Color::gammanf2lut(igamcurve, igam, 32768.f, 65535.f); } - t21e.set(); - printf("igamcurve performed in %d usec:\n", t21e.etime(t11e)); const float gain = pow (2.0f, float(expcomp)); float noisevar_Ldetail = SQR(static_cast(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f); @@ -821,7 +809,6 @@ BENCHFUN {static_cast(wprof[2][0]), static_cast(wprof[2][1]), static_cast(wprof[2][2])} }; - // begin tile processing of image #ifdef _OPENMP #pragma omp parallel num_threads(numthreads) if (numthreads>1) @@ -3000,18 +2987,12 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::Dir bool denoiseMethodRgb = (dnparams.dmethod == "RGB"); - MyTime t1e, t2e; - t1e.set(); - if (denoiseMethodRgb) { gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } - t2e.set(); - printf("gamcurve in RGB_denoise_infoGamCurve performed in %d usec:\n", t2e.etime(t1e)); - } void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc) diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc index 6bf3f2273..1e5aba77c 100644 --- a/rtengine/dcrop.cc +++ b/rtengine/dcrop.cc @@ -191,9 +191,6 @@ void Crop::update (int todo) parent->ipf.Tile_calc (tilesize, overlap, kall, widIm, heiIm, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip); kall = 0; - float *ch_M = new float [9];//allocate memory - float *max_r = new float [9]; - float *max_b = new float [9]; float *min_b = new float [9]; float *min_r = new float [9]; float *lumL = new float [9]; @@ -462,9 +459,9 @@ void Crop::update (int todo) //printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema); Nb[hcr * 3 + wcr] = nb; - ch_M[hcr * 3 + wcr] = pondcorrec * chaut; - max_r[hcr * 3 + wcr] = pondcorrec * maxredaut; - max_b[hcr * 3 + wcr] = pondcorrec * maxblueaut; + parent->denoiseInfoStore.ch_M[hcr * 3 + wcr] = pondcorrec * chaut; + parent->denoiseInfoStore.max_r[hcr * 3 + wcr] = pondcorrec * maxredaut; + parent->denoiseInfoStore.max_b[hcr * 3 + wcr] = pondcorrec * maxblueaut; min_r[hcr * 3 + wcr] = pondcorrec * minredaut; min_b[hcr * 3 + wcr] = pondcorrec * minblueaut; lumL[hcr * 3 + wcr] = lumema; @@ -524,20 +521,20 @@ void Crop::update (int todo) int lissage = settings->leveldnliss; for (int k = 0; k < 9; k++) { - float maxmax = max(max_r[k], max_b[k]); - parent->ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax, lumL[k], chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]); + float maxmax = max(parent->denoiseInfoStore.max_r[k], parent->denoiseInfoStore.max_b[k]); + parent->ipf.calcautodn_info (parent->denoiseInfoStore.ch_M[k], delta[k], Nb[k], levaut, maxmax, lumL[k], chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]); // printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]); } for (int k = 0; k < 9; k++) { - if(max_r[k] > max_b[k]) { + if(parent->denoiseInfoStore.max_r[k] > parent->denoiseInfoStore.max_b[k]) { Max_R[k] = (delta[k]) / ((autoNRmax * multip * adjustr * lowdenoise) / 2.f); - Min_B[k] = -(ch_M[k] - min_b[k]) / (autoNRmax * multip * adjustr * lowdenoise); + Min_B[k] = -(parent->denoiseInfoStore.ch_M[k] - min_b[k]) / (autoNRmax * multip * adjustr * lowdenoise); Max_B[k] = 0.f; Min_R[k] = 0.f; } else { Max_B[k] = (delta[k]) / ((autoNRmax * multip * adjustr * lowdenoise) / 2.f); - Min_R[k] = - (ch_M[k] - min_r[k]) / (autoNRmax * multip * adjustr * lowdenoise); + Min_R[k] = - (parent->denoiseInfoStore.ch_M[k] - min_r[k]) / (autoNRmax * multip * adjustr * lowdenoise); Min_B[k] = 0.f; Max_R[k] = 0.f; } @@ -545,7 +542,7 @@ void Crop::update (int todo) for (int k = 0; k < 9; k++) { // printf("ch_M= %f Max_R=%f Max_B=%f min_r=%f min_b=%f\n",ch_M[k],Max_R[k], Max_B[k],Min_R[k], Min_B[k]); - chM += ch_M[k]; + chM += parent->denoiseInfoStore.ch_M[k]; MaxBMoy += Max_B[k]; MaxRMoy += Max_R[k]; MinRMoy += Min_R[k]; @@ -644,7 +641,7 @@ void Crop::update (int todo) int kall = 0; float chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi; - parent->ipf.RGB_denoise(kall, origCrop, origCrop, calclum, ch_M, max_r, max_b, parent->imgsrc->isRAW(), /*Roffset,*/ denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi); + parent->ipf.RGB_denoise(kall, origCrop, origCrop, calclum, parent->denoiseInfoStore.ch_M, parent->denoiseInfoStore.max_r, parent->denoiseInfoStore.max_b, parent->imgsrc->isRAW(), /*Roffset,*/ denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi); if (parent->adnListener) { parent->adnListener->noiseChanged(nresi, highresi); @@ -665,9 +662,6 @@ void Crop::update (int todo) parent->imgsrc->convertColorSpace(origCrop, params.icm, parent->currWB); - delete [] ch_M; - delete [] max_r; - delete [] max_b; delete [] min_r; delete [] min_b; delete [] lumL; diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h index a1124ff01..776484791 100644 --- a/rtengine/improccoordinator.h +++ b/rtengine/improccoordinator.h @@ -326,11 +326,13 @@ public: return imgsrc; } - class denoiseinfostore { - public: - bool valid; + struct { + float chM; + float max_r[9]; + float max_b[9]; + float ch_M[9]; + bool valid = false; - denoiseinfostore() : valid(false) {}; } denoiseInfoStore; }; From 5bb20c413e889e397d005357c45003822989f606 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Fri, 9 Sep 2016 16:04:43 +0200 Subject: [PATCH 3/5] Cleaned code, also reduced base memory usage of RT by 1 MB --- rtengine/FTblockDN.cc | 66 +++----------------- rtengine/color.cc | 138 +++++++++++++++++++++++++----------------- rtengine/color.h | 11 ++-- 3 files changed, 95 insertions(+), 120 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 8229417e4..8ff22864c 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -613,36 +613,6 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef const float gain = pow (2.0f, float(expcomp)); float noisevar_Ldetail = SQR(static_cast(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f); - if (settings->verbose) { - printf("Denoise Lab=%i\n", settings->denoiselabgamma); - } - - // To avoid branches in loops we access the gammatabs by pointers - // modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10 - // we can put other as gamma g=2.6 slope=11, etc. - // but noting to do with real gamma !!!: it's only for data Lab # data RGB - // finally I opted fot gamma55 and with options we can change - - LUTf *denoisegamtab; - LUTf *denoiseigamtab; - - switch(settings->denoiselabgamma) { - case 0: - denoisegamtab = &(Color::gammatab_26_11); - denoiseigamtab = &(Color::igammatab_26_11); - break; - - case 1: - denoisegamtab = &(Color::gammatab_4); - denoiseigamtab = &(Color::igammatab_4); - break; - - default: - denoisegamtab = &(Color::gammatab_55); - denoiseigamtab = &(Color::igammatab_55); - break; - } - array2D tilemask_in(TS, TS); array2D tilemask_out(TS, TS); @@ -907,9 +877,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float G_ = gain * src->g(i, j); float B_ = gain * src->b(i, j); - R_ = (*denoiseigamtab)[R_]; - G_ = (*denoiseigamtab)[G_]; - B_ = (*denoiseigamtab)[B_]; + R_ = Color::denoiseIGammaTab[R_]; + G_ = Color::denoiseIGammaTab[G_]; + B_ = Color::denoiseIGammaTab[B_]; //apply gamma noise standard (slider) R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); @@ -1602,9 +1572,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef b_ = b_ < 32768.f ? igamcurve[b_] : (Color::gammanf(b_ / 32768.f, igam) * 65535.f); //readapt arbitrary gamma (inverse from beginning) - r_ = (*denoisegamtab)[r_]; - g_ = (*denoisegamtab)[g_]; - b_ = (*denoisegamtab)[b_]; + r_ = Color::denoiseGammaTab[r_]; + g_ = Color::denoiseGammaTab[g_]; + b_ = Color::denoiseGammaTab[b_]; if (numtiles == 1) { dsttmp->r(i, j) = newGain * r_; @@ -3242,24 +3212,6 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat int nb = 0; int comptlevel = 0; - // To avoid branches in loops we access the gammatabs by pointers - LUTf *denoiseigamtab; - - switch(settings->denoiselabgamma) { - case 0: - denoiseigamtab = &(Color::igammatab_26_11); - break; - - case 1: - denoiseigamtab = &(Color::igammatab_4); - break; - - default: - denoiseigamtab = &(Color::igammatab_55); - break; - } - - for (int tiletop = 0; tiletop < imheight; tiletop += tileHskip) { for (int tileleft = 0; tileleft < imwidth; tileleft += tileWskip) { @@ -3405,9 +3357,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat float G_ = gain * src->g(i, j); float B_ = gain * src->b(i, j); - R_ = (*denoiseigamtab)[R_]; - G_ = (*denoiseigamtab)[G_]; - B_ = (*denoiseigamtab)[B_]; + R_ = Color::denoiseIGammaTab[R_]; + G_ = Color::denoiseIGammaTab[G_]; + B_ = Color::denoiseIGammaTab[B_]; //apply gamma noise standard (slider) R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); diff --git a/rtengine/color.cc b/rtengine/color.cc index fa4a79840..e19cfb616 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -41,15 +41,10 @@ LUTf Color::igammatab_srgb; LUTf Color::igammatab_srgb1; LUTf Color::gammatab_srgb; LUTf Color::gammatab_srgb1; -// LUTf Color::igammatab_709; -// LUTf Color::gammatab_709; -LUTf Color::igammatab_55; -LUTf Color::gammatab_55; -LUTf Color::igammatab_4; -LUTf Color::gammatab_4; -LUTf Color::igammatab_26_11; -LUTf Color::gammatab_26_11; +LUTf Color::denoiseGammaTab; +LUTf Color::denoiseIGammaTab; + LUTf Color::igammatab_24_17; LUTf Color::gammatab_24_17a; LUTf Color::gammatab_13_2; @@ -148,13 +143,10 @@ void Color::init () igammatab_srgb1(maxindex, 0); gammatab_srgb(maxindex, 0); gammatab_srgb1(maxindex, 0); - igammatab_55(maxindex, 0); - gammatab_55(maxindex, 0); - igammatab_4(maxindex, 0); - gammatab_4(maxindex, 0); - igammatab_26_11(maxindex, 0); - gammatab_26_11(maxindex, 0); + denoiseGammaTab(maxindex, 0); + denoiseIGammaTab(maxindex, 0); + igammatab_24_17(maxindex, 0); gammatab_24_17a(maxindex, LUT_CLIP_ABOVE | LUT_CLIP_BELOW); gammatab_13_2(maxindex, 0); @@ -193,6 +185,7 @@ void Color::init () { gammatab_srgb[i] = gammatab_srgb1[i] = gamma2(i / 65535.0); } + gammatab_srgb *= 65535.f; gamma2curve.share(gammatab_srgb, LUT_CLIP_BELOW | LUT_CLIP_ABOVE); // shares the buffer with gammatab_srgb but has different clip flags } @@ -200,9 +193,11 @@ void Color::init () #pragma omp section #endif { - for (int i = 0; i < maxindex; i++) { + for (int i = 0; i < maxindex; i++) + { igammatab_srgb[i] = igammatab_srgb1[i] = igamma2 (i / 65535.0); } + igammatab_srgb *= 65535.f; } #ifdef _OPENMP @@ -211,42 +206,74 @@ void Color::init () { double rsRGBGamma = 1.0 / sRGBGamma; - for (int i = 0; i < maxindex; i++) { + for (int i = 0; i < maxindex; i++) + { double val = pow (i / 65535.0, rsRGBGamma); gammatab[i] = 65535.0 * val; gammatabThumb[i] = (unsigned char)(255.0 * val); } } + #ifdef _OPENMP #pragma omp section #endif + // modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10 + // we can put other as gamma g=2.6 slope=11, etc. + // but noting to do with real gamma !!!: it's only for data Lab # data RGB + // finally I opted for gamma55 and with options we can change - for (int i = 0; i < maxindex; i++) { - gammatab_55[i] = 65535.0 * gamma55 (i / 65535.0); + switch(settings->denoiselabgamma) { + case 0: + for (int i = 0; i < maxindex; i++) { + denoiseGammaTab[i] = 65535.0 * gamma26_11 (i / 65535.0); + } + + break; + + case 1: + for (int i = 0; i < maxindex; i++) { + denoiseGammaTab[i] = 65535.0 * gamma4 (i / 65535.0); + } + + break; + + default: + for (int i = 0; i < maxindex; i++) { + denoiseGammaTab[i] = 65535.0 * gamma55 (i / 65535.0); + } + + break; } #ifdef _OPENMP #pragma omp section #endif + // modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10 + // we can put other as gamma g=2.6 slope=11, etc. + // but noting to do with real gamma !!!: it's only for data Lab # data RGB + // finally I opted for gamma55 and with options we can change - for (int i = 0; i < maxindex; i++) { - igammatab_55[i] = 65535.0 * igamma55 (i / 65535.0); - } + switch(settings->denoiselabgamma) { + case 0: + for (int i = 0; i < maxindex; i++) { + denoiseIGammaTab[i] = 65535.0 * igamma26_11 (i / 65535.0); + } -#ifdef _OPENMP - #pragma omp section -#endif + break; - for (int i = 0; i < maxindex; i++) { - gammatab_4[i] = 65535.0 * gamma4 (i / 65535.0); - } + case 1: + for (int i = 0; i < maxindex; i++) { + denoiseIGammaTab[i] = 65535.0 * igamma4 (i / 65535.0); + } -#ifdef _OPENMP - #pragma omp section -#endif + break; - for (int i = 0; i < maxindex; i++) { - igammatab_4[i] = 65535.0 * igamma4 (i / 65535.0); + default: + for (int i = 0; i < maxindex; i++) { + denoiseIGammaTab[i] = 65535.0 * igamma55 (i / 65535.0); + } + + break; } #ifdef _OPENMP @@ -297,22 +324,6 @@ void Color::init () igammatab_145_3[i] = 65535.0 * igamma145_3 (i / 65535.0); } -#ifdef _OPENMP - #pragma omp section -#endif - - for (int i = 0; i < maxindex; i++) { - gammatab_26_11[i] = 65535.0 * gamma26_11 (i / 65535.0); - } - -#ifdef _OPENMP - #pragma omp section -#endif - - for (int i = 0; i < maxindex; i++) { - igammatab_26_11[i] = 65535.0 * igamma26_11 (i / 65535.0); - } - #ifdef _OPENMP #pragma omp section #endif @@ -1482,7 +1493,7 @@ void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, { #ifdef __SSE2__ // SSE2 version is more than 6 times faster than scalar version - vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); vfloat fourv = F2V(4.f); vfloat gammav = F2V(1.f / gamma); vfloat slopev = F2V((slope / divisor) * factor); @@ -1493,26 +1504,32 @@ void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, int border1 = border - (border & 3); int border2 = border1 + 4; int i = 0; + for(; i < border1; i += 4) { vfloat resultv = iv * slopev; - _mm_storeu_ps(&gammacurve[i], resultv); + STVFU(gammacurve[i], resultv); iv += fourv; } + for(; i < border2; i += 4) { vfloat result0v = iv * slopev; vfloat result1v = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; - _mm_storeu_ps(&gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v)); + STVFU(gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v)); iv += fourv; } + for(; i < 65536; i += 4) { vfloat resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; - _mm_storeu_ps(&gammacurve[i], resultv); + STVFU(gammacurve[i], resultv); iv += fourv; } + #else + for (int i = 0; i < 65536; ++i) { gammacurve[i] = gammaf(static_cast(i) / divisor, gamma, start, slope) * factor; } + #endif } @@ -1520,23 +1537,30 @@ void Color::gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float fac { #ifdef __SSE2__ // SSE2 version is more than 6 times faster than scalar version - vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); vfloat fourv = F2V(4.f); vfloat gammav = F2V(1.f / gamma); vfloat divisorv = F2V(xlogf(divisor)); vfloat factorv = F2V(factor); + + // first input value is zero => we have to use the xlogf function which checks this vfloat resultv = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; - _mm_storeu_ps(&gammacurve[0], resultv); + STVFU(gammacurve[0], resultv); iv += fourv; - for(int i=4; i < 65536; i += 4) { + + // inside the loop we can use xlogfNoCheck and xexpfNoCheck because we know about the input values + for(int i = 4; i < 65536; i += 4) { resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; - _mm_storeu_ps(&gammacurve[i], resultv); + STVFU(gammacurve[i], resultv); iv += fourv; } + #else + for (int i = 0; i < 65536; ++i) { gammacurve[i] = Color::gammanf(static_cast(i) / divisor, gamma) * factor; } + #endif } @@ -2204,6 +2228,7 @@ void Color::gamutLchonly (float HH, float2 sincosval, float &Lprov1, float &Chpr neg = false, more_rgb = false; #endif float ChprovSave = Chprov1; + do { inGamut = true; @@ -2226,6 +2251,7 @@ void Color::gamutLchonly (float HH, float2 sincosval, float &Lprov1, float &Chpr #ifdef _DEBUG neg = true; #endif + if (isnan(HH)) { float atemp = ChprovSave * sincosval.y * 327.68; float btemp = ChprovSave * sincosval.x * 327.68; diff --git a/rtengine/color.h b/rtengine/color.h index c7699e9c5..0f6b24797 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -132,13 +132,10 @@ public: static LUTf igammatab_srgb1; static LUTf gammatab_srgb; static LUTf gammatab_srgb1; - static LUTf igammatab_55; - static LUTf gammatab_55; - static LUTf igammatab_4; - static LUTf gammatab_4; - static LUTf igammatab_26_11; - static LUTf gammatab_26_11; + static LUTf denoiseGammaTab; + static LUTf denoiseIGammaTab; + static LUTf igammatab_24_17; static LUTf gammatab_24_17a; static LUTf gammatab_13_2; @@ -1107,7 +1104,7 @@ public: static inline float gammaf (float x, float gamma, float start, float slope) { - return x <= start ? x*slope : xexpf(xlogf(x) / gamma); + return x <= start ? x * slope : xexpf(xlogf(x) / gamma); } //fills a LUT of size 65536 using gamma with slope... From d564d17425e2677bf1f24b77088cb7e61957fc09 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Fri, 9 Sep 2016 16:36:18 +0200 Subject: [PATCH 4/5] mentioned the author of the sleef library in the header of the sleef source files, removed unused sleef.h --- rtengine/helperavx.h | 6 +++++ rtengine/helpersse2.h | 9 ++++++++ rtengine/sleef.c | 9 ++++++++ rtengine/sleef.h | 51 ------------------------------------------ rtengine/sleefsseavx.c | 10 +++++++++ 5 files changed, 34 insertions(+), 51 deletions(-) delete mode 100644 rtengine/sleef.h diff --git a/rtengine/helperavx.h b/rtengine/helperavx.h index eb32277c3..528760a92 100644 --- a/rtengine/helperavx.h +++ b/rtengine/helperavx.h @@ -1,3 +1,9 @@ +//////////////////////////////////////////////////////////////// +// +// this code was taken from http://shibatch.sourceforge.net/ +// Many thanks to the author: Naoki Shibata +// +//////////////////////////////////////////////////////////////// #ifndef __AVX__ #error Please specify -mavx. #endif diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h index 0f1fc5759..23dd016fa 100644 --- a/rtengine/helpersse2.h +++ b/rtengine/helpersse2.h @@ -1,3 +1,12 @@ +//////////////////////////////////////////////////////////////// +// +// this code was taken from http://shibatch.sourceforge.net/ +// Many thanks to the author of original version: Naoki Shibata +// +// This version contains modifications made by Ingo Weyrich +// +//////////////////////////////////////////////////////////////// + #ifndef __SSE2__ #error Please specify -msse2. #endif diff --git a/rtengine/sleef.c b/rtengine/sleef.c index 2377aea79..bc38a3cfb 100644 --- a/rtengine/sleef.c +++ b/rtengine/sleef.c @@ -1,3 +1,12 @@ +//////////////////////////////////////////////////////////////// +// +// this code was taken from http://shibatch.sourceforge.net/ +// Many thanks to the author of original version: Naoki Shibata +// +// This version contains modifications made by Ingo Weyrich +// +//////////////////////////////////////////////////////////////// + #ifndef _SLEEFC_ #define _SLEEFC_ diff --git a/rtengine/sleef.h b/rtengine/sleef.h deleted file mode 100644 index 101a4faff..000000000 --- a/rtengine/sleef.h +++ /dev/null @@ -1,51 +0,0 @@ -typedef struct { - double x, y; -} double2; - -typedef struct { - float x, y; -} float2; - -double xsin(double d); -double xcos(double d); -double2 xsincos(double d); -double xtan(double d); -double xasin(double s); -double xacos(double s); -double xatan(double s); -double xatan2(double y, double x); -double xlog(double d); -double xexp(double d); -double xpow(double x, double y); - -double xsinh(double x); -double xcosh(double x); -double xtanh(double x); -double xasinh(double x); -double xacosh(double x); -double xatanh(double x); -double xldexp(double x, int q); -int xilogb(double d); - -double xfma(double x, double y, double z); -double xsqrt(double d); -double xcbrt(double d); - -double xexp2(double a); -double xexp10(double a); -double xexpm1(double a); -double xlog10(double a); -double xlog1p(double a); - -float xsinf(float d); -float xcosf(float d); -float2 xsincosf(float d); -float xtanf(float d); -float xasinf(float s); -float xacosf(float s); -float xatanf(float s); -float xatan2f(float y, float x); -float xlogf(float d); -float xexpf(float d); -float xpowf(float x, float y); -float xcbrtf(float d); diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index 1b0c9a0ae..a55fcf897 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -1,3 +1,13 @@ +//////////////////////////////////////////////////////////////// +// +// this code was taken from http://shibatch.sourceforge.net/ +// Many thanks to the author of original version: Naoki Shibata +// +// This version contains modifications made by Ingo Weyrich +// +//////////////////////////////////////////////////////////////// + + #ifndef SLEEFSSEAVX #define SLEEFSSEAVX From 3ffe983140ecb59966a3b929f8fbb196d452c2a1 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Fri, 9 Sep 2016 21:54:51 +0200 Subject: [PATCH 5/5] Small code cleanups --- rtengine/FTblockDN.cc | 4 ++-- rtengine/improccoordinator.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 8ff22864c..6c608cfc9 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -2106,7 +2106,7 @@ float ImProcFunctions::MadMax(float * DataList, int & max, int datalen) float ImProcFunctions::Mad(float * DataList, const int datalen) { - if(datalen <= 0) { // Avoid possible buffer underrun + if(datalen <= 1) { // Avoid possible buffer underrun return 0; } @@ -2135,7 +2135,7 @@ float ImProcFunctions::Mad(float * DataList, const int datalen) float ImProcFunctions::MadRgb(float * DataList, const int datalen) { - if(datalen <= 0) { // Avoid possible buffer underrun + if(datalen <= 1) { // Avoid possible buffer underrun return 0; } diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h index 776484791..0fb0041f3 100644 --- a/rtengine/improccoordinator.h +++ b/rtengine/improccoordinator.h @@ -326,12 +326,13 @@ public: return imgsrc; } - struct { + struct DenoiseInfoStore { + DenoiseInfoStore () : valid(false) {} float chM; float max_r[9]; float max_b[9]; float ch_M[9]; - bool valid = false; + bool valid; } denoiseInfoStore;