diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc index 69e846f3c..4ccc63c3a 100644 --- a/rtengine/FTblockDN.cc +++ b/rtengine/FTblockDN.cc @@ -37,7 +37,8 @@ #include "opthelper.h" #include "cplx_wavelet_dec.h" #include "median.h" - +#define BENCHMARK +#include "StopWatch.h" #ifdef _OPENMP #include #endif @@ -426,6 +427,7 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH}; SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi) { +BENCHFUN //#ifdef _DEBUG MyTime t1e, t2e; t1e.set(); @@ -588,19 +590,19 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef } } - float gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; LUTf gamcurve(65536, LUT_CLIP_BELOW); + float gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; + MyTime t1e, t2e; + t1e.set(); if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamma(static_cast(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f; - } + Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamman(static_cast(i) / 65535.0, gam)) * 32768.0f; - } + Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } + t2e.set(); + printf("gamcurve performed in %d usec:\n", t2e.etime(t1e)); // inverse gamma transform for output data float igam = 1.f / gam; @@ -609,15 +611,16 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef LUTf igamcurve(65536, LUT_CLIP_BELOW); + MyTime t11e, t21e; + t11e.set(); + if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - igamcurve[i] = (Color::gamma(static_cast(i) / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - } + Color::gammaf2lut(igamcurve, igam, igamthresh, igamslope, 32768.f, 65535.f); } else { - for (int i = 0; i < 65536; ++i) { - igamcurve[i] = (Color::gamman(static_cast(i) / 32768.0f, igam) * 65535.0f); - } + Color::gammanf2lut(igamcurve, igam, 32768.f, 65535.f); } + t21e.set(); + printf("igamcurve performed in %d usec:\n", t21e.etime(t11e)); const float gain = pow (2.0f, float(expcomp)); float noisevar_Ldetail = SQR(static_cast(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f); @@ -922,9 +925,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef B_ = (*denoiseigamtab)[B_]; //apply gamma noise standard (slider) - R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.0f); - G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.0f); - B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.0f); + R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); + G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f); + B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f); //true conversion xyz=>Lab float X, Y, Z; @@ -966,9 +969,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float Y = gain * src->g(i, j); float Z = gain * src->b(i, j); //conversion colorspace to determine luminance with no gamma - X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); + X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f); //end chroma labdn->L[i1][j1] = Y; labdn->a[i1][j1] = (X - Y); @@ -1009,9 +1012,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float btmp = Color::igammatab_srgb[ src->b(i, j) ]; //modification Jacques feb 2013 // gamma slider different from raw - rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast(rtmp) / 65535.0, gam) * 32768.0f); - gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast(gtmp) / 65535.0, gam) * 32768.0f); - btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast(btmp) / 65535.0, gam) * 32768.0f); + rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f); + gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f); + btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f); float X, Y, Z; Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp); @@ -1650,9 +1653,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float Z = Y - (labdn->b[i1][j1]); - X = X < 32768.0f ? igamcurve[X] : (Color::gamma(X / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - Y = Y < 32768.0f ? igamcurve[Y] : (Color::gamma(Y / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); - Z = Z < 32768.0f ? igamcurve[Z] : (Color::gamma(Z / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f); + X = X < 32768.f ? igamcurve[X] : (Color::gammaf(X / 32768.f, igam, igamthresh, igamslope) * 65535.f); + Y = Y < 32768.f ? igamcurve[Y] : (Color::gammaf(Y / 32768.f, igam, igamthresh, igamslope) * 65535.f); + Z = Z < 32768.f ? igamcurve[Z] : (Color::gammaf(Z / 32768.f, igam, igamthresh, igamslope) * 65535.f); if (numtiles == 1) { dsttmp->r(i, j) = newGain * X; @@ -1695,9 +1698,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef float r_, g_, b_; Color::xyz2rgb(X, Y, Z, r_, g_, b_, wip); //gamma slider is different from Raw - r_ = r_ < 32768.0f ? igamcurve[r_] : (Color::gamman(r_ / 32768.0f, igam) * 65535.0f); - g_ = g_ < 32768.0f ? igamcurve[g_] : (Color::gamman(g_ / 32768.0f, igam) * 65535.0f); - b_ = b_ < 32768.0f ? igamcurve[b_] : (Color::gamman(b_ / 32768.0f, igam) * 65535.0f); + r_ = r_ < 32768.f ? igamcurve[r_] : (Color::gammanf(r_ / 32768.f, igam) * 65535.f); + g_ = g_ < 32768.f ? igamcurve[g_] : (Color::gammanf(g_ / 32768.f, igam) * 65535.f); + b_ = b_ < 32768.f ? igamcurve[b_] : (Color::gammanf(b_ / 32768.f, igam) * 65535.f); if (numtiles == 1) { dsttmp->r(i, j) = newGain * r_; @@ -2982,7 +2985,7 @@ void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition & } } -void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) +SSEFUNCTION void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope) { gam = dnparams.gamma; gamthresh = 0.001f; @@ -2995,18 +2998,20 @@ void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoisePa } } - gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; bool denoiseMethodRgb = (dnparams.dmethod == "RGB"); + MyTime t1e, t2e; + t1e.set(); + if (denoiseMethodRgb) { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamma(static_cast(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f; - } + gamslope = exp(log(static_cast(gamthresh)) / gam) / gamthresh; + Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f); } else { - for (int i = 0; i < 65536; ++i) { - gamcurve[i] = (Color::gamman(static_cast(i) / 65535.0, gam)) * 32768.0f; - } + Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f); } + t2e.set(); + printf("gamcurve in RGB_denoise_infoGamCurve performed in %d usec:\n", t2e.etime(t1e)); + } void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc) @@ -3424,9 +3429,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat B_ = (*denoiseigamtab)[B_]; //apply gamma noise standard (slider) - R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gamman(static_cast(R_) / 65535.0, gam) * 32768.0f); - G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gamman(static_cast(G_) / 65535.0, gam) * 32768.0f); - B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gamman(static_cast(B_) / 65535.0, gam) * 32768.0f); + R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f); + G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f); + B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f); //true conversion xyz=>Lab float X, Y, Z; Color::rgbxyz(R_, G_, B_, X, Y, Z, wp); @@ -3451,9 +3456,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat float Y = gain * src->g(i, j); float Z = gain * src->b(i, j); - X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); - Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f); + X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f); + Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f); labdn->a[i1][j1] = (X - Y); labdn->b[i1][j1] = (Y - Z); @@ -3480,9 +3485,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat float btmp = Color::igammatab_srgb[ src->b(i, j) ]; //modification Jacques feb 2013 // gamma slider different from raw - rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast(rtmp) / 65535.0, gam) * 32768.0f); - gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast(gtmp) / 65535.0, gam) * 32768.0f); - btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast(btmp) / 65535.0, gam) * 32768.0f); + rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f); + gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f); + btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f); float X, Y, Z; Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp); diff --git a/rtengine/color.cc b/rtengine/color.cc index 872f47522..fa4a79840 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -1478,6 +1478,67 @@ void Color::calcGamma (double pwr, double ts, int mode, int imax, double &gamma0 return; } } +void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor) +{ +#ifdef __SSE2__ + // SSE2 version is more than 6 times faster than scalar version + vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat fourv = F2V(4.f); + vfloat gammav = F2V(1.f / gamma); + vfloat slopev = F2V((slope / divisor) * factor); + vfloat divisorv = F2V(xlogf(divisor)); + vfloat factorv = F2V(factor); + vfloat comparev = F2V(start * divisor); + int border = start * divisor; + int border1 = border - (border & 3); + int border2 = border1 + 4; + int i = 0; + for(; i < border1; i += 4) { + vfloat resultv = iv * slopev; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } + for(; i < border2; i += 4) { + vfloat result0v = iv * slopev; + vfloat result1v = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v)); + iv += fourv; + } + for(; i < 65536; i += 4) { + vfloat resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } +#else + for (int i = 0; i < 65536; ++i) { + gammacurve[i] = gammaf(static_cast(i) / divisor, gamma, start, slope) * factor; + } +#endif +} + +void Color::gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor) //standard gamma without slope... +{ +#ifdef __SSE2__ + // SSE2 version is more than 6 times faster than scalar version + vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f); + vfloat fourv = F2V(4.f); + vfloat gammav = F2V(1.f / gamma); + vfloat divisorv = F2V(xlogf(divisor)); + vfloat factorv = F2V(factor); + vfloat resultv = xexpf((xlogf(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[0], resultv); + iv += fourv; + for(int i=4; i < 65536; i += 4) { + resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv; + _mm_storeu_ps(&gammacurve[i], resultv); + iv += fourv; + } +#else + for (int i = 0; i < 65536; ++i) { + gammacurve[i] = Color::gammanf(static_cast(i) / divisor, gamma) * factor; + } +#endif +} void Color::Lab2XYZ(float L, float a, float b, float &x, float &y, float &z) { diff --git a/rtengine/color.h b/rtengine/color.h index 1ae721e5a..c7699e9c5 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -1104,6 +1104,15 @@ public: { return (x <= start ? x*slope : exp(log(x) / gamma) * mul - add); } + + static inline float gammaf (float x, float gamma, float start, float slope) + { + return x <= start ? x*slope : xexpf(xlogf(x) / gamma); + } + + //fills a LUT of size 65536 using gamma with slope... + static void gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor); + static inline double igamma (double x, double gamma, double start, double slope, double mul, double add) { return (x <= start * slope ? x / slope : exp(log((x + add) / mul) * gamma) ); @@ -1118,7 +1127,7 @@ public: */ static inline double gamman (double x, double gamma) //standard gamma without slope... { - return (x = exp(log(x) / gamma)); + return exp(log(x) / gamma); } /** @@ -1129,9 +1138,10 @@ public: */ static inline float gammanf (float x, float gamma) //standard gamma without slope... { - return (x = xexpf(xlogf(x) / gamma)); + return xexpf(xlogf(x) / gamma); } - + //fills a LUT of size 65536 using gamma without slope... + static void gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor); /** * @brief Very simply inverse gamma @@ -1141,7 +1151,7 @@ public: */ static inline double igamman (double x, double gamma) //standard inverse gamma without slope... { - return (x = exp(log(x) * gamma) ); + return exp(log(x) * gamma); } diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc index 93aca0f6d..6bf3f2273 100644 --- a/rtengine/dcrop.cc +++ b/rtengine/dcrop.cc @@ -380,7 +380,7 @@ void Crop::update (int todo) } } - if(skip == 1 && params.dirpyrDenoise.enabled && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT") || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) { + if(skip == 1 && params.dirpyrDenoise.enabled && !parent->denoiseInfoStore.valid && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT") || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) { MyTime t1aue, t2aue; t1aue.set(); @@ -587,7 +587,7 @@ void Crop::update (int todo) params.dirpyrDenoise.chroma = chM / (autoNR * multip * adjustr); params.dirpyrDenoise.redchro = maxr; params.dirpyrDenoise.bluechro = maxb; - + parent->denoiseInfoStore.valid = true; if(parent->adnListener) { parent->adnListener->chromaChanged(params.dirpyrDenoise.chroma, params.dirpyrDenoise.redchro, params.dirpyrDenoise.bluechro); } diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc index a6354bd56..8925b29e0 100644 --- a/rtengine/improccoordinator.cc +++ b/rtengine/improccoordinator.cc @@ -320,6 +320,7 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall) ipf.setScale (scale); imgsrc->getImage (currWB, tr, orig_prev, pp, params.toneCurve, params.icm, params.raw); + denoiseInfoStore.valid = false; //ColorTemp::CAT02 (orig_prev, ¶ms) ; // printf("orig_prevW=%d\n scale=%d",orig_prev->width, scale); /* Issue 2785, disabled some 1:1 tools diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h index 5cc03cb72..a1124ff01 100644 --- a/rtengine/improccoordinator.h +++ b/rtengine/improccoordinator.h @@ -325,6 +325,14 @@ public: { return imgsrc; } + + class denoiseinfostore { + public: + bool valid; + + denoiseinfostore() : valid(false) {}; + } denoiseInfoStore; + }; } #endif diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c index a9f49f143..1b0c9a0ae 100644 --- a/rtengine/sleefsseavx.c +++ b/rtengine/sleefsseavx.c @@ -1275,6 +1275,25 @@ static INLINE vfloat xlogf0(vfloat d) { return x; } +static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table + vfloat x, x2, t, m; + vint2 e; + + e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f))); + m = vldexpf(d, vsubi2(vcast_vi2_i(0), e)); + + x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m)); + x2 = vmulf(x, x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f)); + t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f)); + t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f)); + t = vmlaf(t, x2, vcast_vf_f(2.0f)); + + return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); + +} static INLINE vfloat xexpf(vfloat d) { vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); @@ -1299,6 +1318,24 @@ static INLINE vfloat xexpf(vfloat d) { return u; } +static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table + vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f))); + vfloat s, u; + + s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d); + s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f)); + u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f)); + u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f)); + u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f)); + + u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s)); + + return vldexpf(u, q); +} + static INLINE vfloat xcbrtf(vfloat d) { vfloat x, y, q = vcast_vf_f(1.0), t; vint2 e, qu, re;