Review notes (text before the first "diff --git" line is ignored by git apply):
- This patch was restored from a copy whose line breaks, indentation, blank
  context lines and <float> template arguments had been stripped. Indentation
  and blank-line placement are best-effort: verify against the tree before
  applying, or start with `git apply --ignore-whitespace`.
- rtengine/rawimagesource.cc: the luminance ratio is now clamped
  (YNew >= 0, YOld >= 0.00001) so pow_F is never called with a negative base
  or an almost-zero denominator.
- rtengine/color.h: a contract comment was added to Color::RGB2Y.
- The hunk headers of the two hunks above were updated for the added lines;
  the "index" blob ids therefore no longer describe the post-image.

diff --git a/rtengine/color.cc b/rtengine/color.cc
index 7c12c0ca5..26c2cf7c9 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -1835,21 +1835,21 @@ void Color::RGB2L(float *R, float *G, float *B, float *L, const float wp[3][3],
 {
 
 #ifdef __SSE2__
-    vfloat minvalfv = F2V(0.f);
-    vfloat maxvalfv = F2V(MAXVALF);
+    const vfloat maxvalfv = F2V(MAXVALF);
+    const vfloat rmv = F2V(wp[1][0]);
+    const vfloat gmv = F2V(wp[1][1]);
+    const vfloat bmv = F2V(wp[1][2]);
 #endif
     int i = 0;
 
 #ifdef __SSE2__
-    for(;i < width - 3; i+=4) {
+    for(; i < width - 3; i+=4) {
         const vfloat rv = LVFU(R[i]);
         const vfloat gv = LVFU(G[i]);
         const vfloat bv = LVFU(B[i]);
-        const vfloat yv = F2V(wp[1][0]) * rv + F2V(wp[1][1]) * gv + F2V(wp[1][2]) * bv;
+        const vfloat yv = rmv * rv + gmv * gv + bmv * bv;
 
-        vmask maxMask = vmaskf_gt(yv, maxvalfv);
-        vmask minMask = vmaskf_lt(yv, minvalfv);
-        if (_mm_movemask_ps((vfloat)vorm(maxMask, minMask))) {
+        if (_mm_movemask_ps((vfloat)vorm(vmaskf_gt(yv, maxvalfv), vmaskf_lt(yv, ZEROV)))) {
             // take slower code path for all 4 pixels if one of the values is > MAXVALF. Still faster than non SSE2 version
             for(int k = 0; k < 4; ++k) {
                 float y = yv[k];
@@ -1860,7 +1860,7 @@ void Color::RGB2L(float *R, float *G, float *B, float *L, const float wp[3][3],
         }
     }
 #endif
-    for(;i < width; ++i) {
+    for(; i < width; ++i) {
         const float rv = R[i];
         const float gv = G[i];
         const float bv = B[i];
diff --git a/rtengine/color.h b/rtengine/color.h
index 7b198f284..7cc7368b3 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -1804,12 +1804,6 @@ public:
         return (hr);
     }
 
-    static inline void RGB2YCbCr(float R, float G, float B, float &Y, float &Cb, float &Cr) {
-        Y = 0.2627f * R + 0.6780f * G + 0.0593f * B;
-        Cb = (1.f - 0.0593f) * B - (0.2627f * R + 0.6780f * G);
-        Cr = (1.f - 0.2627f) * R - (0.6780f * G + 0.0593f * B);
-    }
-
     static inline void RGB2YCbCr(float* R, float* G, float* B, float* Y, float* Cb, float *Cr, float gamma, int W) {
         gamma = 1.f / gamma;
         int i = 0;
@@ -1839,12 +1833,6 @@ public:
         }
     }
 
-    static inline void YCbCr2RGB(float Y, float Cb, float Cr, float &R, float &G, float &B) {
-        R = std::max(Y + Cr, 0.f);
-        G = std::max(Y - (0.0593f / 0.6780f) * Cb - (0.2627f / 0.6780f) * Cr, 0.f);
-        B = std::max(Y + Cb, 0.f);
-    }
-
     static inline void YCbCr2RGB(float* Y, float* Cb, float* Cr, float* R, float* G, float* B, float gamma, int W) {
         int i = 0;
 #ifdef __SSE2__
@@ -1870,6 +1858,33 @@ public:
             B[i] = std::max(y + cb, 0.f);
         }
     }
+
+    // Writes Y[i] = (0.2627 * R[i] + 0.6780 * G[i] + 0.0593 * B[i]) ^ (1 / gamma) for i in [0, W).
+    // Negative R/G/B inputs are clamped to 0 before weighting; the SSE2 path handles
+    // 4 pixels per iteration and the scalar loop finishes the remaining ones.
+    static inline void RGB2Y(float* R, float* G, float* B, float* Y, float gamma, int W) {
+        gamma = 1.f / gamma;
+        int i = 0;
+#ifdef __SSE2__
+        const vfloat gammav = F2V(gamma);
+        const vfloat c1v = F2V(0.2627f);
+        const vfloat c2v = F2V(0.6780f);
+        const vfloat c3v = F2V(0.0593f);
+        for (; i < W - 3; i += 4) {
+            const vfloat Rv = vmaxf(LVFU(R[i]), ZEROV);
+            const vfloat Gv = vmaxf(LVFU(G[i]), ZEROV);
+            const vfloat Bv = vmaxf(LVFU(B[i]), ZEROV);
+            STVFU(Y[i], pow_F(c1v * Rv + c2v * Gv + c3v * Bv, gammav));
+        }
+#endif
+        for (; i < W; ++i) {
+            const float r = std::max(R[i], 0.f);
+            const float g = std::max(G[i], 0.f);
+            const float b = std::max(B[i], 0.f);
+            Y[i] = pow_F(0.2627f * r + 0.6780f * g + 0.0593f * b, gamma);
+        }
+    }
+
 };
 
 }
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 050b62692..d947d4195 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -4987,17 +4987,22 @@ BENCHFUN
     }
 
     array2D<float> L(W,H);
-    array2D<float>& Y = red; // red will be overridden anyway => we can use its buffer to store Y
-    array2D<float>& Cb = green; // green will be overridden anyway => we can use its buffer to store Cb
-    array2D<float>& Cr = blue; // blue will be overridden anyway => we can use its buffer to store Cr
+    array2D<float> YOld(W,H);
+    array2D<float> YNew(W,H);
+// array2D<float>& Y = red; // red will be overridden anyway => we can use its buffer to store Y
+// array2D<float>& Cb = green; // green will be overridden anyway => we can use its buffer to store Cb
+// array2D<float>& Cr = blue; // blue will be overridden anyway => we can use its buffer to store Cr
 
     StopWatch Stop1("rgb2Y");
 #ifdef _OPENMP
-    #pragma omp parallel for
+    #pragma omp parallel for schedule(dynamic, 16)
 #endif
     for (int i = 0; i < H; ++i) {
         Color::RGB2L(red[i], green[i], blue[i], L[i], xyz_rgb, W);
-        Color::RGB2YCbCr(red[i], green[i], blue[i], Y[i], Cb[i], Cr[i], sharpeningParams.gamma, W);
+        Color::RGB2Y(red[i], green[i], blue[i], YOld[i], sharpeningParams.gamma, W);
+        for (int j = 0; j < W; ++j) {
+            YNew[i][j] = YOld[i][j];
+        }
     }
     // calculate contrast based blend factors to reduce sharpening in regions with low contrast
     JaggedArray<float> blend(W, H);
@@ -5008,13 +5013,20 @@ BENCHFUN
     array2D<float>& tmp = L; // L is not used anymore now => we can use its buffer as the needed temporary buffer
     ProcParams dummy;
     ImProcFunctions ipf(&dummy);
-    ipf.deconvsharpening(Y, tmp, blend, W, H, sharpeningParams, 1.0);
+    ipf.deconvsharpening(YNew, tmp, blend, W, H, sharpeningParams, 1.0);
     StopWatch Stop2("Y2RGB");
+    const float gamma = sharpeningParams.gamma;
 #ifdef _OPENMP
     #pragma omp parallel for
 #endif
     for (int i = 0; i < H; ++i) {
-        Color::YCbCr2RGB(Y[i], Cb[i], Cr[i], red[i], green[i], blue[i], sharpeningParams.gamma, W);
+        for (int j = 0; j < W; ++j) {
+            // clamp both operands: keeps the base passed to pow_F non-negative and the denominator away from zero
+            const float factor = pow_F(std::max(YNew[i][j], 0.f) / std::max(YOld[i][j], 0.00001f), gamma);
+            red[i][j] *= factor;
+            green[i][j] *= factor;
+            blue[i][j] *= factor;
+        }
     }
     Stop2.stop();
 }
diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index 1011ae7b7..97c44dfc1 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -52,7 +52,7 @@ vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) {
 }
 #endif
 
-float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) {
+float tileAverage(const float * const *data, size_t tileY, size_t tileX, size_t tilesize) {
 
     float avg = 0.f;
 #ifdef __SSE2__
@@ -75,7 +75,7 @@ float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) {
     return avg / rtengine::SQR(tilesize);
 }
 
-float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
+float tileVariance(const float * const *data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
 
     float var = 0.f;
 #ifdef __SSE2__
@@ -99,7 +99,7 @@ float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, fl
     return var / (rtengine::SQR(tilesize) * avg);
 }
 
-float calcContrastThreshold(float** luminance, int tileY, int tileX, int tilesize) {
+float calcContrastThreshold(const float* const * luminance, int tileY, int tileX, int tilesize) {
 
     constexpr float scale = 0.0625f / 327.68f;
     std::vector<std::vector<float>> blend(tilesize - 4, std::vector<float>(tilesize - 4));
@@ -299,7 +299,7 @@ void findMinMaxPercentile(const float* data, size_t size, float minPrct, float&
     maxOut = rtengine::LIM(maxOut, minVal, maxVal);
 }
 
-void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) {
+void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) {
 
     if (autoContrast) {
         constexpr float minLuminance = 2000.f;
diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h
index a8e2e3e23..a13f3cb8e 100644
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -24,5 +24,5 @@ namespace rtengine {
 
 void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true);
 
-void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false);
+void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false);
 }
diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index 1151684e7..7a428806f 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -49,7 +49,7 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     pack_start(*hb);
 
     Gtk::VBox* rld = Gtk::manage(new Gtk::VBox());
-    gamma = Gtk::manage(new Adjuster(M("TP_SHARPENING_GAMMA"), 0.5, 3.0, 0.05, 1.35));
+    gamma = Gtk::manage(new Adjuster(M("TP_SHARPENING_GAMMA"), 0.5, 6.0, 0.05, 1.35));
     dradius = Gtk::manage(new Adjuster(M("TP_SHARPENING_EDRADIUS"), 0.4, 2.5, 0.01, 0.75));
     diter = Gtk::manage(new Adjuster(M("TP_SHARPENING_RLD_ITERATIONS"), 5, 100, 1, 30));
     rld->pack_start(*gamma);