Row-wise Lab conversions for the Lab-mode tile loops in rtengine/FTblockDN.cc

FTblockDN.cc: the per-pixel rgbxyz + XYZ2Lab calls are replaced by one
Color::RGB2Lab call per tile row, and the per-pixel Lab2XYZ + xyz2rgb calls by
one call per row to the new Color::Lab2RGBLimit. Both calls pass the labdn row
buffers as input and as output. wpfast is wprof with its first row divided by
Color::D50x and its third row divided by Color::D50z.
color.cc / color.h: add Color::Lab2RGBLimit; RGB2Lab combines its two range
masks with vorm before a single _mm_movemask_ps.

Review notes:
 * BENCHMARK is left commented out. NOTE(review): presumably StopWatch.h turns
   BENCHFUN into a timing printout when BENCHMARK is defined; confirm, and
   define it locally when measuring.
 * Line breaks and indentation were rebuilt from the hunk headers. The hunk
   line counts match, but context-line whitespace is approximate: apply with
   `git apply --ignore-whitespace`.
 * The post-image blob ids on the index lines predate these review changes.

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 4e62e1c1f..62483c3d7 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -42,6 +42,7 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+//#define BENCHMARK
 #include "StopWatch.h"
 
 #define TS 64 // Tile size
@@ -839,6 +840,12 @@ BENCHFUN
                 {static_cast<float>(wprof[2][0]), static_cast<float>(wprof[2][1]), static_cast<float>(wprof[2][2])}
             };
 
+            const float wpfast[3][3] = {
+                {static_cast<float>(wprof[0][0]) / Color::D50x, static_cast<float>(wprof[0][1]) / Color::D50x, static_cast<float>(wprof[0][2]) / Color::D50x},
+                {static_cast<float>(wprof[1][0]), static_cast<float>(wprof[1][1]), static_cast<float>(wprof[1][2])},
+                {static_cast<float>(wprof[2][0]) / Color::D50z, static_cast<float>(wprof[2][1]) / Color::D50z, static_cast<float>(wprof[2][2]) / Color::D50z}
+            };
+
             // begin tile processing of image
 #ifdef _OPENMP
             #pragma omp parallel num_threads(numthreads) if (numthreads>1)
@@ -925,51 +932,38 @@ BENCHFUN
                     if (!denoiseMethodRgb) { //lab mode
                         //modification Jacques feb 2013 and july 2014
 #ifdef _OPENMP
-                        #pragma omp parallel for num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
+                        #pragma omp parallel for schedule(dynamic,16) num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
 #endif
 
                         for (int i = tiletop; i < tilebottom; ++i) {
-                            int i1 = i - tiletop;
+                            const int i1 = i - tiletop;
 
                             for (int j = tileleft; j < tileright; ++j) {
-                                int j1 = j - tileleft;
-                                float R_ = gain * src->r(i, j);
-                                float G_ = gain * src->g(i, j);
-                                float B_ = gain * src->b(i, j);
+                                const int j1 = j - tileleft;
 
-                                R_ = Color::denoiseIGammaTab[R_];
-                                G_ = Color::denoiseIGammaTab[G_];
-                                B_ = Color::denoiseIGammaTab[B_];
+                                const float R_ = Color::denoiseIGammaTab[gain * src->r(i, j)];
+                                const float G_ = Color::denoiseIGammaTab[gain * src->g(i, j)];
+                                const float B_ = Color::denoiseIGammaTab[gain * src->b(i, j)];
 
                                 //apply gamma noise standard (slider)
-                                R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f);
-                                G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f);
-                                B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f);
-
-                                //true conversion xyz=>Lab
-                                float X, Y, Z;
-                                Color::rgbxyz(R_, G_, B_, X, Y, Z, wp);
-
-                                //convert to Lab
-                                float L, a, b;
-                                Color::XYZ2Lab(X, Y, Z, L, a, b);
-
-                                labdn->L[i1][j1] = L;
-                                labdn->a[i1][j1] = a;
-                                labdn->b[i1][j1] = b;
+                                labdn->L[i1][j1] = R_ < 65535.f ? gamcurve[R_] : Color::gammanf(R_ / 65535.f, gam) * 32768.f;
+                                labdn->a[i1][j1] = G_ < 65535.f ? gamcurve[G_] : Color::gammanf(G_ / 65535.f, gam) * 32768.f;
+                                labdn->b[i1][j1] = B_ < 65535.f ? gamcurve[B_] : Color::gammanf(B_ / 65535.f, gam) * 32768.f;
 
                                 if (((i1 | j1) & 1) == 0) {
                                     if (numTries == 1) {
-                                        noisevarlum[(i1 >> 1)*width2 + (j1 >> 1)] = useNoiseLCurve ? lumcalc[i >> 1][j >> 1] : noisevarL;
-                                        noisevarchrom[(i1 >> 1)*width2 + (j1 >> 1)] = useNoiseCCurve ? maxNoiseVarab * ccalc[i >> 1][j >> 1] : 1.f;
+                                        noisevarlum[(i1 >> 1) * width2 + (j1 >> 1)] = useNoiseLCurve ? lumcalc[i >> 1][j >> 1] : noisevarL;
+                                        noisevarchrom[(i1 >> 1) * width2 + (j1 >> 1)] = useNoiseCCurve ? maxNoiseVarab * ccalc[i >> 1][j >> 1] : 1.f;
                                     } else {
-                                        noisevarlum[(i1 >> 1)*width2 + (j1 >> 1)] = lumcalc[i >> 1][j >> 1];
-                                        noisevarchrom[(i1 >> 1)*width2 + (j1 >> 1)] = ccalc[i >> 1][j >> 1];
+                                        noisevarlum[(i1 >> 1) * width2 + (j1 >> 1)] = lumcalc[i >> 1][j >> 1];
+                                        noisevarchrom[(i1 >> 1) * width2 + (j1 >> 1)] = ccalc[i >> 1][j >> 1];
                                     }
                                 }
 
                                 //end chroma
                             }
+                            //true conversion xyz=>Lab
+                            Color::RGB2Lab(labdn->L[i1], labdn->a[i1], labdn->b[i1], labdn->L[i1], labdn->a[i1], labdn->b[i1], wpfast, width);
                         }
                     } else {//RGB mode
 #ifdef _OPENMP
@@ -1605,27 +1599,13 @@ BENCHFUN
 
                             for (int i = tiletop; i < tilebottom; ++i) {
                                 int i1 = i - tiletop;
-
+                                //true conversion Lab==>xyz
+                                Color::Lab2RGBLimit(labdn->L[i1], labdn->a[i1], labdn->b[i1], labdn->L[i1], labdn->a[i1], labdn->b[i1], wip, 9000000.f, 1.f + qhighFactor * realred, 1.f + qhighFactor * realblue, width);
                                 for (int j = tileleft; j < tileright; ++j) {
                                     int j1 = j - tileleft;
-                                    //modification Jacques feb 2013
-                                    //true conversion Lab==>xyz
-                                    float L = labdn->L[i1][j1];
-                                    float a = labdn->a[i1][j1];
-                                    float b = labdn->b[i1][j1];
-                                    float c_h = SQR(a) + SQR(b);
-
-                                    if (c_h > 9000000.f) {
-                                        a *= 1.f + qhighFactor * realred;
-                                        b *= 1.f + qhighFactor * realblue;
-                                    }
-
-                                    //convert XYZ
-                                    float X, Y, Z;
-                                    Color::Lab2XYZ(L, a, b, X, Y, Z);
-                                    //apply inverse gamma noise
-                                    float r_, g_, b_;
-                                    Color::xyz2rgb(X, Y, Z, r_, g_, b_, wip);
+                                    float r_ = labdn->L[i1][j1];
+                                    float g_ = labdn->a[i1][j1];
+                                    float b_ = labdn->b[i1][j1];
                                     //inverse gamma standard (slider)
                                     r_ = r_ < 32768.f ? igamcurve[r_] : (Color::gammanf(r_ / 32768.f, igam) * 65535.f);
                                     g_ = g_ < 32768.f ? igamcurve[g_] : (Color::gammanf(g_ / 32768.f, igam) * 65535.f);
diff --git a/rtengine/color.cc b/rtengine/color.cc
index 4ace03bc0..3f2a75788 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -1771,10 +1771,10 @@ void Color::RGB2Lab(float *R, float *G, float *B, float *L, float *a, float *b,
 {
 
 #ifdef __SSE2__
-    vfloat minvalfv = F2V(0.f);
-    vfloat maxvalfv = F2V(MAXVALF);
-    vfloat c500v = F2V(500.f);
-    vfloat c200v = F2V(200.f);
+    const vfloat minvalfv = ZEROV;
+    const vfloat maxvalfv = F2V(MAXVALF);
+    const vfloat c500v = F2V(500.f);
+    const vfloat c200v = F2V(200.f);
 #endif
     int i = 0;
 
@@ -1787,9 +1787,7 @@ void Color::RGB2Lab(float *R, float *G, float *B, float *L, float *a, float *b,
         const vfloat yv = F2V(wp[1][0]) * rv + F2V(wp[1][1]) * gv + F2V(wp[1][2]) * bv;
         const vfloat zv = F2V(wp[2][0]) * rv + F2V(wp[2][1]) * gv + F2V(wp[2][2]) * bv;
 
-        vmask maxMask = vmaskf_gt(vmaxf(xv, vmaxf(yv, zv)), maxvalfv);
-        vmask minMask = vmaskf_lt(vminf(xv, vminf(yv, zv)), minvalfv);
-        if (_mm_movemask_ps((vfloat)maxMask) || _mm_movemask_ps((vfloat)minMask)) {
+        if (_mm_movemask_ps((vfloat)vorm(vmaskf_gt(vmaxf(xv, vmaxf(yv, zv)), maxvalfv), vmaskf_lt(vminf(xv, vminf(yv, zv)), minvalfv)))) {
             // take slower code path for all 4 pixels if one of the values is > MAXVALF. Still faster than non SSE2 version
             for(int k = 0; k < 4; ++k) {
                 float x = xv[k];
@@ -1872,6 +1870,51 @@ void Color::RGB2L(float *R, float *G, float *B, float *L, const float wp[3][3],
     }
 }
 
+void Color::Lab2RGBLimit(float *L, float *a, float *b, float *R, float *G, float *B, const float wp[3][3], float limit, float afactor, float bfactor, int width)
+{
+
+    int i = 0;
+
+#ifdef __SSE2__
+    const vfloat wpv[3][3] = {
+        {F2V(wp[0][0]), F2V(wp[0][1]), F2V(wp[0][2])},
+        {F2V(wp[1][0]), F2V(wp[1][1]), F2V(wp[1][2])},
+        {F2V(wp[2][0]), F2V(wp[2][1]), F2V(wp[2][2])}
+    };
+    const vfloat limitv = F2V(limit);
+    const vfloat afactorv = F2V(afactor);
+    const vfloat bfactorv = F2V(bfactor);
+
+    for(;i < width - 3; i+=4) {
+        const vfloat Lv = LVFU(L[i]);
+        vfloat av = LVFU(a[i]);
+        vfloat bv = LVFU(b[i]);
+
+        const vmask mask = vmaskf_gt(SQRV(av) + SQRV(bv), limitv);
+        av = vself(mask, av * afactorv, av);
+        bv = vself(mask, bv * bfactorv, bv);
+        vfloat Xv, Yv, Zv;
+        Lab2XYZ(Lv, av, bv, Xv, Yv, Zv);
+        vfloat Rv, Gv, Bv;
+        xyz2rgb(Xv, Yv, Zv, Rv, Gv, Bv, wpv);
+        STVFU(R[i], Rv);
+        STVFU(G[i], Gv);
+        STVFU(B[i], Bv);
+    }
+#endif
+    for(;i < width; ++i) {
+        float X, Y, Z;
+        float av = a[i];
+        float bv = b[i];
+        if (SQR(av) + SQR(bv) > limit) {
+            av *= afactor;
+            bv *= bfactor;
+        }
+        Lab2XYZ(L[i], av, bv, X, Y, Z);
+        xyz2rgb(X, Y, Z, R[i], G[i], B[i], wp);
+    }
+}
+
 void Color::XYZ2Lab(float X, float Y, float Z, float &L, float &a, float &b)
 {
 
diff --git a/rtengine/color.h b/rtengine/color.h
index 9f8863343..c3d1cc50a 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -617,6 +617,20 @@ public:
     */
     static void XYZ2Lab(float x, float y, float z, float &L, float &a, float &b);
     static void RGB2Lab(float *X, float *Y, float *Z, float *L, float *a, float *b, const float wp[3][3], int width);
+    /**
+    * @brief Convert rows of Lab values to RGB, scaling a and b first where a*a + b*b exceeds a limit
+    *
+    * For every index below width: when a*a + b*b > limit, a is multiplied by afactor and b by bfactor;
+    * the values are then passed through Lab2XYZ and xyz2rgb.
+    * @param L,a,b input rows
+    * @param R,G,B output rows (the caller in FTblockDN.cc passes the input rows here as well)
+    * @param wp matrix handed to xyz2rgb
+    * @param limit threshold compared with a*a + b*b
+    * @param afactor multiplier for a above the threshold
+    * @param bfactor multiplier for b above the threshold
+    * @param width number of elements in each row
+    */
+    static void Lab2RGBLimit(float *L, float *a, float *b, float *R, float *G, float *B, const float wp[3][3], float limit, float afactor, float bfactor, int width);
     static void RGB2L(float *X, float *Y, float *Z, float *L, const float wp[3][3], int width);
 
     /**