From 5402c3d27cf4123ecb1e5d4affccf0f580c2e3c6 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Fri, 11 Sep 2015 22:19:47 +0200 Subject: [PATCH] Optimizations for Retinex HSL mode --- rtengine/color.cc | 147 +++++++++++++++++++++++++++++ rtengine/color.h | 14 ++- rtengine/ipretinex.cc | 18 +--- rtengine/rawimagesource.cc | 189 +++++++++++++++++++++++-------------- 4 files changed, 282 insertions(+), 86 deletions(-) diff --git a/rtengine/color.cc b/rtengine/color.cc index d671d9baa..ed6e9cae6 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -280,6 +280,70 @@ void Color::rgb2hsl(float r, float g, float b, float &h, float &s, float &l) } } +void Color::rgb2hslfloat(float r, float g, float b, float &h, float &s, float &l) +{ + + float m = min(r, g, b); + float M = max(r, g, b); + float C = M - m; + + l = (M + m) * 7.6295109e-6f; // (0.5f / 65535.f) + + if (fabsf(C) < 0.65535f) { // 0.00001f * 65535.f + h = 0.f; + s = 0.f; + } else { + + if (l <= 0.5f) { + s = (M - m) / (M + m); + } else { + s = (M - m) / (131070.f - M - m); // 131070.f = 2.f * 65535.f + } + + if ( r == M ) { + h = (g - b); + } else if ( g == M ) { + h = (2.f * C) + (b - r); + } else { + h = (4.f * C) + (r - g); + } + + h /= (6.f * C); + + if ( h < 0.f ) { + h += 1.f; + } else if ( h > 1.f ) { + h -= 1.f; + } + } +} + +#ifdef __SSE2__ +void Color::rgb2hsl(vfloat r, vfloat g, vfloat b, vfloat &h, vfloat &s, vfloat &l) +{ + vfloat maxv = _mm_max_ps(r, _mm_max_ps(g, b)); + vfloat minv = _mm_min_ps(r, _mm_min_ps(g, b)); + vfloat C = maxv - minv; + vfloat tempv = maxv + minv; + l = (tempv) * F2V(7.6295109e-6f); + s = (maxv - minv); + s /= vself(vmaskf_gt(l, F2V(0.5f)), F2V(131070.f) - tempv, tempv); + + h = F2V(4.f) * C + r - g; + h = vself(vmaskf_eq(g, maxv), F2V(2.f) * C + b - r, h); + h = vself(vmaskf_eq(r, maxv), g - b, h); + + h /= (F2V(6.f) * C); + vfloat onev = F2V(1.f); + h = vself(vmaskf_lt(h, ZEROV), h + onev, h); + h = vself(vmaskf_gt(h, onev), h - onev, h); + + vmask zeromask = vmaskf_lt(vabsf(C), F2V(0.65535f)); + h = vself(zeromask, ZEROV, h); + s = vself(zeromask, ZEROV, s); +} +#endif + double Color::hue2rgb(double p, double q, double t) { if (t < 0.) { @@ -299,6 +363,42 @@ double Color::hue2rgb(double p, double q, double t) } } +float Color::hue2rgbfloat(float p, float q, float t) +{ + if (t < 0.f) { + t += 6.f; + } else if( t > 6.f) { + t -= 6.f; + } + + if (t < 1.f) { + return p + (q - p) * t; + } else if (t < 3.f) { + return q; + } else if (t < 4.f) { + return p + (q - p) * (4.f - t); + } else { + return p; + } +} + +#ifdef __SSE2__ +vfloat Color::hue2rgb(vfloat p, vfloat q, vfloat t) +{ + vfloat fourv = F2V(4.f); + vfloat threev = F2V(3.f); + vfloat sixv = threev + threev; + t = vself(vmaskf_lt(t, ZEROV), t + sixv, t); + t = vself(vmaskf_gt(t, sixv), t - sixv, t); + + vfloat temp1 = p + (q - p) * t; + vfloat temp2 = p + (q - p) * (fourv - t); + vfloat result = vself(vmaskf_lt(t, fourv), temp2, p); + result = vself(vmaskf_lt(t, threev), q, result); + return vself(vmaskf_lt(t, fourv - threev), temp1, result); +} +#endif + void Color::hsl2rgb (float h, float s, float l, float &r, float &g, float &b) { @@ -324,6 +424,53 @@ void Color::hsl2rgb (float h, float s, float l, float &r, float &g, float &b) } } +void Color::hsl2rgbfloat (float h, float s, float l, float &r, float &g, float &b) +{ + + if (s == 0.f) { + r = g = b = 65535.f * l; // achromatic + } else { + float m2; + + if (l <= 0.5f) { + m2 = l * (1.f + s); + } else { + m2 = l + s - l * s; + } + + float m1 = 2.f * l - m2; + + r = 65535.f * hue2rgbfloat (m1, m2, h * 6.f + 2.f); + g = 65535.f * hue2rgbfloat (m1, m2, h * 6.f); + b = 65535.f * hue2rgbfloat (m1, m2, h * 6.f - 2.f); + } +} + +#ifdef __SSE2__ +void Color::hsl2rgb (vfloat h, vfloat s, vfloat l, vfloat &r, vfloat &g, vfloat &b) +{ + + vfloat m2 = s * l; + m2 = vself(vmaskf_gt(l, F2V(0.5f)), s - m2, m2); + m2 += l; + + vfloat twov = F2V(2.f); + vfloat c65535v = F2V(65535.f); + vfloat m1 = l + l - m2; + + h *= F2V(6.f); + r = c65535v * hue2rgb (m1, m2, h + twov); + g = c65535v * hue2rgb (m1, m2, h); + b = c65535v * hue2rgb (m1, m2, h - twov); + + vmask selectsMask = vmaskf_eq(ZEROV, s); + vfloat lc65535v = c65535v * l; + r = vself(selectsMask, lc65535v, r); + g = vself(selectsMask, lc65535v, g); + b = vself(selectsMask, lc65535v, b); +} +#endif + void Color::hsl2rgb01 (float h, float s, float l, float &r, float &g, float &b) { diff --git a/rtengine/color.h b/rtengine/color.h index 493597a33..70dbcddf5 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -89,7 +89,10 @@ private: // Separated from init() to keep the code clear static void initMunsell (); static double hue2rgb(double p, double q, double t); - + static float hue2rgbfloat(float p, float q, float t); +#ifdef __SSE2__ + static vfloat hue2rgb(vfloat p, vfloat q, vfloat t); +#endif public: typedef enum Channel { @@ -179,7 +182,10 @@ public: * @param l luminance channel [0; 1] (return value) */ static void rgb2hsl (float r, float g, float b, float &h, float &s, float &l); - + static void rgb2hslfloat (float r, float g, float b, float &h, float &s, float &l); +#ifdef __SSE2__ + static void rgb2hsl (vfloat r, vfloat g, vfloat b, vfloat &h, vfloat &s, vfloat &l); +#endif /** * @brief Convert hue/saturation/luminance in red/green/blue @@ -191,6 +197,10 @@ public: * @param b blue channel [0 ; 65535] (return value) */ static void hsl2rgb (float h, float s, float l, float &r, float &g, float &b); + static void hsl2rgbfloat (float h, float s, float l, float &r, float &g, float &b); +#ifdef __SSE2__ + static void hsl2rgb (vfloat h, vfloat s, vfloat l, vfloat &r, vfloat &g, vfloat &b); +#endif /** * @brief Convert hue/saturation/luminance in red/green/blue diff --git a/rtengine/ipretinex.cc b/rtengine/ipretinex.cc index 5a3b87213..c11150e4e 100644 --- a/rtengine/ipretinex.cc +++ b/rtengine/ipretinex.cc @@ -259,7 +259,8 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, int width out[i] = &outBuffer[i * W_L]; } - float pond = 1.0f / (float) scal; + float logBetaGain = xlogf(16384.f); + float pond = logBetaGain / (float) scal; #ifdef _OPENMP #pragma omp parallel @@ -284,6 +285,7 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, int width for (int i = 0; i < H_L; i++) { int j = 0; #ifdef __SSE2__ + if(useHslLin) { for (; j < W_L - 3; j += 4) { _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv * (LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) )); @@ -293,7 +295,9 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, int width _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv * xlogf(LIMV(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) )); } } + #endif + if(useHslLin) { for (; j < W_L; j++) { luminance[i][j] += pond * (LIM(src[i][j] / out[i][j], ilimD, limD)); @@ -312,23 +316,11 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, int width delete [] outBuffer; delete [] srcBuffer; - float logBetaGain = xlogf(16384.f); if (dehatransmissionCurve) { execcur = true; } -//I re-execute luminance[i][j] = logBetaGain * luminance[i][j] because I call 2 times (or one) mean_stdvv -// no difference or very little in time -#ifdef _OPENMP - #pragma omp parallel for -#endif - - for (int i = 0; i < H_L; i++ ) - for (int j = 0; j < W_L; j++) { - luminance[i][j] = logBetaGain * luminance[i][j]; - } - mean = 0.f; stddv = 0.f; // I call mean_stddv2 instead of mean_stddv ==> logBetaGain diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc index e6664dc06..ff3946e5c 100644 --- a/rtengine/rawimagesource.cc +++ b/rtengine/rawimagesource.cc @@ -1842,49 +1842,83 @@ void RawImageSource::retinex(RAWParams raw, ColorManagementParams cmp, RetinexPa bool useHsl = deh.retinexcolorspace == "HSLLOG"; bool useHslLin = deh.retinexcolorspace == "HSLLIN"; + if(useHsl || useHslLin) { +#ifdef _OPENMP + #pragma omp parallel +#endif + { +#ifdef __SSE2__ + vfloat c32768 = F2V(32768.f); +#endif +#ifdef _OPENMP + #pragma omp for +#endif + + for (int i = border; i < H - border; i++ ) + { + int j = border; +#ifdef __SSE2__ + + for (; j < W - border - 3; j += 4) { + vfloat H, S, L; + Color::rgb2hsl(LVFU(red[i][j]), LVFU(green[i][j]), LVFU(blue[i][j]), H, S, L); + _mm_storeu_ps(&labdeha->a[i - border][j - border], H); + _mm_storeu_ps(&labdeha->b[i - border][j - border], S); + L *= c32768; + _mm_storeu_ps(&labTmp[i - border][j - border], L); + + if(dehaHcontlutili) { // curve is not vectorized + for (int k = 0; k < 4; k++) { + labdeha->L[i - border][j - border + k] = cdHcurve[labTmp[i - border][j - border + k]]; + } + } else { + _mm_storeu_ps(&labdeha->L[i - border][j - border], L); + } + } + +#endif + + for (; j < W - border; j++) { + float H, S, L; + //rgb=>lab + Color::rgb2hslfloat(red[i][j], green[i][j], blue[i][j], labdeha->a[i - border][j - border], labdeha->b[i - border][j - border], L); + L *= 32768.f; + labTmp[i - border][j - border] = L; + + if(dehaHcontlutili) { + labdeha->L[i - border][j - border] = cdHcurve[labTmp[i - border][j - border]]; + } else { + labdeha->L[i - border][j - border] = labTmp[i - border][j - border]; + } + } + } + } + } else { + + // Conversion rgb -> lab is hard to vectorize because it uses a lut (that's not the main problem) + // and it uses a condition inside XYZ2Lab which is almost impossible to vectorize without making it slower... +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int i = border; i < H - border; i++ ) for (int j = border; j < W - border; j++) { - float H,S,L; + float X, Y, Z, L, aa, bb; //rgb=>lab - Color::rgb2hsl(red[i][j], green[i][j], blue[i][j],H,S,L); - // L *= 65535.f; - L *= 32768.f; + Color::rgbxyz(red[i][j], green[i][j], blue[i][j], X, Y, Z, wp); + //convert Lab + Color::XYZ2Lab(X, Y, Z, L, aa, bb); labTmp[i - border][j - border] = L; - if(dehaHcontlutili) { - L = cdHcurve[L]; //apply curve to equalize histogram + if(dehacontlutili) { + L = cdcurve[L]; //apply curve to equalize histogram } labdeha->L[i - border][j - border] = L; - labdeha->a[i - border][j - border] = H; - labdeha->b[i - border][j - border] = S; + labdeha->a[i - border][j - border] = aa; + labdeha->b[i - border][j - border] = bb; } - } else { - - // Conversion rgb -> lab is hard to vectorize because it uses a lut (that's not the main problem) - // and it uses a condition inside XYZ2Lab which is almost impossible to vectorize without making it slower... -#ifdef _OPENMP - #pragma omp parallel for -#endif - - for (int i = border; i < H - border; i++ ) - for (int j = border; j < W - border; j++) { - float X, Y, Z, L, aa, bb; - //rgb=>lab - Color::rgbxyz(red[i][j], green[i][j], blue[i][j], X, Y, Z, wp); - //convert Lab - Color::XYZ2Lab(X, Y, Z, L, aa, bb); - labTmp[i - border][j - border] = L; - - if(dehacontlutili) { - L = cdcurve[L]; //apply curve to equalize histogram - } - - labdeha->L[i - border][j - border] = L; - labdeha->a[i - border][j - border] = aa; - labdeha->b[i - border][j - border] = bb; - } } MSR(labdeha->L, labTmp, WNew, HNew, deh, dehatransmissionCurve, minCD, maxCD, mini, maxi, Tmean, Tsigma, Tmin, Tmax); @@ -1892,65 +1926,78 @@ void RawImageSource::retinex(RAWParams raw, ColorManagementParams cmp, RetinexPa delete [] labTmpBuffer; if(useHsl || useHslLin) { +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int i = border; i < H - border; i++ ) { int j = border; +#ifdef __SSE2__ + vfloat c32768 = F2V(32768.f); + + for (; j < W - border - 3; j += 4) { + vfloat R, G, B; + Color::hsl2rgb(LVFU(labdeha->a[i - border][j - border]), LVFU(labdeha->b[i - border][j - border]), LVFU(labdeha->L[i - border][j - border]) / c32768, R, G, B); + + _mm_storeu_ps(&red[i][j], R); + _mm_storeu_ps(&green[i][j], G); + _mm_storeu_ps(&blue[i][j], B); + } + +#endif + for (; j < W - border; j++) { - float R, G, B; - // Color::hsl2rgb(labdeha->a[i - border][j - border],labdeha->b[i - border][j - border],labdeha->L[i - border][j - border]/65535.f,R,G,B); - Color::hsl2rgb(labdeha->a[i - border][j - border],labdeha->b[i - border][j - border],labdeha->L[i - border][j - border]/32768.f,R,G,B); - red[i][j] = R; - green[i][j] = G; - blue[i][j] = B; + Color::hsl2rgbfloat(labdeha->a[i - border][j - border], labdeha->b[i - border][j - border], labdeha->L[i - border][j - border] / 32768.f, red[i][j], green[i][j], blue[i][j]); } } } else { #ifdef __SSE2__ - vfloat wipv[3][3]; + vfloat wipv[3][3]; - for(int i = 0; i < 3; i++) - for(int j = 0; j < 3; j++) { - wipv[i][j] = F2V(wiprof[i][j]); - } + for(int i = 0; i < 3; i++) + for(int j = 0; j < 3; j++) { + wipv[i][j] = F2V(wiprof[i][j]); + } #endif // __SSE2__ #ifdef _OPENMP - #pragma omp parallel for + #pragma omp parallel for #endif - for (int i = border; i < H - border; i++ ) { - int j = border; + for (int i = border; i < H - border; i++ ) { + int j = border; #ifdef __SSE2__ - for (; j < W - border - 3; j += 4) { - vfloat L2, a2, b2, x_, y_, z_; - vfloat R, G, B; - L2 = LVFU(labdeha->L[i - border][j - border]); - a2 = LVFU(labdeha->a[i - border][j - border]); - b2 = LVFU(labdeha->b[i - border][j - border]); - Color::Lab2XYZ(L2, a2, b2, x_, y_, z_) ; - Color::xyz2rgb(x_, y_, z_, R, G, B, wipv); - _mm_storeu_ps(&red[i][j], R); - _mm_storeu_ps(&green[i][j], G); - _mm_storeu_ps(&blue[i][j], B); - } + for (; j < W - border - 3; j += 4) { + vfloat L2, a2, b2, x_, y_, z_; + vfloat R, G, B; + L2 = LVFU(labdeha->L[i - border][j - border]); + a2 = LVFU(labdeha->a[i - border][j - border]); + b2 = LVFU(labdeha->b[i - border][j - border]); + Color::Lab2XYZ(L2, a2, b2, x_, y_, z_) ; + Color::xyz2rgb(x_, y_, z_, R, G, B, wipv); + _mm_storeu_ps(&red[i][j], R); + _mm_storeu_ps(&green[i][j], G); + _mm_storeu_ps(&blue[i][j], B); + } #endif - for (; j < W - border; j++) { - float L2, a2, b2, x_, y_, z_; - float R, G, B; - L2 = labdeha->L[i - border][j - border]; - a2 = labdeha->a[i - border][j - border]; - b2 = labdeha->b[i - border][j - border]; - Color::Lab2XYZ(L2, a2, b2, x_, y_, z_) ; - Color::xyz2rgb(x_, y_, z_, R, G, B, wip); - red[i][j] = R; - green[i][j] = G; - blue[i][j] = B; + for (; j < W - border; j++) { + float L2, a2, b2, x_, y_, z_; + float R, G, B; + L2 = labdeha->L[i - border][j - border]; + a2 = labdeha->a[i - border][j - border]; + b2 = labdeha->b[i - border][j - border]; + Color::Lab2XYZ(L2, a2, b2, x_, y_, z_) ; + Color::xyz2rgb(x_, y_, z_, R, G, B, wip); + red[i][j] = R; + green[i][j] = G; + blue[i][j] = B; + } } } - } delete labdeha;