diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc index bd45819d9..a2ca3cac4 100644 --- a/rtengine/rt_algo.cc +++ b/rtengine/rt_algo.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #ifdef _OPENMP @@ -31,7 +32,6 @@ #include "rt_algo.h" #include "rt_math.h" #include "sleef.c" -#include "jaggedarray.h" namespace { float calcBlendFactor(float val, float threshold) { @@ -54,46 +54,110 @@ vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) { float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) { + float avg = 0.f; #ifdef __SSE2__ vfloat avgv = ZEROV; - for (size_t y = tileY; y < tileY + tilesize; ++y) { - for (size_t x = tileX; x < tileX + tilesize; x += 4) { +#endif + for (std::size_t y = tileY; y < tileY + tilesize; ++y) { + std::size_t x = tileX; +#ifdef __SSE2__ + for (; x < tileX + tilesize - 3; x += 4) { avgv += LVFU(data[y][x]); } - } - const float avg = vhadd(avgv); -#else - float avg = 0.f; - for (size_t y = tileY; y < tileY + tilesize; ++y) { - for (size_t x = tileX; x < tileX + tilesize; ++x) { +#endif + for (; x < tileX + tilesize; ++x) { avg += data[y][x]; } } +#ifdef __SSE2__ + avg += vhadd(avgv); #endif return avg / rtengine::SQR(tilesize); } float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, float avg) { + float var = 0.f; #ifdef __SSE2__ vfloat varv = ZEROV; const vfloat avgv = F2V(avg); - for (size_t y = tileY; y < tileY + tilesize; ++y) { - for (size_t x = tileX; x < tileX + tilesize; x +=4) { +#endif + for (std::size_t y = tileY; y < tileY + tilesize; ++y) { + std::size_t x = tileX; +#ifdef __SSE2__ + for (; x < tileX + tilesize - 3; x += 4) { varv += SQRV(LVFU(data[y][x]) - avgv); } - } - const float var = vhadd(varv); -#else - float var = 0.f; - for (size_t y = tileY; y < tileY + tilesize; ++y) { - for (size_t x = tileX;; x < tileX + tilesize; ++x) { +#endif + for (; x < tileX + tilesize; ++x) { var += rtengine::SQR(data[y][x] - avg); } } +#ifdef __SSE2__ + var += vhadd(varv); #endif return var / (rtengine::SQR(tilesize) * avg); } + +float calcContrastThreshold(float** luminance, int tileY, int tileX, int tilesize) { + + constexpr float scale = 0.0625f / 327.68f; + std::vector> blend(tilesize - 4, std::vector(tilesize - 4)); + +#ifdef __SSE2__ + const vfloat scalev = F2V(scale); +#endif + + for(int j = tileY + 2; j < tileY + tilesize - 2; ++j) { + int i = tileX + 2; +#ifdef __SSE2__ + for(; i < tileX + tilesize - 5; i += 4) { + vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + + SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; + STVFU(blend[j - tileY - 2][i - tileX - 2], contrastv); + } +#endif + for(; i < tileX + tilesize - 2; ++i) { + + float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + + rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; + + blend[j - tileY - 2][i - tileX - 2] = contrast; + } + } + + const float limit = rtengine::SQR(tilesize - 4) / 100.f; + + int c; + for (c = 1; c < 100; ++c) { + const float contrastThreshold = c / 100.f; + float sum = 0.f; +#ifdef __SSE2__ + const vfloat contrastThresholdv = F2V(contrastThreshold); + vfloat sumv = ZEROV; +#endif + + for(int j = 0; j < tilesize - 4; ++j) { + int i = 0; +#ifdef __SSE2__ + for(; i < tilesize - 7; i += 4) { + sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv); + } +#endif + for(; i < tilesize - 4; ++i) { + sum += calcBlendFactor(blend[j][i], contrastThreshold); + } + } +#ifdef __SSE2__ + sum += vhadd(sumv); +#endif + if (sum <= limit) { + break; + } + } + + return c / 100.f; +} } namespace rtengine @@ -237,254 +301,168 @@ void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) { - if(contrastThreshold == 0.f && !autoContrast) { + if (autoContrast) { + constexpr float minLuminance = 2000.f; + constexpr float maxLuminance = 20000.f; + for (int pass = 0; pass < 2; ++pass) { + const int tilesize = 80 / (pass + 1); + const int skip = pass == 0 ? tilesize : tilesize / 4; + const int numTilesW = W / skip - 3 * pass; + const int numTilesH = H / skip - 3 * pass; + std::vector> variances(numTilesH, std::vector(numTilesW)); + +#ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) +#endif + for (int i = 0; i < numTilesH; ++i) { + const int tileY = i * skip; + for (int j = 0; j < numTilesW; ++j) { + const int tileX = j * skip; + const float avg = tileAverage(luminance, tileY, tileX, tilesize); + if (avg < minLuminance || avg > maxLuminance) { + // too dark or too bright => skip the tile + variances[i][j] = RT_INFINITY_F; + continue; + } else { + variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); + } + } + } + + float minvar = RT_INFINITY_F; + int minI = 0, minJ = 0; + for (int i = 0; i < numTilesH; ++i) { + for (int j = 0; j < numTilesW; ++j) { + if (variances[i][j] < minvar) { + minvar = variances[i][j]; + minI = i; + minJ = j; + } + } + } + + const int minY = skip * minI; + const int minX = skip * minJ; + + if (minvar <= 1.f || pass == 1) { + if (pass == 0) { + // a variance <= 1 means we already found a flat region and can skip second pass + contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize); + break; + } else { + // in second pass we allow a variance of 4 + // we additionally scan the tiles +-skip pixels around the best tile from pass 2 + // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate + // fortunately the scan is quite fast, so we use only one core and don't parallelize + const int topLeftYStart = std::max(minY - skip, 0); + const int topLeftXStart = std::max(minX - skip, 0); + const int topLeftYEnd = std::min(minY + skip, H - tilesize); + const int topLeftXEnd = std::min(minX + skip, W - tilesize); + const int numTilesH = topLeftYEnd - topLeftYStart + 1; + const int numTilesW = topLeftXEnd - topLeftXStart + 1; + + std::vector> variances(numTilesH, std::vector(numTilesW)); + for (int i = 0; i < numTilesH; ++i) { + const int tileY = topLeftYStart + i; + for (int j = 0; j < numTilesW; ++j) { + const int tileX = topLeftXStart + j; + const float avg = tileAverage(luminance, tileY, tileX, tilesize); + + if (avg < minLuminance || avg > maxLuminance) { + // too dark or too bright => skip the tile + variances[i][j] = RT_INFINITY_F; + continue; + } else { + variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); + } + } + } + + float minvar = RT_INFINITY_F; + int minI = 0, minJ = 0; + for (int i = 0; i < numTilesH; ++i) { + for (int j = 0; j < numTilesW; ++j) { + if (variances[i][j] < minvar) { + minvar = variances[i][j]; + minI = i; + minJ = j; + } + } + } + contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f; + } + } + } + } + + if(contrastThreshold == 0.f) { for(int j = 0; j < H; ++j) { for(int i = 0; i < W; ++i) { blend[j][i] = amount; } } } else { - if (autoContrast) { - constexpr float minLuminance = 2000.f; - constexpr float maxLuminance = 20000.f; - for (int pass = 0; pass < 2; ++pass) { - const int tilesize = 80 / (pass + 1); - const int skip = pass == 0 ? tilesize : tilesize / 4; - const int numTilesW = W / skip - 3 * pass; - const int numTilesH = H / skip - 3 * pass; - std::vector> variances(numTilesH, std::vector(numTilesW)); - + constexpr float scale = 0.0625f / 327.68f; #ifdef _OPENMP - #pragma omp parallel for schedule(dynamic) + #pragma omp parallel +#endif + { +#ifdef __SSE2__ + const vfloat contrastThresholdv = F2V(contrastThreshold); + const vfloat scalev = F2V(scale); + const vfloat amountv = F2V(amount); #endif - for (int i = 0; i < numTilesH; ++i) { - const int tileY = i * skip; - for (int j = 0; j < numTilesW; ++j) { - const int tileX = j * skip; - const float avg = tileAverage(luminance, tileY, tileX, tilesize); - if (avg < minLuminance || avg > maxLuminance) { - // too dark or too bright => skip the tile - variances[i][j] = RT_INFINITY_F; - continue; - } else { - variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); - } - } - } - - float minvar = RT_INFINITY_F; - int minI = 0, minJ = 0; - for (int i = 0; i < numTilesH; ++i) { - for (int j = 0; j < numTilesW; ++j) { - if (variances[i][j] < minvar) { - minvar = variances[i][j]; - minI = i; - minJ = j; - } - } - } - - const int minY = skip * minI; - const int minX = skip * minJ; - - if (minvar <= 1.f || pass == 1) { - if (pass == 0) { - // a variance <= 1 means we already found a flat region and can skip second pass - JaggedArray Lum(tilesize, tilesize); - JaggedArray Blend(tilesize, tilesize); - for (int i = 0; i < tilesize; ++i) { - for (int j = 0; j < tilesize; ++j) { - Lum[i][j] = luminance[i + minY][j + minX]; - } - } - contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f; - break; - } else { - // in second pass we allow a variance of 4 - // we additionally scan the tiles +-skip pixels around the best tile from pass 2 - // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate - // fortunately the scan is quite fast, so we use only one core and don't parallelize - const int topLeftYStart = std::max(minY - skip, 0); - const int topLeftXStart = std::max(minX - skip, 0); - const int topLeftYEnd = std::min(minY + skip, H - tilesize); - const int topLeftXEnd = std::min(minX + skip, W - tilesize); - const int numTilesH = topLeftYEnd - topLeftYStart + 1; - const int numTilesW = topLeftXEnd - topLeftXStart + 1; - - std::vector> variances(numTilesH, std::vector(numTilesW)); - for (int i = 0; i < numTilesH; ++i) { - const int tileY = topLeftYStart + i; - for (int j = 0; j < numTilesW; ++j) { - const int tileX = topLeftXStart + j; - const float avg = tileAverage(luminance, tileY, tileX, tilesize); - - if (avg < minLuminance || avg > maxLuminance) { - // too dark or too bright => skip the tile - variances[i][j] = RT_INFINITY_F; - continue; - } else { - variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); - } - } - } - - float minvar = RT_INFINITY_F; - int minI = 0, minJ = 0; - for (int i = 0; i < numTilesH; ++i) { - for (int j = 0; j < numTilesW; ++j) { - if (variances[i][j] < minvar) { - minvar = variances[i][j]; - minI = i; - minJ = j; - } - } - } - if (minvar <= 4.f) { - JaggedArray Lum(tilesize, tilesize); - JaggedArray Blend(tilesize, tilesize); - const int minY = topLeftYStart + minI; - const int minX = topLeftXStart + minJ; - for (int i = 0; i < tilesize; ++i) { - for (int j = 0; j < tilesize; ++j) { - Lum[i][j] = luminance[i + minY][j + minX]; - } - } - contrastThreshold = calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f; - } else { - contrastThreshold = 0.f; - } - } - } - } - } - - if(contrastThreshold == 0.f) { - for(int j = 0; j < H; ++j) { - for(int i = 0; i < W; ++i) { - blend[j][i] = amount; - } - } - } else { - constexpr float scale = 0.0625f / 327.68f; #ifdef _OPENMP - #pragma omp parallel + #pragma omp for schedule(dynamic,16) +#endif + + for(int j = 2; j < H - 2; ++j) { + int i = 2; +#ifdef __SSE2__ + for(; i < W - 5; i += 4) { + vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + + SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; + + STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv)); + } +#endif + for(; i < W - 2; ++i) { + + float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + + rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; + + blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold); + } + } + +#ifdef _OPENMP + #pragma omp single #endif { -#ifdef __SSE2__ - const vfloat contrastThresholdv = F2V(contrastThreshold); - const vfloat scalev = F2V(scale); - const vfloat amountv = F2V(amount); -#endif -#ifdef _OPENMP - #pragma omp for schedule(dynamic,16) -#endif - - for(int j = 2; j < H - 2; ++j) { - int i = 2; -#ifdef __SSE2__ - for(; i < W - 5; i += 4) { - vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + - SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; - - STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv)); - } -#endif - for(; i < W - 2; ++i) { - - float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + - rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; - - blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold); + // upper border + for(int j = 0; j < 2; ++j) { + for(int i = 2; i < W - 2; ++i) { + blend[j][i] = blend[2][i]; } } - -#ifdef _OPENMP - #pragma omp single -#endif - { - // upper border - for(int j = 0; j < 2; ++j) { - for(int i = 2; i < W - 2; ++i) { - blend[j][i] = blend[2][i]; - } - } - // lower border - for(int j = H - 2; j < H; ++j) { - for(int i = 2; i < W - 2; ++i) { - blend[j][i] = blend[H-3][i]; - } - } - for(int j = 0; j < H; ++j) { - // left border - blend[j][0] = blend[j][1] = blend[j][2]; - // right border - blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3]; + // lower border + for(int j = H - 2; j < H; ++j) { + for(int i = 2; i < W - 2; ++i) { + blend[j][i] = blend[H-3][i]; } } - - // blur blend mask to smooth transitions - gaussianBlur(blend, blend, W, H, 2.0); + for(int j = 0; j < H; ++j) { + // left border + blend[j][0] = blend[j][1] = blend[j][2]; + // right border + blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3]; + } } + + // blur blend mask to smooth transitions + gaussianBlur(blend, blend, W, H, 2.0); } } } -int calcContrastThreshold(float** luminance, float **blend, int W, int H) { - - constexpr float scale = 0.0625f / 327.68f; - -#ifdef __SSE2__ - const vfloat scalev = F2V(scale); -#endif - - for(int j = 2; j < H - 2; ++j) { - int i = 2; -#ifdef __SSE2__ - for(; i < W - 5; i += 4) { - vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + - SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; - STVFU(blend[j -2 ][i - 2], contrastv); - } -#endif - for(; i < W - 2; ++i) { - - float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + - rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; - - blend[j -2][i- 2] = contrast; - } - } - - const float limit = (W - 4) * (H - 4) / 100.f; - - int c; - for (c = 1; c < 100; ++c) { - const float contrastThreshold = c / 100.f; - float sum = 0.f; -#ifdef __SSE2__ - const vfloat contrastThresholdv = F2V(contrastThreshold); - vfloat sumv = ZEROV; -#endif - - for(int j = 0; j < H - 4; ++j) { - int i = 0; -#ifdef __SSE2__ - for(; i < W - 7; i += 4) { - sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv); - } -#endif - for(; i < W - 4; ++i) { - sum += calcBlendFactor(blend[j][i], contrastThreshold); - } - } -#ifdef __SSE2__ - sum += vhadd(sumv); -#endif - if (sum <= limit) { - break; - } - } - - return c; -} } diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h index 0207e6f57..a8e2e3e23 100644 --- a/rtengine/rt_algo.h +++ b/rtengine/rt_algo.h @@ -25,5 +25,4 @@ namespace rtengine { void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true); void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false); -int calcContrastThreshold(float** luminance, float **blend, int W, int H); }