diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc index af2083d8e..1011ae7b7 100644 --- a/rtengine/rt_algo.cc +++ b/rtengine/rt_algo.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #ifdef _OPENMP @@ -31,7 +32,6 @@ #include "rt_algo.h" #include "rt_math.h" #include "sleef.c" -#include "jaggedarray.h" namespace { float calcBlendFactor(float val, float threshold) { @@ -51,6 +51,113 @@ vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) { return onev / (onev + xexpf(c16v - c16v * valv / thresholdv)); } #endif + +float tileAverage(float **data, size_t tileY, size_t tileX, size_t tilesize) { + + float avg = 0.f; +#ifdef __SSE2__ + vfloat avgv = ZEROV; +#endif + for (std::size_t y = tileY; y < tileY + tilesize; ++y) { + std::size_t x = tileX; +#ifdef __SSE2__ + for (; x < tileX + tilesize - 3; x += 4) { + avgv += LVFU(data[y][x]); + } +#endif + for (; x < tileX + tilesize; ++x) { + avg += data[y][x]; + } + } +#ifdef __SSE2__ + avg += vhadd(avgv); +#endif + return avg / rtengine::SQR(tilesize); +} + +float tileVariance(float **data, size_t tileY, size_t tileX, size_t tilesize, float avg) { + + float var = 0.f; +#ifdef __SSE2__ + vfloat varv = ZEROV; + const vfloat avgv = F2V(avg); +#endif + for (std::size_t y = tileY; y < tileY + tilesize; ++y) { + std::size_t x = tileX; +#ifdef __SSE2__ + for (; x < tileX + tilesize - 3; x += 4) { + varv += SQRV(LVFU(data[y][x]) - avgv); + } +#endif + for (; x < tileX + tilesize; ++x) { + var += rtengine::SQR(data[y][x] - avg); + } + } +#ifdef __SSE2__ + var += vhadd(varv); +#endif + return var / (rtengine::SQR(tilesize) * avg); +} + +float calcContrastThreshold(float** luminance, int tileY, int tileX, int tilesize) { + + constexpr float scale = 0.0625f / 327.68f; + std::vector> blend(tilesize - 4, std::vector(tilesize - 4)); + +#ifdef __SSE2__ + const vfloat scalev = F2V(scale); +#endif + + for(int j = tileY + 2; j < tileY + tilesize - 2; ++j) { + int i = tileX + 2; +#ifdef __SSE2__ + for(; i < tileX + tilesize - 5; i += 4) { + vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + + SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; + STVFU(blend[j - tileY - 2][i - tileX - 2], contrastv); + } +#endif + for(; i < tileX + tilesize - 2; ++i) { + + float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + + rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; + + blend[j - tileY - 2][i - tileX - 2] = contrast; + } + } + + const float limit = rtengine::SQR(tilesize - 4) / 100.f; + + int c; + for (c = 1; c < 100; ++c) { + const float contrastThreshold = c / 100.f; + float sum = 0.f; +#ifdef __SSE2__ + const vfloat contrastThresholdv = F2V(contrastThreshold); + vfloat sumv = ZEROV; +#endif + + for(int j = 0; j < tilesize - 4; ++j) { + int i = 0; +#ifdef __SSE2__ + for(; i < tilesize - 7; i += 4) { + sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv); + } +#endif + for(; i < tilesize - 4; ++i) { + sum += calcBlendFactor(blend[j][i], contrastThreshold); + } + } +#ifdef __SSE2__ + sum += vhadd(sumv); +#endif + if (sum <= limit) { + break; + } + } + + return c / 100.f; +} } namespace rtengine @@ -194,230 +301,173 @@ void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) { - if(contrastThreshold == 0.f && !autoContrast) { + if (autoContrast) { + constexpr float minLuminance = 2000.f; + constexpr float maxLuminance = 20000.f; + constexpr float minTileVariance = 0.5f; + for (int pass = 0; pass < 2; ++pass) { + const int tilesize = 80 / (pass + 1); + const int skip = pass == 0 ? tilesize : tilesize / 4; + const int numTilesW = W / skip - 3 * pass; + const int numTilesH = H / skip - 3 * pass; + std::vector> variances(numTilesH, std::vector(numTilesW)); + +#ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) +#endif + for (int i = 0; i < numTilesH; ++i) { + const int tileY = i * skip; + for (int j = 0; j < numTilesW; ++j) { + const int tileX = j * skip; + const float avg = tileAverage(luminance, tileY, tileX, tilesize); + if (avg < minLuminance || avg > maxLuminance) { + // too dark or too bright => skip the tile + variances[i][j] = RT_INFINITY_F; + continue; + } else { + variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); + // exclude tiles with a variance less than minTileVariance + variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j]; + } + } + } + + float minvar = RT_INFINITY_F; + int minI = 0, minJ = 0; + for (int i = 0; i < numTilesH; ++i) { + for (int j = 0; j < numTilesW; ++j) { + if (variances[i][j] < minvar) { + minvar = variances[i][j]; + minI = i; + minJ = j; + } + } + } + + if (minvar <= 1.f || pass == 1) { + const int minY = skip * minI; + const int minX = skip * minJ; + if (pass == 0) { + // a variance <= 1 means we already found a flat region and can skip second pass + contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize); + break; + } else { + // in second pass we allow a variance of 4 + // we additionally scan the tiles +-skip pixels around the best tile from pass 2 + // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate + // fortunately the scan is quite fast, so we use only one core and don't parallelize + const int topLeftYStart = std::max(minY - skip, 0); + const int topLeftXStart = std::max(minX - skip, 0); + const int topLeftYEnd = std::min(minY + skip, H - tilesize); + const int topLeftXEnd = std::min(minX + skip, W - tilesize); + const int numTilesH = topLeftYEnd - topLeftYStart + 1; + const int numTilesW = topLeftXEnd - topLeftXStart + 1; + + std::vector> variances(numTilesH, std::vector(numTilesW)); + for (int i = 0; i < numTilesH; ++i) { + const int tileY = topLeftYStart + i; + for (int j = 0; j < numTilesW; ++j) { + const int tileX = topLeftXStart + j; + const float avg = tileAverage(luminance, tileY, tileX, tilesize); + + if (avg < minLuminance || avg > maxLuminance) { + // too dark or too bright => skip the tile + variances[i][j] = RT_INFINITY_F; + continue; + } else { + variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg); + // exclude tiles with a variance less than minTileVariance + variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j]; + } + } + } + + float minvar = RT_INFINITY_F; + int minI = 0, minJ = 0; + for (int i = 0; i < numTilesH; ++i) { + for (int j = 0; j < numTilesW; ++j) { + if (variances[i][j] < minvar) { + minvar = variances[i][j]; + minI = i; + minJ = j; + } + } + } + + contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f; + } + } + } + } + + if(contrastThreshold == 0.f) { for(int j = 0; j < H; ++j) { for(int i = 0; i < W; ++i) { blend[j][i] = amount; } } } else { - if (autoContrast) { - for (int pass = 0; pass < 2; ++pass) { - const int tilesize = 80 / (pass + 1); - const int skip = pass == 0 ? tilesize : tilesize / 4; - const int numTilesW = W / skip - 3 * pass; - const int numTilesH = H / skip - 3 * pass; - std::vector> variances(numTilesH, std::vector(numTilesW)); - + constexpr float scale = 0.0625f / 327.68f; #ifdef _OPENMP - #pragma omp parallel for schedule(dynamic) + #pragma omp parallel #endif - for (int i = 0; i < numTilesH; ++i) { - const int tileY = i * skip; - for (int j = 0; j < numTilesW; ++j) { - const int tileX = j * skip; + { #ifdef __SSE2__ - vfloat avgv = ZEROV; - for (int y = tileY; y < tileY + tilesize; ++y) { - for (int x = tileX; x < tileX + tilesize; x += 4) { - avgv += LVFU(luminance[y][x]); - } - } - float avg = vhadd(avgv); -#else - float avg = 0.f; - for (int y = tileY; y < tileY + tilesize; ++y) { - for (int x = tileX; x < tileX + tilesize; ++x) { - avg += luminance[y][x]; - } - } + const vfloat contrastThresholdv = F2V(contrastThreshold); + const vfloat scalev = F2V(scale); + const vfloat amountv = F2V(amount); #endif - avg /= SQR(tilesize); - if (avg < 2000.f || avg > 20000.f) { - // too dark or too bright => skip the tile - variances[i][j] = RT_INFINITY_F; - continue; - } +#ifdef _OPENMP + #pragma omp for schedule(dynamic,16) +#endif + + for(int j = 2; j < H - 2; ++j) { + int i = 2; #ifdef __SSE2__ - vfloat varv = ZEROV; - avgv = F2V(avg); - for (int y = tileY; y < tileY + tilesize; ++y) { - for (int x = tileX; x < tileX + tilesize; x +=4) { - varv += SQRV(LVFU(luminance[y][x]) - avgv); - } - } - float var = vhadd(varv); -#else - float var = 0.f; - for (int y = tileY; y < tileY + tilesize; ++y) { - for (int x = tileX; x < tileX + tilesize; ++x) { - var += SQR(luminance[y][x] - avg); - } - } + for(; i < W - 5; i += 4) { + vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + + SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; + + STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv)); + } #endif - var /= (SQR(tilesize) * avg); - variances[i][j] = var; - } - } + for(; i < W - 2; ++i) { - float minvar = RT_INFINITY_F; - int minI = 0, minJ = 0; - for (int i = 0; i < numTilesH; ++i) { - for (int j = 0; j < numTilesW; ++j) { - if (variances[i][j] < minvar) { - minvar = variances[i][j]; - minI = i; - minJ = j; - } - } - } + float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + + rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; - const int minY = skip * minI; - const int minX = skip * minJ; - - if (minvar <= 1.f || pass == 1) { - // a variance <= 1 means we already found a flat region and can skip second pass - // in second pass we allow a variance of 2 - JaggedArray Lum(tilesize, tilesize); - JaggedArray Blend(tilesize, tilesize); - for (int i = 0; i < tilesize; ++i) { - for (int j = 0; j < tilesize; ++j) { - Lum[i][j] = luminance[i + minY][j + minX]; - } - } - contrastThreshold = (pass == 0 || minvar <= 4.f) ? calcContrastThreshold(Lum, Blend, tilesize, tilesize) / 100.f : 0.f; - break; + blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold); } } - } - if(contrastThreshold == 0.f) { - for(int j = 0; j < H; ++j) { - for(int i = 0; i < W; ++i) { - blend[j][i] = amount; - } - } - } else { - constexpr float scale = 0.0625f / 327.68f; #ifdef _OPENMP - #pragma omp parallel + #pragma omp single #endif { -#ifdef __SSE2__ - const vfloat contrastThresholdv = F2V(contrastThreshold); - const vfloat scalev = F2V(scale); - const vfloat amountv = F2V(amount); -#endif -#ifdef _OPENMP - #pragma omp for schedule(dynamic,16) -#endif - - for(int j = 2; j < H - 2; ++j) { - int i = 2; -#ifdef __SSE2__ - for(; i < W - 5; i += 4) { - vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + - SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; - - STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv)); - } -#endif - for(; i < W - 2; ++i) { - - float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + - rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; - - blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold); + // upper border + for(int j = 0; j < 2; ++j) { + for(int i = 2; i < W - 2; ++i) { + blend[j][i] = blend[2][i]; } } - -#ifdef _OPENMP - #pragma omp single -#endif - { - // upper border - for(int j = 0; j < 2; ++j) { - for(int i = 2; i < W - 2; ++i) { - blend[j][i] = blend[2][i]; - } - } - // lower border - for(int j = H - 2; j < H; ++j) { - for(int i = 2; i < W - 2; ++i) { - blend[j][i] = blend[H-3][i]; - } - } - for(int j = 0; j < H; ++j) { - // left border - blend[j][0] = blend[j][1] = blend[j][2]; - // right border - blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3]; + // lower border + for(int j = H - 2; j < H; ++j) { + for(int i = 2; i < W - 2; ++i) { + blend[j][i] = blend[H-3][i]; } } - - // blur blend mask to smooth transitions - gaussianBlur(blend, blend, W, H, 2.0); + for(int j = 0; j < H; ++j) { + // left border + blend[j][0] = blend[j][1] = blend[j][2]; + // right border + blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3]; + } } + + // blur blend mask to smooth transitions + gaussianBlur(blend, blend, W, H, 2.0); } } } -int calcContrastThreshold(float** luminance, float **blend, int W, int H) { - - constexpr float scale = 0.0625f / 327.68f; - -#ifdef __SSE2__ - const vfloat scalev = F2V(scale); -#endif - - for(int j = 2; j < H - 2; ++j) { - int i = 2; -#ifdef __SSE2__ - for(; i < W - 5; i += 4) { - vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) + - SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev; - STVFU(blend[j -2 ][i - 2], contrastv); - } -#endif - for(; i < W - 2; ++i) { - - float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + - rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale; - - blend[j -2][i- 2] = contrast; - } - } - - const float limit = (W - 4) * (H - 4) / 100.f; - - int c; - for (c = 1; c < 100; ++c) { - const float contrastThreshold = c / 100.f; - float sum = 0.f; -#ifdef __SSE2__ - const vfloat contrastThresholdv = F2V(contrastThreshold); - vfloat sumv = ZEROV; -#endif - - for(int j = 0; j < H - 4; ++j) { - int i = 0; -#ifdef __SSE2__ - for(; i < W - 7; i += 4) { - sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv); - } -#endif - for(; i < W - 4; ++i) { - sum += calcBlendFactor(blend[j][i], contrastThreshold); - } - } -#ifdef __SSE2__ - sum += vhadd(sumv); -#endif - if (sum <= limit) { - break; - } - } - - return c; -} } diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h index 0207e6f57..a8e2e3e23 100644 --- a/rtengine/rt_algo.h +++ b/rtengine/rt_algo.h @@ -25,5 +25,4 @@ namespace rtengine { void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true); void buildBlendMask(float** luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false); -int calcContrastThreshold(float** luminance, float **blend, int W, int H); }