diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h index da302964b..3020278b2 100644 --- a/rtengine/boxblur.h +++ b/rtengine/boxblur.h @@ -204,15 +204,15 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, tempv = tempv / lenv; temp1v = temp1v / lenv; - STVFU( dst[0][col], tempv); - STVFU( dst[0][col + 4], temp1v); + STVFU(dst[0][col], tempv); + STVFU(dst[0][col + 4], temp1v); for (int row = 1; row <= rady; row++) { lenp1v = lenv + onev; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v; - STVFU( dst[row][col], tempv); - STVFU( dst[row][col + 4], temp1v); + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); lenv = lenp1v; } @@ -221,16 +221,16 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, for (int row = rady + 1; row < H - rady; row++) { tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ; - STVFU( dst[row][col], tempv); - STVFU( dst[row][col + 4], temp1v); + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); } for (int row = H - rady; row < H; row++) { lenm1v = lenv - onev; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v; - STVFU( dst[row][col], tempv); - STVFU( dst[row][col + 4], temp1v); + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); lenv = lenm1v; } } @@ -312,6 +312,221 @@ template void boxblur (T** src, A** dst, T* buffer, int radx, } +inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread) +{ + //box blur using rowbuffers and linebuffers instead of a full size buffer + + if (radius == 0) { + if (src != dst) { +#ifdef _OPENMP + #pragma omp parallel for if (multiThread) +#endif + + for (int row = 0; row < H; row++) { + for (int col = 0; col < W; col++) { + dst[row][col] = src[row][col]; + } + } + } + return; + } + + constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache +#ifdef _OPENMP + #pragma omp parallel if (multiThread) +#endif + { + float* const buffer = new float[std::max(W, 8 * H)]; + //horizontal blur + float* const lineBuffer = buffer; +#ifdef _OPENMP + #pragma omp for +#endif + for (int row = 0; row < H; row++) { + float len = radius + 1; + float tempval = src[row][0]; + lineBuffer[0] = tempval; + for (int j = 1; j <= radius; j++) { + tempval += src[row][j]; + } + + tempval /= len; + dst[row][0] = tempval; + + for (int col = 1; col <= radius; col++) { + lineBuffer[col] = src[row][col]; + dst[row][col] = tempval = (tempval * len + src[row][col + radius]) / (len + 1); + len ++; + } + + for (int col = radius + 1; col < W - radius; col++) { + lineBuffer[col] = src[row][col]; + dst[row][col] = tempval = tempval + (src[row][col + radius] - lineBuffer[col - radius - 1]) / len; + } + + for (int col = W - radius; col < W; col++) { + dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1); + len --; + } + } + + //vertical blur +#ifdef __SSE2__ + vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer; + vfloat leninitv = F2V(radius + 1); + vfloat onev = F2V(1.f); + vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv; + +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int col = 0; col < W - 7; col += 8) { + lenv = leninitv; + tempv = LVFU(dst[0][col]); + temp1v = LVFU(dst[0][col + 4]); + rowBuffer[0][0] = tempv; + rowBuffer[0][1] = temp1v; + + for (int i = 1; i <= radius; i++) { + tempv = tempv + LVFU(dst[i][col]); + temp1v = temp1v + LVFU(dst[i][col + 4]); + } + + tempv = tempv / lenv; + temp1v = temp1v / lenv; + STVFU(dst[0][col], tempv); + STVFU(dst[0][col + 4], temp1v); + + for (int row = 1; row <= radius; row++) { + rowBuffer[row][0] = LVFU(dst[row][col]); + rowBuffer[row][1] = LVFU(dst[row][col + 4]); + lenp1v = lenv + onev; + tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v; + temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v; + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); + lenv = lenp1v; + } + + rlenv = onev / lenv; + + for (int row = radius + 1; row < H - radius; row++) { + rowBuffer[row][0] = LVFU(dst[row][col]); + rowBuffer[row][1] = LVFU(dst[row][col + 4]); + tempv = tempv + (LVFU(dst[row + radius][col]) - rowBuffer[row - radius - 1][0]) * rlenv ; + temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - rowBuffer[row - radius - 1][1]) * rlenv ; + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); + } + + for (int row = H - radius; row < H; row++) { + lenm1v = lenv - onev; + tempv = (tempv * lenv - rowBuffer[row - radius - 1][0]) / lenm1v; + temp1v = (temp1v * lenv - rowBuffer[row - radius - 1][1]) / lenm1v; + STVFU(dst[row][col], tempv); + STVFU(dst[row][col + 4], temp1v); + lenv = lenm1v; + } + } + +#else + float (* const rowBuffer)[8] = (float(*)[8]) buffer; +#ifdef _OPENMP + #pragma omp for nowait +#endif + + for (int col = 0; col < W - numCols + 1; col += 8) { + float len = radius + 1; + + for(int k = 0; k < numCols; k++) { + rowBuffer[0][k] = dst[0][col + k]; + } + + for (int i = 1; i <= radius; i++) { + for(int k = 0; k < numCols; k++) { + dst[0][col + k] += dst[i][col + k]; + } + } + + for(int k = 0; k < numCols; k++) { + dst[0][col + k] /= len; + } + + for (int row = 1; row <= radius; row++) { + for(int k = 0; k < numCols; k++) { + rowBuffer[row][k] = dst[row][col + k]; + dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1); + } + + len ++; + } + + for (int row = radius + 1; row < H - radius; row++) { + for(int k = 0; k < numCols; k++) { + rowBuffer[row][k] = dst[row][col + k]; + dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) / len; + } + } + + for (int row = H - radius; row < H; row++) { + for(int k = 0; k < numCols; k++) { + dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1); + } + + len --; + } + } + +#endif + //vertical blur, remaining columns +#ifdef _OPENMP + #pragma omp single +#endif + { + const int remaining = W % numCols; + if (remaining > 0) { + float (* const rowBuffer)[8] = (float(*)[8]) buffer; + const int col = W - remaining; + + float len = radius + 1; + for(int k = 0; k < remaining; k++) { + rowBuffer[0][k] = dst[0][col + k]; + } + for (int i = 1; i <= radius; i++) { + for(int k = 0; k < remaining; k++) { + dst[0][col + k] += dst[i][col + k]; + } + } + for(int k = 0; k < remaining; k++) { + dst[0][col + k] /= len; + } + for (int row = 1; row <= radius; row++) { + for(int k = 0; k < remaining; k++) { + rowBuffer[row][k] = dst[row][col + k]; + dst[row][col + k] = (dst[(row - 1)][col + k] * len + dst[row + radius][col + k]) / (len + 1); + len ++; + } + } + const float rlen = 1.f / len; + for (int row = radius + 1; row < H - radius; row++) { + for(int k = 0; k < remaining; k++) { + rowBuffer[row][k] = dst[row][col + k]; + dst[row][col + k] = dst[(row - 1)][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen; + } + } + for (int row = H - radius; row < H; row++) { + for(int k = 0; k < remaining; k++) { + dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1); + len --; + } + } + } + } + delete [] buffer; + } +} + template void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H) { //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1) @@ -382,15 +597,15 @@ template void boxblur (T* src, A* dst, A* buffer, int radx, in tempv = tempv / lenv; temp1v = temp1v / lenv; - STVFU( dst[0 * W + col], tempv); - STVFU( dst[0 * W + col + 4], temp1v); + STVFU(dst[0 * W + col], tempv); + STVFU(dst[0 * W + col + 4], temp1v); for (int row = 1; row <= rady; row++) { lenp1v = lenv + onev; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v; - STVFU( dst[row * W + col], tempv); - STVFU( dst[row * W + col + 4], temp1v); + STVFU(dst[row * W + col], tempv); + STVFU(dst[row * W + col + 4], temp1v); lenv = lenp1v; } @@ -399,16 +614,16 @@ template void boxblur (T* src, A* dst, A* buffer, int radx, in for (int row = rady + 1; row < H - rady; row++) { tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ; - STVFU( dst[row * W + col], tempv); - STVFU( dst[row * W + col + 4], temp1v); + STVFU(dst[row * W + col], tempv); + STVFU(dst[row * W + col + 4], temp1v); } for (int row = H - rady; row < H; row++) { lenm1v = lenv - onev; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v; - STVFU( dst[row * W + col], tempv); - STVFU( dst[row * W + col + 4], temp1v); + STVFU(dst[row * W + col], tempv); + STVFU(dst[row * W + col + 4], temp1v); lenv = lenm1v; } } @@ -422,12 +637,12 @@ template void boxblur (T* src, A* dst, A* buffer, int radx, in } tempv = tempv / lenv; - STVFU( dst[0 * W + col], tempv); + STVFU(dst[0 * W + col], tempv); for (int row = 1; row <= rady; row++) { lenp1v = lenv + onev; tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v; - STVFU( dst[row * W + col], tempv); + STVFU(dst[row * W + col], tempv); lenv = lenp1v; } @@ -435,13 +650,13 @@ template void boxblur (T* src, A* dst, A* buffer, int radx, in for (int row = rady + 1; row < H - rady; row++) { tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ; - STVFU( dst[row * W + col], tempv); + STVFU(dst[row * W + col], tempv); } for (int row = H - rady; row < H; row++) { lenm1v = lenv - onev; tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v; - STVFU( dst[row * W + col], tempv); + STVFU(dst[row * W + col], tempv); lenv = lenm1v; } } diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc index bc7f64f05..8d19fc7a5 100644 --- a/rtengine/guidedfilter.cc +++ b/rtengine/guidedfilter.cc @@ -3,6 +3,7 @@ * This file is part of RawTherapee. * * Copyright (c) 2018 Alberto Griggio + * Optimized 2019 Ingo Weyrich * * RawTherapee is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,9 +17,9 @@ * * You should have received a copy of the GNU General Public License * along with RawTherapee. If not, see . - */ +*/ -/** +/* * This is a Fast Guided Filter implementation, derived directly from the * pseudo-code of the paper: * @@ -26,32 +27,16 @@ * by Kaiming He, Jian Sun * * available at https://arxiv.org/abs/1505.00996 - */ +*/ #include "guidedfilter.h" #include "boxblur.h" #include "rescale.h" #include "imagefloat.h" - +#define BENCHMARK +#include "StopWatch.h" namespace rtengine { -#if 0 -# define DEBUG_DUMP(arr) \ - do { \ - Imagefloat im(arr.width(), arr.height()); \ - const char *out = "/tmp/" #arr ".tif"; \ - for (int y = 0; y < im.getHeight(); ++y) { \ - for (int x = 0; x < im.getWidth(); ++x) { \ - im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \ - } \ - } \ - im.saveTIFF(out, 16); \ - } while (false) -#else -# define DEBUG_DUMP(arr) -#endif - - namespace { int calculate_subsampling(int w, int h, int r) @@ -78,15 +63,7 @@ int calculate_subsampling(int w, int h, int r) void guidedFilter(const array2D &guide, const array2D &src, array2D &dst, int r, float epsilon, bool multithread, int subsampling) { - - const int W = src.width(); - const int H = src.height(); - - if (subsampling <= 0) { - subsampling = calculate_subsampling(W, H, r); - } - - enum Op { MUL, DIVEPSILON, ADD, SUB, ADDMUL, SUBMUL }; + enum Op {MUL, DIVEPSILON, SUBMUL}; const auto apply = [=](Op op, array2D &res, const array2D &a, const array2D &b, const array2D &c=array2D()) -> void @@ -99,139 +76,107 @@ void guidedFilter(const array2D &guide, const array2D &src, array2 #endif for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { - float r; - float aa = a[y][x]; - float bb = b[y][x]; switch (op) { - case MUL: - r = aa * bb; - break; - case DIVEPSILON: - r = aa / (bb + epsilon); - break; - case ADD: - r = aa + bb; - break; - case SUB: - r = aa - bb; - break; - case ADDMUL: - r = aa * bb + c[y][x]; - break; - case SUBMUL: - r = c[y][x] - (aa * bb); - break; - default: - assert(false); - r = 0; - break; + case MUL: + res[y][x] = a[y][x] * b[y][x]; + break; + case DIVEPSILON: + res[y][x] = a[y][x] / (b[y][x] + epsilon); // note: the value of epsilon intentionally has an impact on the result. It is not only to avoid divisions by zero + break; + case SUBMUL: + res[y][x] = c[y][x] - (a[y][x] * b[y][x]); + break; + default: + assert(false); + res[y][x] = 0; + break; } - res[y][x] = r; } } }; - // use the terminology of the paper (Algorithm 2) - const array2D &I = guide; - const array2D &p = src; - array2D &q = dst; - const auto f_subsample = [=](array2D &d, const array2D &s) -> void { rescaleBilinear(s, d, multithread); }; - const auto f_upsample = f_subsample; - - const size_t w = W / subsampling; - const size_t h = H / subsampling; - - AlignedBuffer blur_buf(w * h); const auto f_mean = [&](array2D &d, array2D &s, int rad) -> void { rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1); - float **src = s; - float **dst = d; -#ifdef _OPENMP - #pragma omp parallel if (multithread) -#endif - boxblur(src, dst, blur_buf.data, rad, rad, s.width(), s.height()); + boxblur(s, d, rad, s.width(), s.height(), multithread); }; + const int W = src.width(); + const int H = src.height(); + + if (subsampling <= 0) { + subsampling = calculate_subsampling(W, H, r); + } + + const size_t w = W / subsampling; + const size_t h = H / subsampling; + const float r1 = float(r) / subsampling; + array2D I1(w, h); array2D p1(w, h); - f_subsample(I1, I); - f_subsample(p1, p); + f_subsample(I1, guide); - DEBUG_DUMP(I); - DEBUG_DUMP(p); - DEBUG_DUMP(I1); - DEBUG_DUMP(p1); + if (&guide == &src) { + f_mean(p1, I1, r1); - float r1 = float(r) / subsampling; + apply(MUL, I1, I1, I1); // I1 = I1 * I1 - array2D meanI(w, h); - f_mean(meanI, I1, r1); - DEBUG_DUMP(meanI); + f_mean(I1, I1, r1); - array2D meanp(w, h); - f_mean(meanp, p1, r1); - DEBUG_DUMP(meanp); + apply(SUBMUL, I1, p1, p1, I1); // I1 = I1 - p1 * p1 + apply(DIVEPSILON, I1, I1, I1); // I1 = I1 / (I1 + epsilon) + apply(SUBMUL, p1, I1, p1, p1); // p1 = p1 - I1 * p1 - array2D &corrIp = p1; - apply(MUL, corrIp, I1, p1); - f_mean(corrIp, corrIp, r1); - DEBUG_DUMP(corrIp); + } else { + f_subsample(p1, src); - array2D &corrI = I1; - apply(MUL, corrI, I1, I1); - f_mean(corrI, corrI, r1); - DEBUG_DUMP(corrI); + array2D meanI(w, h); + f_mean(meanI, I1, r1); - array2D &varI = corrI; - apply(SUBMUL, varI, meanI, meanI, corrI); - DEBUG_DUMP(varI); + array2D meanp(w, h); + f_mean(meanp, p1, r1); - array2D &covIp = corrIp; - apply(SUBMUL, covIp, meanI, meanp, corrIp); - DEBUG_DUMP(covIp); + apply(MUL, p1, I1, p1); - array2D &a = varI; - apply(DIVEPSILON, a, covIp, varI); - DEBUG_DUMP(a); + f_mean(p1, p1, r1); - array2D &b = covIp; - apply(SUBMUL, b, a, meanI, meanp); - DEBUG_DUMP(b); + apply(MUL, I1, I1, I1); - array2D &meana = a; - f_mean(meana, a, r1); - DEBUG_DUMP(meana); + f_mean(I1, I1, r1); - array2D &meanb = b; - f_mean(meanb, b, r1); - DEBUG_DUMP(meanb); + apply(SUBMUL, I1, meanI, meanI, I1); + apply(SUBMUL, p1, meanI, meanp, p1); + apply(DIVEPSILON, I1, p1, I1); + apply(SUBMUL, p1, I1, meanI, meanp); + } - const int Ws = meana.width(); - const int Hs = meana.height(); - const int Wd = q.width(); - const int Hd = q.height(); + f_mean(I1, I1, r1); + f_mean(p1, p1, r1); - float col_scale = float (Ws) / float (Wd); - float row_scale = float (Hs) / float (Hd); + const int Ws = I1.width(); + const int Hs = I1.height(); + const int Wd = dst.width(); + const int Hd = dst.height(); + + const float col_scale = static_cast(Ws) / static_cast(Wd); + const float row_scale = static_cast(Hs) / static_cast(Hd); #ifdef _OPENMP #pragma omp parallel for if (multithread) #endif for (int y = 0; y < Hd; ++y) { - float ymrs = y * row_scale; - + const float ymrs = y * row_scale; for (int x = 0; x < Wd; ++x) { - q[y][x] = getBilinearValue(meana, x * col_scale, ymrs) * I[y][x] + getBilinearValue(meanb, x * col_scale, ymrs); + dst[y][x] = getBilinearValue(I1, x * col_scale, ymrs) * guide[y][x] + getBilinearValue(p1, x * col_scale, ymrs); } } } diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc index 60d4cb9ff..68af84970 100644 --- a/rtengine/ipdehaze.cc +++ b/rtengine/ipdehaze.cc @@ -35,7 +35,10 @@ #include "improcfun.h" #include "procparams.h" #include "rt_algo.h" +#include "rt_algo.h" #include "rt_math.h" +#define BENCHMARK +#include "StopWatch.h" extern Options options; @@ -43,24 +46,7 @@ namespace rtengine { namespace { -#if 0 -# define DEBUG_DUMP(arr) \ - do { \ - Imagefloat im(arr.width(), arr.height()); \ - const char *out = "/tmp/" #arr ".tif"; \ - for (int y = 0; y < im.getHeight(); ++y) { \ - for (int x = 0; x < im.getWidth(); ++x) { \ - im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \ - } \ - } \ - im.saveTIFF(out, 16); \ - } while (false) -#else -# define DEBUG_DUMP(arr) -#endif - - -int get_dark_channel(const array2D &R, const array2D &G, const array2D &B, array2D &dst, int patchsize, const float ambient[3], bool clip, bool multithread) +int get_dark_channel(const array2D &R, const array2D &G, const array2D &B, array2D &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength) { const int W = R.width(); const int H = R.height(); @@ -73,22 +59,12 @@ int get_dark_channel(const array2D &R, const array2D &G, const arr for (int x = 0; x < W; x += patchsize) { float val = RT_INFINITY_F; const int pW = min(x + patchsize, W); - for (int yy = y; yy < pH; ++yy) { - for (int xx = x; xx < pW; ++xx) { - float r = R[yy][xx]; - float g = G[yy][xx]; - float b = B[yy][xx]; - if (ambient) { - r /= ambient[0]; - g /= ambient[1]; - b /= ambient[2]; - } - val = min(val, r, g, b); + for (int xx = x; xx < pW; ++xx) { + for (int yy = y; yy < pH; ++yy) { + val = min(val, R[yy][xx] / ambient[0], G[yy][xx] / ambient[1], B[yy][xx] / ambient[2]); } } - if (clip) { - val = LIM01(val); - } + val = 1.f - strength * LIM01(val); for (int yy = y; yy < pH; ++yy) { std::fill(dst[yy] + x, dst[yy] + pW, val); } @@ -98,41 +74,59 @@ int get_dark_channel(const array2D &R, const array2D &G, const arr return (W / patchsize + ((W % patchsize) > 0)) * (H / patchsize + ((H % patchsize) > 0)); } +int get_dark_channel_downsized(const array2D &R, const array2D &G, const array2D &B, array2D &dst, int patchsize, bool multithread) +{ + const int W = R.width(); + const int H = R.height(); + +#ifdef _OPENMP + #pragma omp parallel for if (multithread) +#endif + for (int y = 0; y < H; y += patchsize) { + int yy = y / patchsize; + const int pH = min(y + patchsize, H); + for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) { + float val = RT_INFINITY_F; + const int pW = min(x + patchsize, W); + for (int xp = x; xp < pW; ++xp) { + for (int yp = y; yp < pH; ++yp) { + val = min(val, R[yp][xp], G[yp][xp], B[yp][xp]); + } + } + dst[yy][xx] = val; + } + } + + return (W / patchsize + ((W % patchsize) > 0)) * (H / patchsize + ((H % patchsize) > 0)); +} + float estimate_ambient_light(const array2D &R, const array2D &G, const array2D &B, const array2D &dark, int patchsize, int npatches, float ambient[3]) { const int W = R.width(); const int H = R.height(); - const auto get_percentile = - [](std::priority_queue &q, float prcnt) -> float - { - size_t n = LIM(q.size() * prcnt, 1, q.size()); - while (q.size() > n) { - q.pop(); - } - return q.top(); - }; - float darklim = RT_INFINITY_F; { - std::priority_queue p; - for (int y = 0; y < H; y += patchsize) { - for (int x = 0; x < W; x += patchsize) { - if (!OOG(dark[y][x], 1.f - 1e-5f)) { - p.push(dark[y][x]); + std::vector p; + for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) { + for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) { + if (!OOG(dark[yy][xx], 1.f - 1e-5f)) { + p.push_back(dark[yy][xx]); } } } - darklim = get_percentile(p, 0.95); + const int pos = p.size() * 0.95; + std::nth_element(p.begin(), p.begin() + pos, p.end()); + darklim = p[pos]; } std::vector> patches; patches.reserve(npatches); - for (int y = 0; y < H; y += patchsize) { - for (int x = 0; x < W; x += patchsize) { - if (dark[y][x] >= darklim && !OOG(dark[y][x], 1.f)) { + for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) { + for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) { + if (dark[yy][xx] >= darklim && !OOG(dark[yy][xx], 1.f)) { patches.push_back(std::make_pair(x, y)); } } @@ -145,33 +139,38 @@ float estimate_ambient_light(const array2D &R, const array2D &G, c float bright_lim = RT_INFINITY_F; { - std::priority_queue l; + std::vector l; + l.reserve(patches.size() * patchsize * patchsize); - for (auto &p : patches) { - const int pW = min(p.first+patchsize, W); - const int pH = min(p.second+patchsize, H); + for (const auto &p : patches) { + const int pW = min(p.first + patchsize, W); + const int pH = min(p.second + patchsize, H); for (int y = p.second; y < pH; ++y) { for (int x = p.first; x < pW; ++x) { - l.push(R[y][x] + G[y][x] + B[y][x]); + l.push_back(R[y][x] + G[y][x] + B[y][x]); } } } - - bright_lim = get_percentile(l, 0.95); + const int pos = l.size() * 0.95; + std::nth_element(l.begin(), l.begin() + pos, l.end()); + bright_lim = l[pos]; } double rr = 0, gg = 0, bb = 0; int n = 0; - for (auto &p : patches) { - const int pW = min(p.first+patchsize, W); - const int pH = min(p.second+patchsize, H); +#ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) reduction(+:rr,gg,bb,n) +#endif + for (const auto &p : patches) { + const int pW = min(p.first + patchsize, W); + const int pH = min(p.second + patchsize, H); for (int y = p.second; y < pH; ++y) { for (int x = p.first; x < pW; ++x) { - float r = R[y][x]; - float g = G[y][x]; - float b = B[y][x]; + const float r = R[y][x]; + const float g = G[y][x]; + const float b = B[y][x]; if (r + g + b >= bright_lim) { rr += r; gg += g; @@ -181,6 +180,7 @@ float estimate_ambient_light(const array2D &R, const array2D &G, c } } } + n = std::max(n, 1); ambient[0] = rr / n; ambient[1] = gg / n; @@ -211,12 +211,12 @@ void extract_channels(Imagefloat *img, array2D &r, array2D &g, arr void ImProcFunctions::dehaze(Imagefloat *img) { - if (!params->dehaze.enabled) { + if (!params->dehaze.enabled || params->dehaze.strength == 0.0) { return; } - +BENCHFUN img->normalizeFloatTo1(); - + const int W = img->getWidth(); const int H = img->getHeight(); const float strength = LIM01(float(params->dehaze.strength) / 100.f * 0.9f); @@ -229,21 +229,19 @@ void ImProcFunctions::dehaze(Imagefloat *img) int patchsize = max(int(5 / scale), 2); float ambient[3]; - array2D &t_tilde = dark; float max_t = 0.f; { - int npatches = 0; array2D R(W, H); array2D G(W, H); array2D B(W, H); extract_channels(img, R, G, B, patchsize, 1e-1, multiThread); - - patchsize = max(max(W, H) / 600, 2); - npatches = get_dark_channel(R, G, B, dark, patchsize, nullptr, false, multiThread); - DEBUG_DUMP(dark); - max_t = estimate_ambient_light(R, G, B, dark, patchsize, npatches, ambient); + patchsize = max(max(W, H) / 600, 2); + array2D darkDownsized(W / patchsize + 1, H / patchsize + 1); + const int npatches = get_dark_channel_downsized(R, G, B, darkDownsized, patchsize, multiThread); + + max_t = estimate_ambient_light(R, G, B, darkDownsized, patchsize, npatches, ambient); if (options.rtSettings.verbose) { std::cout << "dehaze: ambient light is " @@ -251,78 +249,102 @@ void ImProcFunctions::dehaze(Imagefloat *img) << std::endl; } - get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread); - } - - if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) { - if (options.rtSettings.verbose) { - std::cout << "dehaze: no haze detected" << std::endl; + if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) { + if (options.rtSettings.verbose) { + std::cout << "dehaze: no haze detected" << std::endl; + } + img->normalizeFloatTo65535(); + return; // probably no haze at all } - img->normalizeFloatTo65535(); - return; // probably no haze at all - } - DEBUG_DUMP(t_tilde); - -#ifdef _OPENMP - #pragma omp parallel for if (multiThread) -#endif - for (int y = 0; y < H; ++y) { - for (int x = 0; x < W; ++x) { - dark[y][x] = 1.f - strength * dark[y][x]; - } + get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread, strength); } const int radius = patchsize * 4; - const float epsilon = 1e-5; - array2D &t = t_tilde; + constexpr float epsilon = 1e-5f; { array2D guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE); - guidedFilter(guideB, t_tilde, t, radius, epsilon, multiThread); + guidedFilter(guideB, dark, dark, radius, epsilon, multiThread); } - DEBUG_DUMP(t); - if (options.rtSettings.verbose) { std::cout << "dehaze: max distance is " << max_t << std::endl; } - float depth = -float(params->dehaze.depth) / 100.f; + const float depth = -float(params->dehaze.depth) / 100.f; const float t0 = max(1e-3f, std::exp(depth * max_t)); const float teps = 1e-3f; #ifdef _OPENMP #pragma omp parallel for if (multiThread) #endif for (int y = 0; y < H; ++y) { - for (int x = 0; x < W; ++x) { + int x = 0; +#ifdef __SSE2__ + const vfloat onev = F2V(1.f); + const vfloat ambient0v = F2V(ambient[0]); + const vfloat ambient1v = F2V(ambient[1]); + const vfloat ambient2v = F2V(ambient[2]); + const vfloat t0v = F2V(t0); + const vfloat tepsv = F2V(teps); + const vfloat c65535v = F2V(65535.f); + for (; x < W - 3; x += 4) { // ensure that the transmission is such that to avoid clipping... - float rgb[3] = { img->r(y, x), img->g(y, x), img->b(y, x) }; + vfloat r = LVFU(img->r(y, x)); + vfloat g = LVFU(img->g(y, x)); + vfloat b = LVFU(img->b(y, x)); // ... t >= tl to avoid negative values - float tl = 1.f - min(rgb[0]/ambient[0], rgb[1]/ambient[1], rgb[2]/ambient[2]); + const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v)); // ... t >= tu to avoid values > 1 - float tu = t0 - teps; - for (int c = 0; c < 3; ++c) { - if (ambient[c] < 1) { - tu = max(tu, (rgb[c] - ambient[c])/(1.f - ambient[c])); - } - } - float mt = max(t[y][x], t0, tl + teps, tu + teps); - if (params->dehaze.showDepthMap) { - img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt); - } else { - float r = (rgb[0] - ambient[0]) / mt + ambient[0]; - float g = (rgb[1] - ambient[1]) / mt + ambient[1]; - float b = (rgb[2] - ambient[2]) / mt + ambient[2]; + r -= ambient0v; + g -= ambient1v; + b -= ambient2v; - img->r(y, x) = r; - img->g(y, x) = g; - img->b(y, x) = b; + vfloat tuv = t0v - tepsv; + tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, r / (onev - ambient0v)), tuv); + tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, g / (onev - ambient1v)), tuv); + tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, b / (onev - ambient2v)), tuv); + + const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv, tuv) + tepsv); + if (params->dehaze.showDepthMap) { + const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * c65535v; + STVFU(img->r(y, x), valv); + STVFU(img->g(y, x), valv); + STVFU(img->b(y, x), valv); + } else { + STVFU(img->r(y, x), (r / mtv + ambient0v) * c65535v); + STVFU(img->g(y, x), (g / mtv + ambient1v) * c65535v); + STVFU(img->b(y, x), (b / mtv + ambient2v) * c65535v); + } + } +#endif + for (; x < W; ++x) { + // ensure that the transmission is such that to avoid clipping... + float r = img->r(y, x); + float g = img->g(y, x); + float b = img->b(y, x); + // ... t >= tl to avoid negative values + const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]); + // ... t >= tu to avoid values > 1 + r -= ambient[0]; + g -= ambient[1]; + b -= ambient[2]; + + float tu = t0 - teps; + tu = ambient[0] < 1.f ? max(tu, r / (1.f - ambient[0])) : tu; + tu = ambient[1] < 1.f ? max(tu, g / (1.f - ambient[1])) : tu; + tu = ambient[2] < 1.f ? max(tu, b / (1.f - ambient[2])) : tu; + + const float mt = max(dark[y][x], tl + teps, tu + teps); + if (params->dehaze.showDepthMap) { + img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f; + } else { + img->r(y, x) = (r / mt + ambient[0]) * 65535.f; + img->g(y, x) = (g / mt + ambient[1]) * 65535.f; + img->b(y, x) = (b / mt + ambient[2]) * 65535.f; } } } - - img->normalizeFloatTo65535(); }