Dehaze: further speedup, #5456

This commit is contained in:
Ingo Weyrich 2019-09-19 20:56:33 +02:00
parent 991fc94d89
commit 3ab379ad0a
3 changed files with 442 additions and 260 deletions

View File

@ -204,15 +204,15 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0][col], tempv);
STVFU( dst[0][col + 4], temp1v);
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
@ -221,16 +221,16 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row][col], tempv);
STVFU( dst[row][col + 4], temp1v);
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
@ -312,6 +312,221 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
}
/**
 * Box blur with a square window of (2 * radius + 1) x (2 * radius + 1) pixels,
 * using small per-thread line/row buffers instead of a full size temporary image.
 *
 * The blur is separable: a horizontal running mean is written to dst first,
 * then a vertical running mean is applied to dst in place. Near the borders the
 * window is truncated and the mean is taken over the pixels actually covered.
 *
 * src and dst may be the same image: the radius == 0 path checks for it, and
 * both passes keep the original values they still need in the buffer.
 *
 * @param src         input image as array of H row pointers with W floats each
 * @param dst         output image, same layout as src
 * @param radius      box radius; 0 degenerates to a plain copy
 * @param W           image width
 * @param H           image height
 * @param multiThread if true and OpenMP is available, the work is spread over threads
 *
 * Note: the code reads columns/rows up to index 2 * radius without bounds checks,
 * so the caller has to ensure 2 * radius < W and 2 * radius < H.
 */
inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
{
    //box blur using rowbuffers and linebuffers instead of a full size buffer
    if (radius == 0) {
        // nothing to blur; only copy when source and destination differ
        if (src != dst) {
#ifdef _OPENMP
            #pragma omp parallel for if (multiThread)
#endif
            for (int row = 0; row < H; row++) {
                for (int col = 0; col < W; col++) {
                    dst[row][col] = src[row][col];
                }
            }
        }
        return;
    }

    constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache

#ifdef _OPENMP
    #pragma omp parallel if (multiThread)
#endif
    {
        // one buffer per thread, used as line buffer (W floats) in the horizontal pass
        // and as row buffer (H rows of 8 floats) in the vertical pass
        float* const buffer = new float[std::max(W, 8 * H)];

        //horizontal blur
        float* const lineBuffer = buffer;
#ifdef _OPENMP
        #pragma omp for
#endif
        for (int row = 0; row < H; row++) {
            float len = radius + 1;
            float tempval = src[row][0];
            lineBuffer[0] = tempval;
            for (int j = 1; j <= radius; j++) {
                tempval += src[row][j];
            }
            tempval /= len;
            dst[row][0] = tempval;
            // left border: window grows by one pixel per column
            for (int col = 1; col <= radius; col++) {
                lineBuffer[col] = src[row][col];
                dst[row][col] = tempval = (tempval * len + src[row][col + radius]) / (len + 1);
                len ++;
            }
            // interior: full window, add entering pixel and remove leaving pixel
            for (int col = radius + 1; col < W - radius; col++) {
                lineBuffer[col] = src[row][col];
                dst[row][col] = tempval = tempval + (src[row][col + radius] - lineBuffer[col - radius - 1]) / len;
            }
            // right border: window shrinks by one pixel per column
            for (int col = W - radius; col < W; col++) {
                dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1);
                len --;
            }
        }

        //vertical blur
#ifdef __SSE2__
        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer;
        vfloat leninitv = F2V(radius + 1);
        vfloat onev = F2V(1.f);
        vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
        #pragma omp for nowait
#endif
        for (int col = 0; col < W - 7; col += 8) {
            lenv = leninitv;
            tempv = LVFU(dst[0][col]);
            temp1v = LVFU(dst[0][col + 4]);
            rowBuffer[0][0] = tempv;
            rowBuffer[0][1] = temp1v;
            for (int i = 1; i <= radius; i++) {
                tempv = tempv + LVFU(dst[i][col]);
                temp1v = temp1v + LVFU(dst[i][col + 4]);
            }
            tempv = tempv / lenv;
            temp1v = temp1v / lenv;
            STVFU(dst[0][col], tempv);
            STVFU(dst[0][col + 4], temp1v);
            for (int row = 1; row <= radius; row++) {
                rowBuffer[row][0] = LVFU(dst[row][col]);
                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
                lenp1v = lenv + onev;
                tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
                temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
                STVFU(dst[row][col], tempv);
                STVFU(dst[row][col + 4], temp1v);
                lenv = lenp1v;
            }
            rlenv = onev / lenv;
            for (int row = radius + 1; row < H - radius; row++) {
                rowBuffer[row][0] = LVFU(dst[row][col]);
                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
                tempv = tempv + (LVFU(dst[row + radius][col]) - rowBuffer[row - radius - 1][0]) * rlenv ;
                temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - rowBuffer[row - radius - 1][1]) * rlenv ;
                STVFU(dst[row][col], tempv);
                STVFU(dst[row][col + 4], temp1v);
            }
            for (int row = H - radius; row < H; row++) {
                lenm1v = lenv - onev;
                tempv = (tempv * lenv - rowBuffer[row - radius - 1][0]) / lenm1v;
                temp1v = (temp1v * lenv - rowBuffer[row - radius - 1][1]) / lenm1v;
                STVFU(dst[row][col], tempv);
                STVFU(dst[row][col + 4], temp1v);
                lenv = lenm1v;
            }
        }
#else
        float (* const rowBuffer)[8] = (float(*)[8]) buffer;
#ifdef _OPENMP
        #pragma omp for nowait
#endif
        for (int col = 0; col < W - numCols + 1; col += 8) {
            float len = radius + 1;
            for(int k = 0; k < numCols; k++) {
                rowBuffer[0][k] = dst[0][col + k];
            }
            for (int i = 1; i <= radius; i++) {
                for(int k = 0; k < numCols; k++) {
                    dst[0][col + k] += dst[i][col + k];
                }
            }
            for(int k = 0; k < numCols; k++) {
                dst[0][col + k] /= len;
            }
            for (int row = 1; row <= radius; row++) {
                for(int k = 0; k < numCols; k++) {
                    rowBuffer[row][k] = dst[row][col + k];
                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
                }
                len ++;
            }
            for (int row = radius + 1; row < H - radius; row++) {
                for(int k = 0; k < numCols; k++) {
                    rowBuffer[row][k] = dst[row][col + k];
                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) / len;
                }
            }
            for (int row = H - radius; row < H; row++) {
                for(int k = 0; k < numCols; k++) {
                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
                }
                len --;
            }
        }
#endif

        //vertical blur, remaining columns
#ifdef _OPENMP
        #pragma omp single
#endif
        {
            const int remaining = W % numCols;
            if (remaining > 0) {
                float (* const rowBuffer)[8] = (float(*)[8]) buffer;
                const int col = W - remaining;
                float len = radius + 1;
                for(int k = 0; k < remaining; k++) {
                    rowBuffer[0][k] = dst[0][col + k];
                }
                for (int i = 1; i <= radius; i++) {
                    for(int k = 0; k < remaining; k++) {
                        dst[0][col + k] += dst[i][col + k];
                    }
                }
                for(int k = 0; k < remaining; k++) {
                    dst[0][col + k] /= len;
                }
                for (int row = 1; row <= radius; row++) {
                    for(int k = 0; k < remaining; k++) {
                        rowBuffer[row][k] = dst[row][col + k];
                        dst[row][col + k] = (dst[(row - 1)][col + k] * len + dst[row + radius][col + k]) / (len + 1);
                    }
                    // len is the window length of the whole row, so it grows once per row,
                    // not once per column (same as in the 8-column path above)
                    len ++;
                }
                const float rlen = 1.f / len;
                for (int row = radius + 1; row < H - radius; row++) {
                    for(int k = 0; k < remaining; k++) {
                        rowBuffer[row][k] = dst[row][col + k];
                        dst[row][col + k] = dst[(row - 1)][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
                    }
                }
                for (int row = H - radius; row < H; row++) {
                    for(int k = 0; k < remaining; k++) {
                        dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
                    }
                    // window shrinks once per row, not once per column
                    len --;
                }
            }
        }

        delete [] buffer;
    }
}
template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
{
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@ -382,15 +597,15 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU( dst[0 * W + col], tempv);
STVFU( dst[0 * W + col + 4], temp1v);
STVFU(dst[0 * W + col], tempv);
STVFU(dst[0 * W + col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenp1v;
}
@ -399,16 +614,16 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU( dst[row * W + col], tempv);
STVFU( dst[row * W + col + 4], temp1v);
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenm1v;
}
}
@ -422,12 +637,12 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
}
tempv = tempv / lenv;
STVFU( dst[0 * W + col], tempv);
STVFU(dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
lenv = lenp1v;
}
@ -435,13 +650,13 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
STVFU( dst[row * W + col], tempv);
STVFU(dst[row * W + col], tempv);
lenv = lenm1v;
}
}

View File

@ -3,6 +3,7 @@
* This file is part of RawTherapee.
*
* Copyright (c) 2018 Alberto Griggio <alberto.griggio@gmail.com>
* Optimized 2019 Ingo Weyrich <heckflosse67@gmx.de>
*
* RawTherapee is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -16,9 +17,9 @@
*
* You should have received a copy of the GNU General Public License
* along with RawTherapee. If not, see <https://www.gnu.org/licenses/>.
*/
*/
/**
/*
* This is a Fast Guided Filter implementation, derived directly from the
* pseudo-code of the paper:
*
@ -26,32 +27,16 @@
* by Kaiming He, Jian Sun
*
* available at https://arxiv.org/abs/1505.00996
*/
*/
#include "guidedfilter.h"
#include "boxblur.h"
#include "rescale.h"
#include "imagefloat.h"
#define BENCHMARK
#include "StopWatch.h"
namespace rtengine {
#if 0
# define DEBUG_DUMP(arr) \
do { \
Imagefloat im(arr.width(), arr.height()); \
const char *out = "/tmp/" #arr ".tif"; \
for (int y = 0; y < im.getHeight(); ++y) { \
for (int x = 0; x < im.getWidth(); ++x) { \
im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
} \
} \
im.saveTIFF(out, 16); \
} while (false)
#else
# define DEBUG_DUMP(arr)
#endif
namespace {
int calculate_subsampling(int w, int h, int r)
@ -78,15 +63,7 @@ int calculate_subsampling(int w, int h, int r)
void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2D<float> &dst, int r, float epsilon, bool multithread, int subsampling)
{
const int W = src.width();
const int H = src.height();
if (subsampling <= 0) {
subsampling = calculate_subsampling(W, H, r);
}
enum Op { MUL, DIVEPSILON, ADD, SUB, ADDMUL, SUBMUL };
enum Op {MUL, DIVEPSILON, SUBMUL};
const auto apply =
[=](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
@ -99,139 +76,107 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
#endif
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
float r;
float aa = a[y][x];
float bb = b[y][x];
switch (op) {
case MUL:
r = aa * bb;
break;
case DIVEPSILON:
r = aa / (bb + epsilon);
break;
case ADD:
r = aa + bb;
break;
case SUB:
r = aa - bb;
break;
case ADDMUL:
r = aa * bb + c[y][x];
break;
case SUBMUL:
r = c[y][x] - (aa * bb);
break;
default:
assert(false);
r = 0;
break;
case MUL:
res[y][x] = a[y][x] * b[y][x];
break;
case DIVEPSILON:
res[y][x] = a[y][x] / (b[y][x] + epsilon); // note: the value of epsilon intentionally has an impact on the result. It is not only to avoid divisions by zero
break;
case SUBMUL:
res[y][x] = c[y][x] - (a[y][x] * b[y][x]);
break;
default:
assert(false);
res[y][x] = 0;
break;
}
res[y][x] = r;
}
}
};
// use the terminology of the paper (Algorithm 2)
const array2D<float> &I = guide;
const array2D<float> &p = src;
array2D<float> &q = dst;
const auto f_subsample =
[=](array2D<float> &d, const array2D<float> &s) -> void
{
rescaleBilinear(s, d, multithread);
};
const auto f_upsample = f_subsample;
const size_t w = W / subsampling;
const size_t h = H / subsampling;
AlignedBuffer<float> blur_buf(w * h);
const auto f_mean =
[&](array2D<float> &d, array2D<float> &s, int rad) -> void
{
rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
float **src = s;
float **dst = d;
#ifdef _OPENMP
#pragma omp parallel if (multithread)
#endif
boxblur<float, float>(src, dst, blur_buf.data, rad, rad, s.width(), s.height());
boxblur(s, d, rad, s.width(), s.height(), multithread);
};
const int W = src.width();
const int H = src.height();
if (subsampling <= 0) {
subsampling = calculate_subsampling(W, H, r);
}
const size_t w = W / subsampling;
const size_t h = H / subsampling;
const float r1 = float(r) / subsampling;
array2D<float> I1(w, h);
array2D<float> p1(w, h);
f_subsample(I1, I);
f_subsample(p1, p);
f_subsample(I1, guide);
DEBUG_DUMP(I);
DEBUG_DUMP(p);
DEBUG_DUMP(I1);
DEBUG_DUMP(p1);
if (&guide == &src) {
f_mean(p1, I1, r1);
float r1 = float(r) / subsampling;
apply(MUL, I1, I1, I1); // I1 = I1 * I1
array2D<float> meanI(w, h);
f_mean(meanI, I1, r1);
DEBUG_DUMP(meanI);
f_mean(I1, I1, r1);
array2D<float> meanp(w, h);
f_mean(meanp, p1, r1);
DEBUG_DUMP(meanp);
apply(SUBMUL, I1, p1, p1, I1); // I1 = I1 - p1 * p1
apply(DIVEPSILON, I1, I1, I1); // I1 = I1 / (I1 + epsilon)
apply(SUBMUL, p1, I1, p1, p1); // p1 = p1 - I1 * p1
array2D<float> &corrIp = p1;
apply(MUL, corrIp, I1, p1);
f_mean(corrIp, corrIp, r1);
DEBUG_DUMP(corrIp);
} else {
f_subsample(p1, src);
array2D<float> &corrI = I1;
apply(MUL, corrI, I1, I1);
f_mean(corrI, corrI, r1);
DEBUG_DUMP(corrI);
array2D<float> meanI(w, h);
f_mean(meanI, I1, r1);
array2D<float> &varI = corrI;
apply(SUBMUL, varI, meanI, meanI, corrI);
DEBUG_DUMP(varI);
array2D<float> meanp(w, h);
f_mean(meanp, p1, r1);
array2D<float> &covIp = corrIp;
apply(SUBMUL, covIp, meanI, meanp, corrIp);
DEBUG_DUMP(covIp);
apply(MUL, p1, I1, p1);
array2D<float> &a = varI;
apply(DIVEPSILON, a, covIp, varI);
DEBUG_DUMP(a);
f_mean(p1, p1, r1);
array2D<float> &b = covIp;
apply(SUBMUL, b, a, meanI, meanp);
DEBUG_DUMP(b);
apply(MUL, I1, I1, I1);
array2D<float> &meana = a;
f_mean(meana, a, r1);
DEBUG_DUMP(meana);
f_mean(I1, I1, r1);
array2D<float> &meanb = b;
f_mean(meanb, b, r1);
DEBUG_DUMP(meanb);
apply(SUBMUL, I1, meanI, meanI, I1);
apply(SUBMUL, p1, meanI, meanp, p1);
apply(DIVEPSILON, I1, p1, I1);
apply(SUBMUL, p1, I1, meanI, meanp);
}
const int Ws = meana.width();
const int Hs = meana.height();
const int Wd = q.width();
const int Hd = q.height();
f_mean(I1, I1, r1);
f_mean(p1, p1, r1);
float col_scale = float (Ws) / float (Wd);
float row_scale = float (Hs) / float (Hd);
const int Ws = I1.width();
const int Hs = I1.height();
const int Wd = dst.width();
const int Hd = dst.height();
const float col_scale = static_cast<float>(Ws) / static_cast<float>(Wd);
const float row_scale = static_cast<float>(Hs) / static_cast<float>(Hd);
#ifdef _OPENMP
#pragma omp parallel for if (multithread)
#endif
for (int y = 0; y < Hd; ++y) {
float ymrs = y * row_scale;
const float ymrs = y * row_scale;
for (int x = 0; x < Wd; ++x) {
q[y][x] = getBilinearValue(meana, x * col_scale, ymrs) * I[y][x] + getBilinearValue(meanb, x * col_scale, ymrs);
dst[y][x] = getBilinearValue(I1, x * col_scale, ymrs) * guide[y][x] + getBilinearValue(p1, x * col_scale, ymrs);
}
}
}

View File

@ -35,7 +35,10 @@
#include "improcfun.h"
#include "procparams.h"
#include "rt_algo.h"
#include "rt_algo.h"
#include "rt_math.h"
#define BENCHMARK
#include "StopWatch.h"
extern Options options;
@ -43,24 +46,7 @@ namespace rtengine {
namespace {
#if 0
# define DEBUG_DUMP(arr) \
do { \
Imagefloat im(arr.width(), arr.height()); \
const char *out = "/tmp/" #arr ".tif"; \
for (int y = 0; y < im.getHeight(); ++y) { \
for (int x = 0; x < im.getWidth(); ++x) { \
im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
} \
} \
im.saveTIFF(out, 16); \
} while (false)
#else
# define DEBUG_DUMP(arr)
#endif
int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread)
int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
{
const int W = R.width();
const int H = R.height();
@ -73,22 +59,12 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
for (int x = 0; x < W; x += patchsize) {
float val = RT_INFINITY_F;
const int pW = min(x + patchsize, W);
for (int yy = y; yy < pH; ++yy) {
for (int xx = x; xx < pW; ++xx) {
float r = R[yy][xx];
float g = G[yy][xx];
float b = B[yy][xx];
if (ambient) {
r /= ambient[0];
g /= ambient[1];
b /= ambient[2];
}
val = min(val, r, g, b);
for (int xx = x; xx < pW; ++xx) {
for (int yy = y; yy < pH; ++yy) {
val = min(val, R[yy][xx] / ambient[0], G[yy][xx] / ambient[1], B[yy][xx] / ambient[2]);
}
}
if (clip) {
val = LIM01(val);
}
val = 1.f - strength * LIM01(val);
for (int yy = y; yy < pH; ++yy) {
std::fill(dst[yy] + x, dst[yy] + pW, val);
}
@ -98,41 +74,59 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
return (W / patchsize + ((W % patchsize) > 0)) * (H / patchsize + ((H % patchsize) > 0));
}
// Computes a downsized dark channel: the image is cut into tiles of
// patchsize x patchsize pixels (smaller at the right/bottom edges) and for each
// tile the minimum over all its R, G and B values is stored in one element of dst,
// indexed by tile row and tile column.
// Returns the number of tiles (tiles per row times tiles per column).
int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, bool multithread)
{
    const int width = R.width();
    const int height = R.height();

#ifdef _OPENMP
    #pragma omp parallel for if (multithread)
#endif
    for (int top = 0; top < height; top += patchsize) {
        const int bottom = min(top + patchsize, height);
        const int tileRow = top / patchsize;
        int tileCol = 0;

        for (int left = 0; left < width; left += patchsize) {
            const int right = min(left + patchsize, width);

            // minimum of all three channels inside the current tile
            float darkest = RT_INFINITY_F;
            for (int px = left; px < right; ++px) {
                for (int py = top; py < bottom; ++py) {
                    darkest = min(darkest, R[py][px], G[py][px], B[py][px]);
                }
            }

            dst[tileRow][tileCol] = darkest;
            ++tileCol;
        }
    }

    // partially covered tiles at the right and bottom edge count as full tiles
    const int tilesAcross = width / patchsize + ((width % patchsize) > 0);
    const int tilesDown = height / patchsize + ((height % patchsize) > 0);
    return tilesAcross * tilesDown;
}
float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dark, int patchsize, int npatches, float ambient[3])
{
const int W = R.width();
const int H = R.height();
const auto get_percentile =
[](std::priority_queue<float> &q, float prcnt) -> float
{
size_t n = LIM<size_t>(q.size() * prcnt, 1, q.size());
while (q.size() > n) {
q.pop();
}
return q.top();
};
float darklim = RT_INFINITY_F;
{
std::priority_queue<float> p;
for (int y = 0; y < H; y += patchsize) {
for (int x = 0; x < W; x += patchsize) {
if (!OOG(dark[y][x], 1.f - 1e-5f)) {
p.push(dark[y][x]);
std::vector<float> p;
for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
if (!OOG(dark[yy][xx], 1.f - 1e-5f)) {
p.push_back(dark[yy][xx]);
}
}
}
darklim = get_percentile(p, 0.95);
const int pos = p.size() * 0.95;
std::nth_element(p.begin(), p.begin() + pos, p.end());
darklim = p[pos];
}
std::vector<std::pair<int, int>> patches;
patches.reserve(npatches);
for (int y = 0; y < H; y += patchsize) {
for (int x = 0; x < W; x += patchsize) {
if (dark[y][x] >= darklim && !OOG(dark[y][x], 1.f)) {
for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
if (dark[yy][xx] >= darklim && !OOG(dark[yy][xx], 1.f)) {
patches.push_back(std::make_pair(x, y));
}
}
@ -145,33 +139,38 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
float bright_lim = RT_INFINITY_F;
{
std::priority_queue<float> l;
std::vector<float> l;
l.reserve(patches.size() * patchsize * patchsize);
for (auto &p : patches) {
const int pW = min(p.first+patchsize, W);
const int pH = min(p.second+patchsize, H);
for (const auto &p : patches) {
const int pW = min(p.first + patchsize, W);
const int pH = min(p.second + patchsize, H);
for (int y = p.second; y < pH; ++y) {
for (int x = p.first; x < pW; ++x) {
l.push(R[y][x] + G[y][x] + B[y][x]);
l.push_back(R[y][x] + G[y][x] + B[y][x]);
}
}
}
bright_lim = get_percentile(l, 0.95);
const int pos = l.size() * 0.95;
std::nth_element(l.begin(), l.begin() + pos, l.end());
bright_lim = l[pos];
}
double rr = 0, gg = 0, bb = 0;
int n = 0;
for (auto &p : patches) {
const int pW = min(p.first+patchsize, W);
const int pH = min(p.second+patchsize, H);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic) reduction(+:rr,gg,bb,n)
#endif
for (const auto &p : patches) {
const int pW = min(p.first + patchsize, W);
const int pH = min(p.second + patchsize, H);
for (int y = p.second; y < pH; ++y) {
for (int x = p.first; x < pW; ++x) {
float r = R[y][x];
float g = G[y][x];
float b = B[y][x];
const float r = R[y][x];
const float g = G[y][x];
const float b = B[y][x];
if (r + g + b >= bright_lim) {
rr += r;
gg += g;
@ -181,6 +180,7 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
}
}
}
n = std::max(n, 1);
ambient[0] = rr / n;
ambient[1] = gg / n;
@ -211,12 +211,12 @@ void extract_channels(Imagefloat *img, array2D<float> &r, array2D<float> &g, arr
void ImProcFunctions::dehaze(Imagefloat *img)
{
if (!params->dehaze.enabled) {
if (!params->dehaze.enabled || params->dehaze.strength == 0.0) {
return;
}
BENCHFUN
img->normalizeFloatTo1();
const int W = img->getWidth();
const int H = img->getHeight();
const float strength = LIM01(float(params->dehaze.strength) / 100.f * 0.9f);
@ -229,21 +229,19 @@ void ImProcFunctions::dehaze(Imagefloat *img)
int patchsize = max(int(5 / scale), 2);
float ambient[3];
array2D<float> &t_tilde = dark;
float max_t = 0.f;
{
int npatches = 0;
array2D<float> R(W, H);
array2D<float> G(W, H);
array2D<float> B(W, H);
extract_channels(img, R, G, B, patchsize, 1e-1, multiThread);
patchsize = max(max(W, H) / 600, 2);
npatches = get_dark_channel(R, G, B, dark, patchsize, nullptr, false, multiThread);
DEBUG_DUMP(dark);
max_t = estimate_ambient_light(R, G, B, dark, patchsize, npatches, ambient);
patchsize = max(max(W, H) / 600, 2);
array2D<float> darkDownsized(W / patchsize + 1, H / patchsize + 1);
const int npatches = get_dark_channel_downsized(R, G, B, darkDownsized, patchsize, multiThread);
max_t = estimate_ambient_light(R, G, B, darkDownsized, patchsize, npatches, ambient);
if (options.rtSettings.verbose) {
std::cout << "dehaze: ambient light is "
@ -251,78 +249,102 @@ void ImProcFunctions::dehaze(Imagefloat *img)
<< std::endl;
}
get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread);
}
if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
if (options.rtSettings.verbose) {
std::cout << "dehaze: no haze detected" << std::endl;
if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
if (options.rtSettings.verbose) {
std::cout << "dehaze: no haze detected" << std::endl;
}
img->normalizeFloatTo65535();
return; // probably no haze at all
}
img->normalizeFloatTo65535();
return; // probably no haze at all
}
DEBUG_DUMP(t_tilde);
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
dark[y][x] = 1.f - strength * dark[y][x];
}
get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread, strength);
}
const int radius = patchsize * 4;
const float epsilon = 1e-5;
array2D<float> &t = t_tilde;
constexpr float epsilon = 1e-5f;
{
array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
guidedFilter(guideB, t_tilde, t, radius, epsilon, multiThread);
guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
}
DEBUG_DUMP(t);
if (options.rtSettings.verbose) {
std::cout << "dehaze: max distance is " << max_t << std::endl;
}
float depth = -float(params->dehaze.depth) / 100.f;
const float depth = -float(params->dehaze.depth) / 100.f;
const float t0 = max(1e-3f, std::exp(depth * max_t));
const float teps = 1e-3f;
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
int x = 0;
#ifdef __SSE2__
const vfloat onev = F2V(1.f);
const vfloat ambient0v = F2V(ambient[0]);
const vfloat ambient1v = F2V(ambient[1]);
const vfloat ambient2v = F2V(ambient[2]);
const vfloat t0v = F2V(t0);
const vfloat tepsv = F2V(teps);
const vfloat c65535v = F2V(65535.f);
for (; x < W - 3; x += 4) {
// ensure that the transmission is such that to avoid clipping...
float rgb[3] = { img->r(y, x), img->g(y, x), img->b(y, x) };
vfloat r = LVFU(img->r(y, x));
vfloat g = LVFU(img->g(y, x));
vfloat b = LVFU(img->b(y, x));
// ... t >= tl to avoid negative values
float tl = 1.f - min(rgb[0]/ambient[0], rgb[1]/ambient[1], rgb[2]/ambient[2]);
const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
// ... t >= tu to avoid values > 1
float tu = t0 - teps;
for (int c = 0; c < 3; ++c) {
if (ambient[c] < 1) {
tu = max(tu, (rgb[c] - ambient[c])/(1.f - ambient[c]));
}
}
float mt = max(t[y][x], t0, tl + teps, tu + teps);
if (params->dehaze.showDepthMap) {
img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt);
} else {
float r = (rgb[0] - ambient[0]) / mt + ambient[0];
float g = (rgb[1] - ambient[1]) / mt + ambient[1];
float b = (rgb[2] - ambient[2]) / mt + ambient[2];
r -= ambient0v;
g -= ambient1v;
b -= ambient2v;
img->r(y, x) = r;
img->g(y, x) = g;
img->b(y, x) = b;
vfloat tuv = t0v - tepsv;
tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, r / (onev - ambient0v)), tuv);
tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, g / (onev - ambient1v)), tuv);
tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, b / (onev - ambient2v)), tuv);
const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv, tuv) + tepsv);
if (params->dehaze.showDepthMap) {
const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * c65535v;
STVFU(img->r(y, x), valv);
STVFU(img->g(y, x), valv);
STVFU(img->b(y, x), valv);
} else {
STVFU(img->r(y, x), (r / mtv + ambient0v) * c65535v);
STVFU(img->g(y, x), (g / mtv + ambient1v) * c65535v);
STVFU(img->b(y, x), (b / mtv + ambient2v) * c65535v);
}
}
#endif
for (; x < W; ++x) {
// ensure that the transmission is such that to avoid clipping...
float r = img->r(y, x);
float g = img->g(y, x);
float b = img->b(y, x);
// ... t >= tl to avoid negative values
const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]);
// ... t >= tu to avoid values > 1
r -= ambient[0];
g -= ambient[1];
b -= ambient[2];
float tu = t0 - teps;
tu = ambient[0] < 1.f ? max(tu, r / (1.f - ambient[0])) : tu;
tu = ambient[1] < 1.f ? max(tu, g / (1.f - ambient[1])) : tu;
tu = ambient[2] < 1.f ? max(tu, b / (1.f - ambient[2])) : tu;
const float mt = max(dark[y][x], tl + teps, tu + teps);
if (params->dehaze.showDepthMap) {
img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
} else {
img->r(y, x) = (r / mt + ambient[0]) * 65535.f;
img->g(y, x) = (g / mt + ambient[1]) * 65535.f;
img->b(y, x) = (b / mt + ambient[2]) * 65535.f;
}
}
}
img->normalizeFloatTo65535();
}