/*
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2004-2010 Gabor Horvath
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */
#include "shmap.h"
#include "gauss.h"
#include "imagefloat.h"
#include "rtengine.h"
#include "rt_math.h"
#include "rawimagesource.h"
#include "sleef.c"
#include "jaggedarray.h"
#undef THREAD_PRIORITY_NORMAL
#include "opthelper.h"

namespace rtengine
{

namespace
{

/**
 * Scans a height x width jagged array and reports its minimum, maximum and mean.
 *
 * The search starts from min = 65535 and max = 0, so the reported minimum is never
 * above 65535 and the reported maximum is never below 0.
 * The sum is accumulated in double precision (error for 36 MPixel on a single core
 * was about 8% with float).
 *
 * @param data    rows of pixel values, indexed data[row][column]
 * @param width   number of columns in every row
 * @param height  number of rows
 * @param minOut  receives the smallest value found
 * @param maxOut  receives the largest value found
 * @param avgOut  receives the arithmetic mean (0 when the array holds no pixels)
 */
void computeMapStatistics (float** data, int width, int height, float& minOut, float& maxOut, float& avgOut)
{
    double sum = 0.0;
    float globalMin = 65535.f;
    float globalMax = 0.f;

#ifdef _OPENMP
    #pragma omp parallel
#endif
    {
        // per-thread extrema, merged below inside the critical section
        float localMin = 65535.f;
        float localMax = 0.f;

#ifdef _OPENMP
        #pragma omp for reduction(+:sum) schedule(dynamic,16) nowait
#endif

        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                const float val = data[i][j];

                if (val < localMin) {
                    localMin = val;
                }

                if (val > localMax) {
                    localMax = val;
                }

                sum += val;
            }
        }

#ifdef _OPENMP
        #pragma omp critical
#endif
        {
            if (localMin < globalMin) {
                globalMin = localMin;
            }

            if (localMax > globalMax) {
                globalMax = localMax;
            }
        }
    }

    // the pixel count is computed in double so that height * width cannot overflow an int
    const double pixelCount = static_cast<double>(height) * width;

    minOut = globalMin;
    maxOut = globalMax;
    avgOut = pixelCount > 0.0 ? sum / pixelCount : 0.0;
}

} // anonymous namespace

/**
 * Creates a map of h rows with w floats each. The values are left uninitialised
 * until update() or updateL() is called.
 */
SHMap::SHMap (int w, int h) : max_f(0.f), min_f(0.f), avg(0.f), W(w), H(h)
{

    map = new float*[H];

    for (int i = 0; i < H; i++) {
        map[i] = new float[W];
    }
}

/** Releases every row and then the row table allocated by the constructor. */
SHMap::~SHMap ()
{

    for (int i = 0; i < H; i++) {
        delete [] map[i];
    }

    delete [] map;
}

/**
 * Fills `luminance` with the weighted sum of the R, G and B channels of `img`.
 * Negative channel values are clamped to 0 before weighting.
 *
 * @param img        source image, read through r(i, j), g(i, j) and b(i, j)
 * @param luminance  destination, H rows of W floats
 * @param lumi       weights applied to R, G and B respectively
 */
void SHMap::fillLuminance( Imagefloat * img, float **luminance, double lumi[3] ) // fill with luminance
{

#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for (int i = 0; i < H; i++)
        for (int j = 0; j < W; j++) {
            luminance[i][j] = lumi[0] * std::max(img->r(i, j), 0.f) + lumi[1] * std::max(img->g(i, j), 0.f) + lumi[2] * std::max(img->b(i, j), 0.f);
        }

}

/**
 * Copies the channel `L` into `luminance`, clamping negative values to 0.
 *
 * @param L          source channel, H rows of W floats
 * @param luminance  destination, H rows of W floats
 */
void SHMap::fillLuminanceL( float ** L, float **luminance) // fill with luminance
{

#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for (int i = 0; i < H; i++)
        for (int j = 0; j < W; j++) {
            luminance[i][j] = std::max(L[i][j], 0.f) ;//we can put here some enhancements Gamma, compression data,...
        }

}

/**
 * Rebuilds the map from an RGB image and refreshes min_f, max_f and avg.
 *
 * @param img     source image
 * @param radius  blur radius; in hq mode it also sets the range threshold (100 * radius)
 * @param lumi    R, G, B weights used to compute the luminance
 * @param hq      false: gaussian (or box) blur of the luminance;
 *                true: multi-level edge-preserving filter (dirpyr_shmap)
 * @param skip    scale factor of the processed image; the hq filter adds levels
 *                while skip * scale < 16. Values below 1 are treated as 1.
 */
void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int skip)
{

    if (!hq) {
        fillLuminance( img, map, lumi);

        const bool useBoxBlur = radius > 40.0; // boxblur is less prone to artifacts for large radii

#ifdef _OPENMP
        #pragma omp parallel if (!useBoxBlur)
#endif
        {
            gaussianBlur (map, map, W, H, radius, useBoxBlur);
        }
    }

    else {

        //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        //experimental dirpyr shmap

        // a non-positive skip would make the level loops below run forever
        skip = std::max(skip, 1);

        float thresh = (100.f * radius); //1000;

        // set up range function
        // calculate size of Lookup table. That's possible because from a value k for all i>=k rangefn[i] will be exp(-10)
        // So we use this fact and the automatic clip of lut to reduce the size of lut and the number of calculations to fill the lut
        // In past this lut had only integer precision with rangefn[i] = 0 for all i>=k
        // We set the last element to a small epsilon 1e-15 instead of zero to avoid divisions by zero
        const int lutSize = thresh * sqrtf(10.f) + 1;
        thresh *= thresh;
        LUTf rangefn(lutSize);

        for (int i = 0; i < lutSize - 1; i++) {
            rangefn[i] = xexpf(-min(10.f, (static_cast<float>(i) * i) / thresh )); //*intfactor;
        }

        rangefn[lutSize - 1] = 1e-15f;

        // We need one temporary buffer
        JaggedArray<float> buffer (W, H);

        // the final result has to be in map
        // for an even number of levels that means: map => buffer, buffer => map
        // for an odd number of levels that means: buffer => map, map => buffer, buffer => map
        // so let's calculate the number of levels first
        // There are at least two levels
        int numLevels = 2;
        int scale = 2;

        while (skip * scale < 16) {
            scale *= 2;
            numLevels++;
        }

        float ** dirpyrlo[2];

        if(numLevels & 1) { // odd number of levels, start with buffer
            dirpyrlo[0] = buffer;
            dirpyrlo[1] = map;
        } else {            // even number of levels, start with map
            dirpyrlo[0] = map;
            dirpyrlo[1] = buffer;
        }

        fillLuminance( img, dirpyrlo[0], lumi);

        scale = 1;
        int level = 0;
        int indx = 0;
        dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
        scale *= 2;
        level ++;
        indx = 1 - indx;

        while (skip * scale < 16) {
            dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
            scale *= 2;
            level ++;
            indx = 1 - indx;
        }

        dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
    }

    // update average, minimum, maximum
    computeMapStatistics (map, W, H, min_f, max_f, avg);
}

/**
 * Rebuilds the map from a single channel and refreshes min_f, max_f and avg.
 *
 * @param L       source channel, H rows of W floats
 * @param radius  blur radius; in hq mode it also sets the range threshold (100 * radius)
 * @param hq      false: gaussian blur of the clamped channel;
 *                true: multi-level edge-preserving filter (dirpyr_shmap)
 * @param skip    scale factor of the processed image; the hq filter adds levels
 *                while skip * scale < levrad. Values below 1 are treated as 1.
 */
void SHMap::updateL (float** L, double radius, bool hq, int skip)
{

    if (!hq) {
        fillLuminanceL( L, map);
#ifdef _OPENMP
        #pragma omp parallel
#endif
        {
            gaussianBlur (map, map, W, H, radius);
        }
    }

    else

    {
        //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        //experimental dirpyr shmap

        // a non-positive skip would make the level loops below run forever
        skip = std::max(skip, 1);

        float thresh = (100.f * radius); //1000;
        const int levrad = 2; //for retinex - otherwise levrad = 16

        // set up range function
        // calculate size of Lookup table. That's possible because from a value k for all i>=k rangefn[i] will be exp(-10)
        // So we use this fact and the automatic clip of lut to reduce the size of lut and the number of calculations to fill the lut
        // In past this lut had only integer precision with rangefn[i] = 0 for all i>=k
        // We set the last element to a small epsilon 1e-15 instead of zero to avoid divisions by zero
        //
        // The size is derived from the untruncated threshold, exactly as in update().
        // Casting thresh to int before the multiplication made the table too short
        // and collapsed it to a single entry for thresh < 1.
        const int lutSize = thresh * sqrtf(10.f) + 1;
        thresh *= thresh;
        LUTf rangefn(lutSize);

        for (int i = 0; i < lutSize - 1; i++) {
            rangefn[i] = xexpf(-min(10.f, (static_cast<float>(i) * i) / thresh )); //*intfactor;
        }

        rangefn[lutSize - 1] = 1e-15f;
        //printf("lut=%d rf5=%f  rfm=%f\n thre=%f",lutSize, rangefn[5],rangefn[lutSize-10],thresh );

        // We need one temporary buffer
        JaggedArray<float> buffer (W, H);

        // the final result has to be in map
        // for an even number of levels that means: map => buffer, buffer => map
        // for an odd number of levels that means: buffer => map, map => buffer, buffer => map
        // so let's calculate the number of levels first
        // There are at least two levels
        int numLevels = 2;
        int scale = 2;

        while (skip * scale < levrad) {
            scale *= 2;
            numLevels++;
        }

        //printf("numlev=%d\n",numLevels);
        float ** dirpyrlo[2];

        if(numLevels & 1) { // odd number of levels, start with buffer
            dirpyrlo[0] = buffer;
            dirpyrlo[1] = map;
        } else {            // even number of levels, start with map
            dirpyrlo[0] = map;
            dirpyrlo[1] = buffer;
        }

        fillLuminanceL( L, dirpyrlo[0]);

        scale = 1;
        int level = 0;
        int indx = 0;
        dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
        scale *= 2;
        level ++;
        indx = 1 - indx;

        while (skip * scale < levrad) {
            dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
            scale *= 2;
            level ++;
            indx = 1 - indx;
        }

        dirpyr_shmap(dirpyrlo[indx], dirpyrlo[1 - indx], W, H, rangefn, level, scale );
    }

    // update average, minimum, maximum
    computeMapStatistics (map, W, H, min_f, max_f, avg);
}

/** Overrides the stored statistics with the given maximum, minimum and average. */
void SHMap::forceStat (float max_, float min_, float avg_)
{

    max_f = max_;
    min_f = min_;
    avg = avg_;
}

/**
 * Runs one level of the edge-preserving low-pass filter.
 *
 * Every output pixel is a weighted average of input pixels sampled `scale` apart
 * inside a window of half-width halfwin * scale around it, clipped to the image.
 * The weight of a neighbour is rangefn looked up at the absolute difference between
 * the neighbour and the centre pixel. For level >= 2 the window half-width is 2
 * samples instead of 1 and the weight is additionally multiplied by a 5x5 domain kernel.
 *
 * Each row is processed in three column ranges: the left border [0, scalewin),
 * the interior [scalewin, width - scalewin) - vectorised four pixels at a time when
 * SSE2 is available - and the right border [width - scalewin, width).
 *
 * NOTE(review): the border loops do not guard against width < 2 * scalewin; such
 * narrow inputs would index outside the rows - confirm callers never pass them.
 * NOTE(review): the scalar paths call unqualified abs() on a float difference -
 * confirm it resolves to a floating-point overload on all supported toolchains.
 *
 * @param data_fine    input, height rows of width floats
 * @param data_coarse  output, same dimensions; must not alias data_fine
 *                     (neighbours are read from data_fine while data_coarse is written)
 * @param width        number of columns
 * @param height       number of rows
 * @param rangefn      range weight lookup table
 * @param level        pyramid level; selects the 3x3 (level < 2) or 5x5 window
 * @param scale        spacing of the directional averaging weights
 */
void SHMap::dirpyr_shmap(float ** data_fine, float ** data_coarse, int width, int height, LUTf & rangefn, int level, int scale)
{
    //scale is spacing of directional averaging weights

    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // calculate weights, compute directionally weighted average

    int scalewin, halfwin;

    if(level < 2) {
        halfwin = 1;
        scalewin = halfwin * scale;

#ifdef _OPENMP
        #pragma omp parallel
#endif
        {
#ifdef __SSE2__
            vfloat dirwtv, valv, normv, dftemp1v, dftemp2v;
#endif // __SSE2__
            int j; // declared inside the parallel region, shared by the three column loops of a row
#ifdef _OPENMP
            #pragma omp for
#endif

            for(int i = 0; i < height; i++) {
                float dirwt;

                // left border: the window is clipped at column j % scale
                for(j = 0; j < scalewin; j++) {
                    float val = 0.f;
                    float norm = 0.f;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j % scale; jnbr <= j + scalewin; jnbr += scale) {
                            //printf("dat=%f ",abs(data_fine[inbr][jnbr] - data_fine[i][j]));
                            dirwt = ( rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#ifdef __SSE2__
                int inbrMin = max(i - scalewin, i % scale);

                // interior, four pixels per iteration
                for(; j < (width - scalewin) - 3; j += 4) {
                    valv = _mm_setzero_ps();
                    normv = _mm_setzero_ps();
                    dftemp1v = LVFU(data_fine[i][j]);

                    for(int inbr = inbrMin; inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr <= j + scalewin; jnbr += scale) {
                            dftemp2v = LVFU(data_fine[inbr][jnbr]);
                            dirwtv = ( rangefn[_mm_cvttps_epi32(vabsf(dftemp2v - dftemp1v))] );
                            valv += dirwtv * dftemp2v;
                            normv += dirwtv;
                        }
                    }

                    _mm_storeu_ps( &data_coarse[i][j], valv / normv);
                }

                // interior, remaining pixels that do not fill a whole vector
                for(; j < width - scalewin; j++) {
                    float val = 0.f;
                    float norm = 0.f;

                    for(int inbr = inbrMin; inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr <= j + scalewin; jnbr += scale) {
                            dirwt = ( rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#else

                // interior, scalar version
                for(; j < width - scalewin; j++) {
                    float val = 0.f;
                    float norm = 0.f;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr <= j + scalewin; jnbr += scale) {
                            dirwt = ( rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#endif

                // right border: the window is clipped at the last column
                for(; j < width; j++) {
                    float val = 0.f;
                    float norm = 0.f;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr < width; jnbr += scale) {
                            dirwt = ( rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }
            }
        }
    } else {
        halfwin = 2;
        scalewin = halfwin * scale;
        int domker[5][5] = {{1, 1, 1, 1, 1}, {1, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {1, 1, 1, 1, 1}};
        //generate domain kernel

#ifdef _OPENMP
        #pragma omp parallel
#endif
        {
#ifdef __SSE2__
            vfloat dirwtv, valv, normv, dftemp1v, dftemp2v;
            // same kernel as domker, every entry replicated four times so it can be loaded as a vector
            float domkerv[5][5][4] ALIGNED16 = {{{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {2, 2, 2, 2}, {2, 2, 2, 2}, {2, 2, 2, 2}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}};
#endif // __SSE2__
            int j; // declared inside the parallel region, shared by the three column loops of a row
#ifdef _OPENMP
            #pragma omp for schedule(dynamic,16)
#endif

            for(int i = 0; i < height; i++) {
                float dirwt;

                // left border: the window is clipped at column j % scale
                for(j = 0; j < scalewin; j++) {
                    float val = 0.f;
                    float norm = 0.f;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j % scale; jnbr <= j + scalewin; jnbr += scale) {
                            dirwt = ( domker[(inbr - i) / scale + halfwin][(jnbr - j) / scale + halfwin] * rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#ifdef __SSE2__

                // interior, four pixels per iteration
                for(; j < width - scalewin - 3; j += 4) {
                    valv = _mm_setzero_ps();
                    normv = _mm_setzero_ps();
                    dftemp1v = LVFU(data_fine[i][j]);

                    for(int inbr = max(i - scalewin, i % scale); inbr <= MIN(i + scalewin, height - 1); inbr += scale) {
                        int indexihlp = (inbr - i) / scale + halfwin;

                        for (int jnbr = j - scalewin, indexjhlp = 0; jnbr <= j + scalewin; jnbr += scale, indexjhlp++) {
                            dftemp2v = LVFU(data_fine[inbr][jnbr]);
                            dirwtv = ( LVF(domkerv[indexihlp][indexjhlp]) * rangefn[_mm_cvttps_epi32(vabsf(dftemp2v - dftemp1v))] );
                            valv += dirwtv * dftemp2v;
                            normv += dirwtv;
                        }
                    }

                    _mm_storeu_ps( &data_coarse[i][j], valv / normv);
                }

                // interior, remaining pixels that do not fill a whole vector
                for(; j < width - scalewin; j++) {
                    float val = 0;
                    float norm = 0;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr <= j + scalewin; jnbr += scale) {
                            dirwt = ( domker[(inbr - i) / scale + halfwin][(jnbr - j) / scale + halfwin] * rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#else

                // interior, scalar version
                for(; j < width - scalewin; j++) {
                    float val = 0;
                    float norm = 0;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr <= j + scalewin; jnbr += scale) {
                            dirwt = ( domker[(inbr - i) / scale + halfwin][(jnbr - j) / scale + halfwin] * rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }

#endif

                // right border: the window is clipped at the last column
                for(; j < width; j++) {
                    float val = 0;
                    float norm = 0;

                    for(int inbr = max(i - scalewin, i % scale); inbr <= min(i + scalewin, height - 1); inbr += scale) {
                        for (int jnbr = j - scalewin; jnbr < width; jnbr += scale) {
                            dirwt = ( domker[(inbr - i) / scale + halfwin][(jnbr - j) / scale + halfwin] * rangefn[abs(data_fine[inbr][jnbr] - data_fine[i][j])] );
                            val += dirwt * data_fine[inbr][jnbr];
                            norm += dirwt;
                        }
                    }

                    data_coarse[i][j] = val / norm; // low pass filter
                }
            }
        }
    }
}

}//end of SHMap