reviewed boxblur code and usage

This commit is contained in:
Ingo Weyrich 2019-09-26 15:03:09 +02:00
parent 6026c110fa
commit 6bebc19f02
15 changed files with 1015 additions and 1595 deletions

View File

@ -31,6 +31,7 @@ set(CAMCONSTSFILE "camconst.json")
set(RTENGINESOURCEFILES set(RTENGINESOURCEFILES
badpixels.cc badpixels.cc
boxblur.cc
CA_correct_RT.cc CA_correct_RT.cc
capturesharpening.cc capturesharpening.cc
EdgePreservingDecomposition.cc EdgePreservingDecomposition.cc

View File

@ -479,6 +479,7 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH};
void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi) void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi)
{ {
BENCHFUN BENCHFUN
//#ifdef _DEBUG //#ifdef _DEBUG
MyTime t1e, t2e; MyTime t1e, t2e;
t1e.set(); t1e.set();
@ -687,8 +688,8 @@ BENCHFUN
} }
} }
int tilesize; int tilesize = 0;
int overlap; int overlap = 0;
if (settings->leveldnti == 0) { if (settings->leveldnti == 0) {
tilesize = 1024; tilesize = 1024;
@ -1341,8 +1342,6 @@ BENCHFUN
#ifdef _OPENMP #ifdef _OPENMP
int masterThread = omp_get_thread_num(); int masterThread = omp_get_thread_num();
#endif
#ifdef _OPENMP
#pragma omp parallel num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1) #pragma omp parallel num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
#endif #endif
{ {
@ -1351,11 +1350,9 @@ BENCHFUN
#else #else
int subThread = 0; int subThread = 0;
#endif #endif
float blurbuffer[TS * TS] ALIGNED64;
float *Lblox = LbloxArray[subThread]; float *Lblox = LbloxArray[subThread];
float *fLblox = fLbloxArray[subThread]; float *fLblox = fLbloxArray[subThread];
float pBuf[width + TS + 2 * blkrad * offset] ALIGNED16; float pBuf[width + TS + 2 * blkrad * offset] ALIGNED16;
float nbrwt[TS * TS] ALIGNED64;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp for #pragma omp for
#endif #endif
@ -1430,7 +1427,7 @@ BENCHFUN
for (int hblk = 0; hblk < numblox_W; ++hblk) { for (int hblk = 0; hblk < numblox_W; ++hblk) {
RGBtile_denoise(fLblox, hblk, noisevar_Ldetail, nbrwt, blurbuffer); RGBtile_denoise(fLblox, hblk, noisevar_Ldetail);
}//end of horizontal block loop }//end of horizontal block loop
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -1447,14 +1444,8 @@ BENCHFUN
//add row of blocks to output image tile //add row of blocks to output image tile
RGBoutput_tile_row(Lblox, Ldetail, tilemask_out, height, width, topproc); RGBoutput_tile_row(Lblox, Ldetail, tilemask_out, height, width, topproc);
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
}//end of vertical block loop }//end of vertical block loop
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
} }
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1) #pragma omp parallel for num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
@ -2041,26 +2032,20 @@ BENCHFUN
}//end of main RGB_denoise }//end of main RGB_denoise
void ImProcFunctions::RGBtile_denoise(float* fLblox, int hblproc, float noisevar_Ldetail) //for DCT
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
void ImProcFunctions::RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail, float * nbrwt, float * blurbuffer) //for DCT
{ {
int blkstart = hblproc * TS * TS; float nbrwt[TS * TS] ALIGNED64;
const int blkstart = hblproc * TS * TS;
boxabsblur(fLblox + blkstart, nbrwt, 3, 3, TS, TS, blurbuffer); //blur neighbor weights for more robust estimation //for DCT boxabsblur(fLblox + blkstart, nbrwt, 3, TS, TS, false); //blur neighbor weights for more robust estimation //for DCT
#ifdef __SSE2__ #ifdef __SSE2__
__m128 tempv; const vfloat noisevar_Ldetailv = F2V(-1.f / noisevar_Ldetail);
__m128 noisevar_Ldetailv = _mm_set1_ps(noisevar_Ldetail); const vfloat onev = F2V(1.f);
__m128 onev = _mm_set1_ps(1.0f);
for (int n = 0; n < TS * TS; n += 4) { //for DCT for (int n = 0; n < TS * TS; n += 4) { //for DCT
tempv = onev - xexpf(-SQRV(LVF(nbrwt[n])) / noisevar_Ldetailv); const vfloat tempv = onev - xexpf(SQRV(LVF(nbrwt[n])) * noisevar_Ldetailv);
_mm_storeu_ps(&fLblox[blkstart + n], LVFU(fLblox[blkstart + n]) * tempv); STVF(fLblox[blkstart + n], LVF(fLblox[blkstart + n]) * tempv);
}//output neighbor averaged result }//output neighbor averaged result
#else #else
@ -2071,14 +2056,7 @@ void ImProcFunctions::RGBtile_denoise(float * fLblox, int hblproc, float noiseva
#endif #endif
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% }
//printf("vblk=%d hlk=%d wsqave=%f || ",vblproc,hblproc,wsqave);
}//end of function tile_denoise
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
void ImProcFunctions::RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top) void ImProcFunctions::RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top)
{ {
@ -2207,7 +2185,7 @@ void ImProcFunctions::Noise_residualAB(const wavelet_decomposition &WaveletCoeff
chmaxresid = maxresid; chmaxresid = maxresid;
} }
bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3]) bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3])
{ {
int maxlvl = min(WaveletCoeffs_L.maxlevel(), 5); int maxlvl = min(WaveletCoeffs_L.maxlevel(), 5);
const float eps = 0.01f; const float eps = 0.01f;
@ -2258,23 +2236,22 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
//simple wavelet shrinkage //simple wavelet shrinkage
float * sfave = buffer[0] + 32; float * sfave = buffer[0] + 32;
float * sfaved = buffer[2] + 96; float * sfaved = buffer[2] + 96;
float * blurBuffer = buffer[1] + 64;
float mad_Lr = madL[lvl][dir - 1]; float mad_Lr = madL[lvl][dir - 1];
float levelFactor = mad_Lr * 5.f / (lvl + 1); float levelFactor = mad_Lr * 5.f / (lvl + 1);
#ifdef __SSE2__ #ifdef __SSE2__
__m128 mad_Lv; vfloat mad_Lv;
__m128 ninev = _mm_set1_ps(9.0f); vfloat ninev = F2V(9.0f);
__m128 epsv = _mm_set1_ps(eps); vfloat epsv = F2V(eps);
__m128 mag_Lv; vfloat mag_Lv;
__m128 levelFactorv = _mm_set1_ps(levelFactor); vfloat levelFactorv = F2V(levelFactor);
int coeffloc_L; int coeffloc_L;
for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) { for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) {
mad_Lv = LVFU(noisevarlum[coeffloc_L]) * levelFactorv; mad_Lv = LVFU(noisevarlum[coeffloc_L]) * levelFactorv;
mag_Lv = SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L])); mag_Lv = SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L]));
_mm_storeu_ps(&sfave[coeffloc_L], mag_Lv / (mag_Lv + mad_Lv * xexpf(-mag_Lv / (mad_Lv * ninev)) + epsv)); STVFU(sfave[coeffloc_L], mag_Lv / (mag_Lv + mad_Lv * xexpf(-mag_Lv / (mad_Lv * ninev)) + epsv));
} }
for (; coeffloc_L < Hlvl_L * Wlvl_L; ++coeffloc_L) { for (; coeffloc_L < Hlvl_L * Wlvl_L; ++coeffloc_L) {
@ -2294,15 +2271,15 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
} }
#endif #endif
boxblur(sfave, sfaved, blurBuffer, lvl + 2, lvl + 2, Wlvl_L, Hlvl_L); //increase smoothness by locally averaging shrinkage boxblur(sfave, sfaved, lvl + 2, Wlvl_L, Hlvl_L, false); //increase smoothness by locally averaging shrinkage
#ifdef __SSE2__ #ifdef __SSE2__
__m128 sfavev; vfloat sfavev;
__m128 sf_Lv; vfloat sf_Lv;
for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) { for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) {
sfavev = LVFU(sfaved[coeffloc_L]); sfavev = LVFU(sfaved[coeffloc_L]);
sf_Lv = LVFU(sfave[coeffloc_L]); sf_Lv = LVFU(sfave[coeffloc_L]);
_mm_storeu_ps(&WavCoeffs_L[dir][coeffloc_L], LVFU(WavCoeffs_L[dir][coeffloc_L]) * (SQRV(sfavev) + SQRV(sf_Lv)) / (sfavev + sf_Lv + epsv)); STVFU(WavCoeffs_L[dir][coeffloc_L], LVFU(WavCoeffs_L[dir][coeffloc_L]) * (SQRV(sfavev) + SQRV(sf_Lv)) / (sfavev + sf_Lv + epsv));
//use smoothed shrinkage unless local shrinkage is much less //use smoothed shrinkage unless local shrinkage is much less
} }
@ -2340,7 +2317,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
return (!memoryAllocationFailed); return (!memoryAllocationFailed);
} }
bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab,
float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb) float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb)
{ {
int maxlvl = WaveletCoeffs_L.maxlevel(); int maxlvl = WaveletCoeffs_L.maxlevel();
@ -2422,12 +2399,12 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
if (noisevar_ab > 0.001f) { if (noisevar_ab > 0.001f) {
#ifdef __SSE2__ #ifdef __SSE2__
__m128 onev = _mm_set1_ps(1.f); vfloat onev = F2V(1.f);
__m128 mad_abrv = _mm_set1_ps(mad_abr); vfloat mad_abrv = F2V(mad_abr);
__m128 rmad_Lm9v = onev / _mm_set1_ps(mad_Lr * 9.f); vfloat rmad_Lm9v = onev / F2V(mad_Lr * 9.f);
__m128 mad_abv; vfloat mad_abv;
__m128 mag_Lv, mag_abv; vfloat mag_Lv, mag_abv;
__m128 tempabv; vfloat tempabv;
int coeffloc_ab; int coeffloc_ab;
for (coeffloc_ab = 0; coeffloc_ab < Hlvl_ab * Wlvl_ab - 3; coeffloc_ab += 4) { for (coeffloc_ab = 0; coeffloc_ab < Hlvl_ab * Wlvl_ab - 3; coeffloc_ab += 4) {
@ -2437,7 +2414,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]); mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]);
mag_abv = SQRV(tempabv); mag_abv = SQRV(tempabv);
mag_Lv = SQRV(mag_Lv) * rmad_Lm9v; mag_Lv = SQRV(mag_Lv) * rmad_Lm9v;
_mm_storeu_ps(&WavCoeffs_ab[dir][coeffloc_ab], tempabv * SQRV((onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv))))); STVFU(WavCoeffs_ab[dir][coeffloc_ab], tempabv * SQRV((onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv)))));
} }
// few remaining pixels // few remaining pixels
@ -2470,9 +2447,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
} }
for (int i = 2; i >= 0; i--) { for (int i = 2; i >= 0; i--) {
if (buffer[i] != nullptr) { delete[] buffer[i];
delete[] buffer[i];
}
} }
} }
@ -2480,7 +2455,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
} }
bool ImProcFunctions::WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge)//mod JD bool ImProcFunctions::WaveletDenoiseAllL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge)//mod JD
{ {
@ -2530,16 +2505,14 @@ bool ImProcFunctions::WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L,
} }
for (int i = 3; i >= 0; i--) { for (int i = 3; i >= 0; i--) {
if (buffer[i] != nullptr) { delete[] buffer[i];
delete[] buffer[i];
}
} }
} }
return (!memoryAllocationFailed); return (!memoryAllocationFailed);
} }
bool ImProcFunctions::WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, bool ImProcFunctions::WaveletDenoiseAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab,
float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb)//mod JD float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb)//mod JD
{ {
@ -2596,7 +2569,7 @@ bool ImProcFunctions::WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir, void ImProcFunctions::ShrinkAllL(const wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir,
float *noisevarlum, float * madL, float * vari, int edge) float *noisevarlum, float * madL, float * vari, int edge)
{ {
@ -2607,12 +2580,12 @@ void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float *
float * sfaved = buffer[1] + 64; float * sfaved = buffer[1] + 64;
float * blurBuffer = buffer[2] + 96; float * blurBuffer = buffer[2] + 96;
int W_L = WaveletCoeffs_L.level_W(level); const int W_L = WaveletCoeffs_L.level_W(level);
int H_L = WaveletCoeffs_L.level_H(level); const int H_L = WaveletCoeffs_L.level_H(level);
float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(level); float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(level);
// printf("OK lev=%d\n",level); const float mad_L = madL[dir - 1] ;
float mad_L = madL[dir - 1] ; const float levelFactor = mad_L * 5.f / static_cast<float>(level + 1);
if (edge == 1 && vari) { if (edge == 1 && vari) {
noisevarlum = blurBuffer; // we need one buffer, but fortunately we don't have to allocate a new one because we can use blurBuffer noisevarlum = blurBuffer; // we need one buffer, but fortunately we don't have to allocate a new one because we can use blurBuffer
@ -2622,71 +2595,45 @@ void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float *
} }
} }
float levelFactor = mad_L * 5.f / static_cast<float>(level + 1); int i = 0;
#ifdef __SSE2__ #ifdef __SSE2__
__m128 magv; const vfloat levelFactorv = F2V(levelFactor);
__m128 levelFactorv = _mm_set1_ps(levelFactor); const vfloat ninev = F2V(9.f);
__m128 mad_Lv; const vfloat epsv = F2V(eps);
__m128 ninev = _mm_set1_ps(9.0f);
__m128 epsv = _mm_set1_ps(eps);
int i;
for (i = 0; i < W_L * H_L - 3; i += 4) { for (; i < W_L * H_L - 3; i += 4) {
mad_Lv = LVFU(noisevarlum[i]) * levelFactorv; const vfloat mad_Lv = LVFU(noisevarlum[i]) * levelFactorv;
magv = SQRV(LVFU(WavCoeffs_L[dir][i])); const vfloat magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
_mm_storeu_ps(&sfave[i], magv / (magv + mad_Lv * xexpf(-magv / (ninev * mad_Lv)) + epsv)); STVFU(sfave[i], magv / (magv + mad_Lv * xexpf(-magv / (ninev * mad_Lv)) + epsv));
} }
#endif
// few remaining pixels // few remaining pixels
for (; i < W_L * H_L; ++i) { for (; i < W_L * H_L; ++i) {
float mag = SQR(WavCoeffs_L[dir][i]); const float mag = SQR(WavCoeffs_L[dir][i]);
sfave[i] = mag / (mag + levelFactor * noisevarlum[i] * xexpf(-mag / (9 * levelFactor * noisevarlum[i])) + eps); sfave[i] = mag / (mag + levelFactor * noisevarlum[i] * xexpf(-mag / (9 * levelFactor * noisevarlum[i])) + eps);
} }
#else boxblur(sfave, sfaved, level + 2, W_L, H_L, false); //increase smoothness by locally averaging shrinkage
for (int i = 0; i < W_L * H_L; ++i) {
float mag = SQR(WavCoeffs_L[dir][i]);
float shrinkfactor = mag / (mag + levelFactor * noisevarlum[i] * xexpf(-mag / (9 * levelFactor * noisevarlum[i])) + eps);
sfave[i] = shrinkfactor;
}
#endif
boxblur(sfave, sfaved, blurBuffer, level + 2, level + 2, W_L, H_L); //increase smoothness by locally averaging shrinkage
i = 0;
#ifdef __SSE2__ #ifdef __SSE2__
__m128 sfv;
for (i = 0; i < W_L * H_L - 3; i += 4) { for (; i < W_L * H_L - 3; i += 4) {
sfv = LVFU(sfave[i]); const vfloat sfv = LVFU(sfave[i]);
//use smoothed shrinkage unless local shrinkage is much less //use smoothed shrinkage unless local shrinkage is much less
_mm_storeu_ps(&WavCoeffs_L[dir][i], _mm_loadu_ps(&WavCoeffs_L[dir][i]) * (SQRV(LVFU(sfaved[i])) + SQRV(sfv)) / (LVFU(sfaved[i]) + sfv + epsv)); STVFU(WavCoeffs_L[dir][i], LVFU(WavCoeffs_L[dir][i]) * (SQRV(LVFU(sfaved[i])) + SQRV(sfv)) / (LVFU(sfaved[i]) + sfv + epsv));
} }
#endif
// few remaining pixels // few remaining pixels
for (; i < W_L * H_L; ++i) { for (; i < W_L * H_L; ++i) {
float sf = sfave[i]; const float sf = sfave[i];
//use smoothed shrinkage unless local shrinkage is much less //use smoothed shrinkage unless local shrinkage is much less
WavCoeffs_L[dir][i] *= (SQR(sfaved[i]) + SQR(sf)) / (sfaved[i] + sf + eps); WavCoeffs_L[dir][i] *= (SQR(sfaved[i]) + SQR(sf)) / (sfaved[i] + sf + eps);
}//now luminance coefficients are denoised }//now luminance coefficients are denoised
#else
for (int i = 0; i < W_L * H_L; ++i) {
float sf = sfave[i];
//use smoothed shrinkage unless local shrinkage is much less
WavCoeffs_L[dir][i] *= (SQR(sfaved[i]) + SQR(sf)) / (sfaved[i] + sf + eps);
}//now luminance coefficients are denoised
#endif
} }
void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir, void ImProcFunctions::ShrinkAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch, float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch,
bool denoiseMethodRgb, float * madL, float * madaab, bool madCalculated) bool denoiseMethodRgb, float * madL, float * madaab, bool madCalculated)
@ -2700,7 +2647,6 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
float * sfaveab = buffer[0] + 32; float * sfaveab = buffer[0] + 32;
float * sfaveabd = buffer[1] + 64; float * sfaveabd = buffer[1] + 64;
float * blurBuffer = buffer[2] + 96;
int W_ab = WaveletCoeffs_ab.level_W(level); int W_ab = WaveletCoeffs_ab.level_W(level);
int H_ab = WaveletCoeffs_ab.level_H(level); int H_ab = WaveletCoeffs_ab.level_H(level);
@ -2724,12 +2670,12 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
if (noisevar_ab > 0.001f) { if (noisevar_ab > 0.001f) {
madab = useNoiseCCurve ? madab : madab * noisevar_ab; madab = useNoiseCCurve ? madab : madab * noisevar_ab;
#ifdef __SSE2__ #ifdef __SSE2__
__m128 onev = _mm_set1_ps(1.f); vfloat onev = F2V(1.f);
__m128 mad_abrv = _mm_set1_ps(madab); vfloat mad_abrv = F2V(madab);
__m128 rmadLm9v = onev / _mm_set1_ps(mad_L * 9.f); vfloat rmadLm9v = onev / F2V(mad_L * 9.f);
__m128 mad_abv ; vfloat mad_abv ;
__m128 mag_Lv, mag_abv; vfloat mag_Lv, mag_abv;
int coeffloc_ab; int coeffloc_ab;
for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) { for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) {
@ -2738,7 +2684,7 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]); mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]);
mag_abv = SQRV(LVFU(WavCoeffs_ab[dir][coeffloc_ab])); mag_abv = SQRV(LVFU(WavCoeffs_ab[dir][coeffloc_ab]));
mag_Lv = (SQRV(mag_Lv)) * rmadLm9v; mag_Lv = (SQRV(mag_Lv)) * rmadLm9v;
_mm_storeu_ps(&sfaveab[coeffloc_ab], (onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv)))); STVFU(sfaveab[coeffloc_ab], (onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv))));
} }
// few remaining pixels // few remaining pixels
@ -2761,18 +2707,18 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
#endif #endif
boxblur(sfaveab, sfaveabd, blurBuffer, level + 2, level + 2, W_ab, H_ab); //increase smoothness by locally averaging shrinkage boxblur(sfaveab, sfaveabd, level + 2, W_ab, H_ab, false); //increase smoothness by locally averaging shrinkage
#ifdef __SSE2__ #ifdef __SSE2__
__m128 epsv = _mm_set1_ps(eps); vfloat epsv = F2V(eps);
__m128 sfabv; vfloat sfabv;
__m128 sfaveabv; vfloat sfaveabv;
for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) { for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) {
sfabv = LVFU(sfaveab[coeffloc_ab]); sfabv = LVFU(sfaveab[coeffloc_ab]);
sfaveabv = LVFU(sfaveabd[coeffloc_ab]); sfaveabv = LVFU(sfaveabd[coeffloc_ab]);
//use smoothed shrinkage unless local shrinkage is much less //use smoothed shrinkage unless local shrinkage is much less
_mm_storeu_ps(&WavCoeffs_ab[dir][coeffloc_ab], LVFU(WavCoeffs_ab[dir][coeffloc_ab]) * (SQRV(sfaveabv) + SQRV(sfabv)) / (sfaveabv + sfabv + epsv)); STVFU(WavCoeffs_ab[dir][coeffloc_ab], LVFU(WavCoeffs_ab[dir][coeffloc_ab]) * (SQRV(sfaveabv) + SQRV(sfabv)) / (sfaveabv + sfabv + epsv));
} }
// few remaining pixels // few remaining pixels
@ -2919,8 +2865,8 @@ void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b,
} }
void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a, void ImProcFunctions::WaveletDenoiseAll_info(int levwav, const wavelet_decomposition &WaveletCoeffs_a,
wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice, const wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice,
float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb) float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb)
{ {
@ -3106,7 +3052,7 @@ void ImProcFunctions::calcautodn_info(float &chaut, float &delta, int Nb, int le
} }
void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread) void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, const LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread)
{ {
if ((settings->leveldnautsimpl == 1 && dnparams.Cmethod == "MAN") || (settings->leveldnautsimpl == 0 && dnparams.C2method == "MANU")) { if ((settings->leveldnautsimpl == 1 && dnparams.Cmethod == "MAN") || (settings->leveldnautsimpl == 0 && dnparams.C2method == "MANU")) {
//nothing to do //nothing to do
@ -3173,8 +3119,8 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
const float gain = pow(2.0f, float(expcomp)); const float gain = pow(2.0f, float(expcomp));
int tilesize; int tilesize = 0;
int overlap; int overlap = 0;
if (settings->leveldnti == 0) { if (settings->leveldnti == 0) {
tilesize = 1024; tilesize = 1024;
@ -3275,16 +3221,16 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
for (int i = tiletop; i < tilebottom; i += 2) { for (int i = tiletop; i < tilebottom; i += 2) {
int i1 = i - tiletop; int i1 = i - tiletop;
#ifdef __SSE2__ #ifdef __SSE2__
__m128 aNv, bNv; vfloat aNv, bNv;
__m128 c100v = _mm_set1_ps(100.f); vfloat c100v = F2V(100.f);
int j; int j;
for (j = tileleft; j < tileright - 7; j += 8) { for (j = tileleft; j < tileright - 7; j += 8) {
int j1 = j - tileleft; int j1 = j - tileleft;
aNv = LVFU(acalc[i >> 1][j >> 1]); aNv = LVFU(acalc[i >> 1][j >> 1]);
bNv = LVFU(bcalc[i >> 1][j >> 1]); bNv = LVFU(bcalc[i >> 1][j >> 1]);
_mm_storeu_ps(&noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv)); STVFU(noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv));
_mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)),c100v)); STVFU(noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)),c100v));
} }
for (; j < tileright; j += 2) { for (; j < tileright; j += 2) {

420
rtengine/boxblur.cc Normal file
View File

@ -0,0 +1,420 @@
/*
* This file is part of RawTherapee.
*
* Copyright (C) 2010 Emil Martinec <ejmartin@uchicago.edu>
* Copyright (C) 2019 Ingo Weyrich <heckflosse67@gmx.de>
*
* RawTherapee is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RawTherapee is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RawTherapee. If not, see <https://www.gnu.org/licenses/>.
*/
#include <memory>
#include <cmath>
#include "boxblur.h"
#include "rt_math.h"
#include "opthelper.h"
namespace rtengine
{
void boxblur(float** src, float** dst, int radius, int W, int H, bool multiThread)
{
//box blur using rowbuffers and linebuffers instead of a full size buffer
if (radius == 0) {
if (src != dst) {
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int row = 0; row < H; ++row) {
for (int col = 0; col < W; ++col) {
dst[row][col] = src[row][col];
}
}
}
return;
}
constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef _OPENMP
#pragma omp parallel if (multiThread)
#endif
{
std::unique_ptr<float[]> buffer(new float[numCols * (radius + 1)]);
//horizontal blur
float* const lineBuffer = buffer.get();
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; ++row) {
float len = radius + 1;
float tempval = src[row][0];
lineBuffer[0] = tempval;
for (int j = 1; j <= radius; j++) {
tempval += src[row][j];
}
tempval /= len;
dst[row][0] = tempval;
for (int col = 1; col <= radius; ++col) {
lineBuffer[col] = src[row][col];
tempval = (tempval * len + src[row][col + radius]) / (len + 1);
dst[row][col] = tempval;
++len;
}
int pos = 0;
for (int col = radius + 1; col < W - radius; ++col) {
const float oldVal = lineBuffer[pos];
lineBuffer[pos] = src[row][col];
tempval = tempval + (src[row][col + radius] - oldVal) / len;
dst[row][col] = tempval;
++pos;
pos = pos <= radius ? pos : 0;
}
for (int col = W - radius; col < W; ++col) {
tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
dst[row][col] = tempval;
--len;
++pos;
pos = pos <= radius ? pos : 0;
}
}
//vertical blur
#ifdef __SSE2__
vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get();
const vfloat leninitv = F2V(radius + 1);
const vfloat onev = F2V(1.f);
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(dst[0][col]);
temp1v = LVFU(dst[0][col + 4]);
rowBuffer[0][0] = tempv;
rowBuffer[0][1] = temp1v;
for (int i = 1; i <= radius; ++i) {
tempv = tempv + LVFU(dst[i][col]);
temp1v = temp1v + LVFU(dst[i][col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= radius; ++row) {
rowBuffer[row][0] = LVFU(dst[row][col]);
rowBuffer[row][1] = LVFU(dst[row][col + 4]);
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
int pos = 0;
for (int row = radius + 1; row < H - radius; ++row) {
vfloat oldVal0 = rowBuffer[pos][0];
vfloat oldVal1 = rowBuffer[pos][1];
rowBuffer[pos][0] = LVFU(dst[row][col]);
rowBuffer[pos][1] = LVFU(dst[row][col + 4]);
tempv = tempv + (LVFU(dst[row + radius][col]) - oldVal0) * rlenv ;
temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - oldVal1) * rlenv ;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; ++row) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - rowBuffer[pos][0]) / lenm1v;
temp1v = (temp1v * lenv - rowBuffer[pos][1]) / lenm1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
++pos;
pos = pos <= radius ? pos : 0;
}
}
#else
float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - numCols + 1; col += 8) {
float len = radius + 1;
for (int k = 0; k < numCols; ++k) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int i = 1; i <= radius; ++i) {
for (int k = 0; k < numCols; ++k) {
dst[0][col + k] += dst[i][col + k];
}
}
for(int k = 0; k < numCols; ++k) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < numCols; ++k) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
len ++;
}
int pos = 0;
for (int row = radius + 1; row < H - radius; ++row) {
for(int k = 0; k < numCols; ++k) {
float oldVal = rowBuffer[pos][k];
rowBuffer[pos][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) / len;
}
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; ++row) {
for(int k = 0; k < numCols; ++k) {
dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
}
len --;
++pos;
pos = pos <= radius ? pos : 0;
}
}
#endif
//vertical blur, remaining columns
#ifdef _OPENMP
#pragma omp single
#endif
{
const int remaining = W % numCols;
if (remaining > 0) {
float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
const int col = W - remaining;
float len = radius + 1;
for(int k = 0; k < remaining; ++k) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < remaining; ++k) {
dst[0][col + k] += dst[row][col + k];
}
}
for(int k = 0; k < remaining; ++k) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < remaining; ++k) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
len ++;
}
const float rlen = 1.f / len;
int pos = 0;
for (int row = radius + 1; row < H - radius; ++row) {
for(int k = 0; k < remaining; ++k) {
float oldVal = rowBuffer[pos][k];
rowBuffer[pos][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
}
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; ++row) {
for(int k = 0; k < remaining; ++k) {
dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[pos][k]) / (len - 1);
}
len --;
++pos;
pos = pos <= radius ? pos : 0;
}
}
}
}
}
void boxabsblur(float** src, float** dst, int radius, int W, int H, bool multiThread)
{
//abs box blur using rowbuffers and linebuffers instead of a full size buffer, W should be a multiple of 16
if (radius == 0) {
if (src != dst) {
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int row = 0; row < H; ++row) {
for (int col = 0; col < W; ++col) {
dst[row][col] = std::fabs(src[row][col]);
}
}
}
return;
}
constexpr int numCols = 16; // process numCols columns at once for better usage of L1 cpu cache
#ifdef _OPENMP
#pragma omp parallel if (multiThread)
#endif
{
float buffer[numCols * (radius + 1)] ALIGNED64;
//horizontal blur
float* const lineBuffer = buffer;
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; ++row) {
float len = radius + 1;
float tempval = std::fabs(src[row][0]);
lineBuffer[0] = tempval;
for (int j = 1; j <= radius; j++) {
tempval += std::fabs(src[row][j]);
}
tempval /= len;
dst[row][0] = tempval;
for (int col = 1; col <= radius; ++col) {
lineBuffer[col] = std::fabs(src[row][col]);
tempval = (tempval * len + std::fabs(src[row][col + radius])) / (len + 1);
dst[row][col] = tempval;
++len;
}
const float rlen = 1.f / len;
int pos = 0;
for (int col = radius + 1; col < W - radius; ++col) {
const float oldVal = lineBuffer[pos];
lineBuffer[pos] = std::fabs(src[row][col]);
tempval = tempval + (std::fabs(src[row][col + radius]) - oldVal) * rlen;
dst[row][col] = tempval;
++pos;
pos = pos <= radius ? pos : 0;
}
for (int col = W - radius; col < W; ++col) {
tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
dst[row][col] = tempval;
--len;
++pos;
pos = pos <= radius ? pos : 0;
}
}
//vertical blur
float (* const rowBuffer)[numCols] = (float(*)[numCols]) buffer;
#ifdef _OPENMP
#pragma omp for
#endif
for (int col = 0; col < W; col += numCols) {
float len = radius + 1;
for (int k = 0; k < numCols; ++k) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int i = 1; i <= radius; ++i) {
for (int k = 0; k < numCols; ++k) {
dst[0][col + k] += dst[i][col + k];
}
}
for(int k = 0; k < numCols; ++k) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < numCols; ++k) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
++len;
}
const float rlen = 1.f / len;
int pos = 0;
for (int row = radius + 1; row < H - radius; ++row) {
for(int k = 0; k < numCols; ++k) {
float oldVal = rowBuffer[pos][k];
rowBuffer[pos][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
}
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; ++row) {
for(int k = 0; k < numCols; ++k) {
dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
}
--len;
++pos;
pos = pos <= radius ? pos : 0;
}
}
}
}
void boxblur(float* src, float* dst, int radius, int W, int H, bool multiThread)
{
float* srcp[H];
float* dstp[H];
for (int i = 0; i < H; ++i) {
srcp[i] = src + i * W;
dstp[i] = dst + i * W;
}
boxblur(srcp, dstp, radius, W, H, multiThread);
}
void boxabsblur(float* src, float* dst, int radius, int W, int H, bool multiThread)
{
float* srcp[H];
float* dstp[H];
for (int i = 0; i < H; ++i) {
srcp[i] = src + i * W;
dstp[i] = dst + i * W;
}
boxabsblur(srcp, dstp, radius, W, H, multiThread);
}
}

View File

@ -1,7 +1,7 @@
/* /*
* This file is part of RawTherapee. * This file is part of RawTherapee.
* *
* Copyright (C) 2010 Emil Martinec <ejmartin@uchicago.edu> * Copyright (C) 2019 Ingo Weyrich <heckflosse67@gmx.de>
* *
* RawTherapee is free software: you can redistribute it and/or modify * RawTherapee is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -15,873 +15,15 @@
* *
* You should have received a copy of the GNU General Public License * You should have received a copy of the GNU General Public License
* along with RawTherapee. If not, see <https://www.gnu.org/licenses/>. * along with RawTherapee. If not, see <https://www.gnu.org/licenses/>.
*/ */
#ifndef _BOXBLUR_H_ #pragma once
#define _BOXBLUR_H_
#include <assert.h>
#include <memory>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "alignedbuffer.h"
#include "rt_math.h"
#include "opthelper.h"
namespace rtengine namespace rtengine
{ {
// classical filtering if the support window is small: void boxblur(float** src, float** dst, int radius, int W, int H, bool multiThread);
void boxblur(float* src, float* dst, int radius, int W, int H, bool multiThread);
template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, int W, int H) void boxabsblur(float** src, float** dst, int radius, int W, int H, bool multiThread);
{ void boxabsblur(float* src, float* dst, int radius, int W, int H, bool multiThread);
//box blur image; box range = (radx,rady)
assert(2*radx+1 < W);
assert(2*rady+1 < H);
AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W * H);
float* temp = buffer->data;
if (radx == 0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
temp[row * W + col] = (float)src[row][col];
}
} else {
//horizontal blur
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row * W + 0] = (float)src[row][0] / len;
for (int j = 1; j <= radx; j++) {
temp[row * W + 0] += (float)src[row][j] / len;
}
for (int col = 1; col <= radx; col++) {
temp[row * W + col] = (temp[row * W + col - 1] * len + (float)src[row][col + radx]) / (len + 1);
len ++;
}
for (int col = radx + 1; col < W - radx; col++) {
temp[row * W + col] = temp[row * W + col - 1] + ((float)(src[row][col + radx] - src[row][col - radx - 1])) / len;
}
for (int col = W - radx; col < W; col++) {
temp[row * W + col] = (temp[row * W + col - 1] * len - src[row][col - radx - 1]) / (len - 1);
len --;
}
}
}
if (rady == 0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
dst[row][col] = temp[row * W + col];
}
} else {
//vertical blur
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0][col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0][col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row][col] = (dst[(row - 1)][col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row][col] = dst[(row - 1)][col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row][col] = (dst[(row - 1)][col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
}
delete buffer;
} }
template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx, int rady, int W, int H)
{
//box blur image; box range = (radx,rady)
float* temp = buffer;
if (radx == 0) {
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
temp[row * W + col] = (float)src[row][col];
}
} else {
//horizontal blur
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; row++) {
float len = radx + 1;
float tempval = (float)src[row][0];
for (int j = 1; j <= radx; j++) {
tempval += (float)src[row][j];
}
tempval /= len;
temp[row * W + 0] = tempval;
for (int col = 1; col <= radx; col++) {
temp[row * W + col] = tempval = (tempval * len + (float)src[row][col + radx]) / (len + 1);
len ++;
}
for (int col = radx + 1; col < W - radx; col++) {
temp[row * W + col] = tempval = tempval + ((float)(src[row][col + radx] - src[row][col - radx - 1])) / len;
}
for (int col = W - radx; col < W; col++) {
temp[row * W + col] = tempval = (tempval * len - src[row][col - radx - 1]) / (len - 1);
len --;
}
}
}
if (rady == 0) {
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
dst[row][col] = temp[row * W + col];
}
} else {
const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef __SSE2__
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for
#endif
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(temp[0 * W + col]);
temp1v = LVFU(temp[0 * W + col + 4]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVFU(temp[i * W + col]);
temp1v = temp1v + LVFU(temp[i * W + col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
}
}
#else
//vertical blur
#ifdef _OPENMP
#pragma omp for
#endif
for (int col = 0; col < W - numCols + 1; col += 8) {
float len = rady + 1;
for(int k = 0; k < numCols; k++) {
dst[0][col + k] = temp[0 * W + col + k];
}
for (int i = 1; i <= rady; i++) {
for(int k = 0; k < numCols; k++) {
dst[0][col + k] += temp[i * W + col + k];
}
}
for(int k = 0; k < numCols; k++) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= rady; row++) {
for(int k = 0; k < numCols; k++) {
dst[row][col + k] = (dst[(row - 1)][col + k] * len + temp[(row + rady) * W + col + k]) / (len + 1);
}
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
for(int k = 0; k < numCols; k++) {
dst[row][col + k] = dst[(row - 1)][col + k] + (temp[(row + rady) * W + col + k] - temp[(row - rady - 1) * W + col + k]) / len;
}
}
for (int row = H - rady; row < H; row++) {
for(int k = 0; k < numCols; k++) {
dst[row][col + k] = (dst[(row - 1)][col + k] * len - temp[(row - rady - 1) * W + col + k]) / (len - 1);
}
len --;
}
}
#endif
#ifdef _OPENMP
#pragma omp single
#endif
for (int col = W - (W % numCols); col < W; col++) {
float len = rady + 1;
dst[0][col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0][col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row][col] = (dst[(row - 1)][col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row][col] = dst[(row - 1)][col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row][col] = (dst[(row - 1)][col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
}
}
inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
{
//box blur using rowbuffers and linebuffers instead of a full size buffer
if (radius == 0) {
if (src != dst) {
#ifdef _OPENMP
#pragma omp parallel for if (multiThread)
#endif
for (int row = 0; row < H; row++) {
for (int col = 0; col < W; col++) {
dst[row][col] = src[row][col];
}
}
}
return;
}
constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
#ifdef _OPENMP
#pragma omp parallel if (multiThread)
#endif
{
std::unique_ptr<float[]> buffer(new float[numCols * (radius + 1)]);
//horizontal blur
float* const lineBuffer = buffer.get();
#ifdef _OPENMP
#pragma omp for
#endif
for (int row = 0; row < H; row++) {
float len = radius + 1;
float tempval = src[row][0];
lineBuffer[0] = tempval;
for (int j = 1; j <= radius; j++) {
tempval += src[row][j];
}
tempval /= len;
dst[row][0] = tempval;
for (int col = 1; col <= radius; col++) {
lineBuffer[col] = src[row][col];
tempval = (tempval * len + src[row][col + radius]) / (len + 1);
dst[row][col] = tempval;
++len;
}
int pos = 0;
for (int col = radius + 1; col < W - radius; col++) {
const float oldVal = lineBuffer[pos];
lineBuffer[pos] = src[row][col];
tempval = tempval + (src[row][col + radius] - oldVal) / len;
dst[row][col] = tempval;
++pos;
pos = pos <= radius ? pos : 0;
}
for (int col = W - radius; col < W; col++) {
tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
dst[row][col] = tempval;
--len;
++pos;
pos = pos <= radius ? pos : 0;
}
}
//vertical blur
#ifdef __SSE2__
vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get();
const vfloat leninitv = F2V(radius + 1);
const vfloat onev = F2V(1.f);
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(dst[0][col]);
temp1v = LVFU(dst[0][col + 4]);
rowBuffer[0][0] = tempv;
rowBuffer[0][1] = temp1v;
for (int i = 1; i <= radius; i++) {
tempv = tempv + LVFU(dst[i][col]);
temp1v = temp1v + LVFU(dst[i][col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU(dst[0][col], tempv);
STVFU(dst[0][col + 4], temp1v);
for (int row = 1; row <= radius; row++) {
rowBuffer[row][0] = LVFU(dst[row][col]);
rowBuffer[row][1] = LVFU(dst[row][col + 4]);
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
int pos = 0;
for (int row = radius + 1; row < H - radius; row++) {
vfloat oldVal0 = rowBuffer[pos][0];
vfloat oldVal1 = rowBuffer[pos][1];
rowBuffer[pos][0] = LVFU(dst[row][col]);
rowBuffer[pos][1] = LVFU(dst[row][col + 4]);
tempv = tempv + (LVFU(dst[row + radius][col]) - oldVal0) * rlenv ;
temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - oldVal1) * rlenv ;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - rowBuffer[pos][0]) / lenm1v;
temp1v = (temp1v * lenv - rowBuffer[pos][1]) / lenm1v;
STVFU(dst[row][col], tempv);
STVFU(dst[row][col + 4], temp1v);
lenv = lenm1v;
++pos;
pos = pos <= radius ? pos : 0;
}
}
#else
float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
#ifdef _OPENMP
#pragma omp for nowait
#endif
for (int col = 0; col < W - numCols + 1; col += 8) {
float len = radius + 1;
for (int k = 0; k < numCols; k++) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int i = 1; i <= radius; i++) {
for (int k = 0; k < numCols; k++) {
dst[0][col + k] += dst[i][col + k];
}
}
for(int k = 0; k < numCols; k++) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; row++) {
for(int k = 0; k < numCols; k++) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
len ++;
}
int pos = 0;
for (int row = radius + 1; row < H - radius; row++) {
for(int k = 0; k < numCols; k++) {
float oldVal = rowBuffer[pos][k];
rowBuffer[pos][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) / len;
}
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; row++) {
for(int k = 0; k < numCols; k++) {
dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
}
len --;
++pos;
pos = pos <= radius ? pos : 0;
}
}
#endif
//vertical blur, remaining columns
#ifdef _OPENMP
#pragma omp single
#endif
{
const int remaining = W % numCols;
if (remaining > 0) {
float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
const int col = W - remaining;
float len = radius + 1;
for(int k = 0; k < remaining; ++k) {
rowBuffer[0][k] = dst[0][col + k];
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < remaining; ++k) {
dst[0][col + k] += dst[row][col + k];
}
}
for(int k = 0; k < remaining; ++k) {
dst[0][col + k] /= len;
}
for (int row = 1; row <= radius; ++row) {
for(int k = 0; k < remaining; ++k) {
rowBuffer[row][k] = dst[row][col + k];
dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
}
len ++;
}
const float rlen = 1.f / len;
int pos = 0;
for (int row = radius + 1; row < H - radius; ++row) {
for(int k = 0; k < remaining; ++k) {
float oldVal = rowBuffer[pos][k];
rowBuffer[pos][k] = dst[row][col + k];
dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
}
++pos;
pos = pos <= radius ? pos : 0;
}
for (int row = H - radius; row < H; ++row) {
for(int k = 0; k < remaining; ++k) {
dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[pos][k]) / (len - 1);
}
len --;
++pos;
pos = pos <= radius ? pos : 0;
}
}
}
}
}
template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
{
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
float* temp = buffer;
if (radx == 0) {
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
temp[row * W + col] = src[row * W + col];
}
} else {
//horizontal blur
for (int row = H - 1; row >= 0; row--) {
int len = radx + 1;
float tempval = (float)src[row * W];
for (int j = 1; j <= radx; j++) {
tempval += (float)src[row * W + j];
}
tempval = tempval / len;
temp[row * W] = tempval;
for (int col = 1; col <= radx; col++) {
tempval = (tempval * len + src[row * W + col + radx]) / (len + 1);
temp[row * W + col] = tempval;
len ++;
}
float reclen = 1.f / len;
for (int col = radx + 1; col < W - radx; col++) {
tempval = tempval + ((float)(src[row * W + col + radx] - src[row * W + col - radx - 1])) * reclen;
temp[row * W + col] = tempval;
}
for (int col = W - radx; col < W; col++) {
tempval = (tempval * len - src[row * W + col - radx - 1]) / (len - 1);
temp[row * W + col] = tempval;
len --;
}
}
}
if (rady == 0) {
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
dst[row * W + col] = temp[row * W + col];
}
} else {
//vertical blur
#ifdef __SSE2__
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
int col;
for (col = 0; col < W - 7; col += 8) {
lenv = leninitv;
tempv = LVFU(temp[0 * W + col]);
temp1v = LVFU(temp[0 * W + col + 4]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVFU(temp[i * W + col]);
temp1v = temp1v + LVFU(temp[i * W + col + 4]);
}
tempv = tempv / lenv;
temp1v = temp1v / lenv;
STVFU(dst[0 * W + col], tempv);
STVFU(dst[0 * W + col + 4], temp1v);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
STVFU(dst[row * W + col], tempv);
STVFU(dst[row * W + col + 4], temp1v);
lenv = lenm1v;
}
}
for (; col < W - 3; col += 4) {
lenv = leninitv;
tempv = LVFU(temp[0 * W + col]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVFU(temp[i * W + col]);
}
tempv = tempv / lenv;
STVFU(dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
STVFU(dst[row * W + col], tempv);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
STVFU(dst[row * W + col], tempv);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
STVFU(dst[row * W + col], tempv);
lenv = lenm1v;
}
}
for (; col < W; col++) {
int len = rady + 1;
dst[0 * W + col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0 * W + col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
#else
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0 * W + col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0 * W + col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
#endif
}
}
template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H, float * temp)
{
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
if (radx == 0) {
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
temp[row * W + col] = fabs(src[row * W + col]);
}
} else {
//horizontal blur
for (int row = 0; row < H; row++) {
int len = radx + 1;
float tempval = fabsf((float)src[row * W + 0]);
for (int j = 1; j <= radx; j++) {
tempval += fabsf((float)src[row * W + j]);
}
tempval /= len;
temp[row * W + 0] = tempval;
for (int col = 1; col <= radx; col++) {
tempval = (tempval * len + fabsf(src[row * W + col + radx])) / (len + 1);
temp[row * W + col] = tempval;
len ++;
}
float rlen = 1.f / (float)len;
for (int col = radx + 1; col < W - radx; col++) {
tempval = tempval + ((float)(fabsf(src[row * W + col + radx]) - fabsf(src[row * W + col - radx - 1]))) * rlen;
temp[row * W + col] = tempval;
}
for (int col = W - radx; col < W; col++) {
tempval = (tempval * len - fabsf(src[row * W + col - radx - 1])) / (len - 1);
temp[row * W + col] = tempval;
len --;
}
}
}
if (rady == 0) {
for (int row = 0; row < H; row++)
for (int col = 0; col < W; col++) {
dst[row * W + col] = temp[row * W + col];
}
} else {
//vertical blur
#ifdef __SSE2__
vfloat leninitv = F2V( (float)(rady + 1));
vfloat onev = F2V( 1.f );
vfloat tempv, lenv, lenp1v, lenm1v, rlenv;
for (int col = 0; col < W - 3; col += 4) {
lenv = leninitv;
tempv = LVF(temp[0 * W + col]);
for (int i = 1; i <= rady; i++) {
tempv = tempv + LVF(temp[i * W + col]);
}
tempv = tempv / lenv;
STVF(dst[0 * W + col], tempv);
for (int row = 1; row <= rady; row++) {
lenp1v = lenv + onev;
tempv = (tempv * lenv + LVF(temp[(row + rady) * W + col])) / lenp1v;
STVF(dst[row * W + col], tempv);
lenv = lenp1v;
}
rlenv = onev / lenv;
for (int row = rady + 1; row < H - rady; row++) {
tempv = tempv + (LVF(temp[(row + rady) * W + col]) - LVF(temp[(row - rady - 1) * W + col])) * rlenv;
STVF(dst[row * W + col], tempv);
}
for (int row = H - rady; row < H; row++) {
lenm1v = lenv - onev;
tempv = (tempv * lenv - LVF(temp[(row - rady - 1) * W + col])) / lenm1v;
STVF(dst[row * W + col], tempv);
lenv = lenm1v;
}
}
for (int col = W - (W % 4); col < W; col++) {
int len = rady + 1;
dst[0 * W + col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0 * W + col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
#else
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0 * W + col] = temp[0 * W + col] / len;
for (int i = 1; i <= rady; i++) {
dst[0 * W + col] += temp[i * W + col] / len;
}
for (int row = 1; row <= rady; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
len ++;
}
for (int row = rady + 1; row < H - rady; row++) {
dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
}
for (int row = H - rady; row < H; row++) {
dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
len --;
}
}
#endif
}
}
}
#endif /* _BOXBLUR_H_ */

View File

@ -1349,7 +1349,7 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
} }
#endif #endif
template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr) template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, bool useBoxBlur, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr)
{ {
static constexpr auto GAUSS_SKIP = 0.25; static constexpr auto GAUSS_SKIP = 0.25;
static constexpr auto GAUSS_3X3_LIMIT = 0.6; static constexpr auto GAUSS_3X3_LIMIT = 0.6;
@ -1357,7 +1357,7 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
static constexpr auto GAUSS_7X7_LIMIT = 1.15; static constexpr auto GAUSS_7X7_LIMIT = 1.15;
static constexpr auto GAUSS_DOUBLE = 25.0; static constexpr auto GAUSS_DOUBLE = 25.0;
if(buffer) { if (useBoxBlur) {
// special variant for very large sigma, currently only used by retinex algorithm // special variant for very large sigma, currently only used by retinex algorithm
// use iterated boxblur to approximate gaussian blur // use iterated boxblur to approximate gaussian blur
// Compute ideal averaging filter width and number of iterations // Compute ideal averaging filter width and number of iterations
@ -1393,10 +1393,10 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
sizes[i] = ((i < m ? wl : wu) - 1) / 2; sizes[i] = ((i < m ? wl : wu) - 1) / 2;
} }
rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H); rtengine::boxblur(src, dst, sizes[0], W, H, true);
for(int i = 1; i < n; i++) { for(int i = 1; i < n; i++) {
rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H); rtengine::boxblur(dst, dst, sizes[i], W, H, true);
} }
} else { } else {
if (sigma < GAUSS_SKIP) { if (sigma < GAUSS_SKIP) {
@ -1532,8 +1532,8 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
} }
} }
void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer, eGaussType gausstype, float** buffer2) void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, bool useBoxBlur, eGaussType gausstype, float** buffer2)
{ {
gaussianBlurImpl<float>(src, dst, W, H, sigma, buffer, gausstype, buffer2); gaussianBlurImpl<float>(src, dst, W, H, sigma, useBoxBlur, gausstype, buffer2);
} }

View File

@ -21,6 +21,6 @@
enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV}; enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV};
void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr); void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, bool useBoxBlur = false, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr);
#endif #endif

View File

@ -105,7 +105,7 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
[multithread](array2D<float> &d, array2D<float> &s, int rad) -> void [multithread](array2D<float> &d, array2D<float> &s, int rad) -> void
{ {
rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1); rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
boxblur(s, d, rad, s.width(), s.height(), multithread); boxblur(static_cast<float**>(s), static_cast<float**>(d), rad, s.width(), s.height(), multithread);
}; };
const int W = src.width(); const int W = src.width();

View File

@ -198,20 +198,20 @@ public:
void Median_Denoise(float **src, float **dst, float upperBound, int width, int height, Median medianType, int iterations, int numThreads, float **buffer = nullptr); void Median_Denoise(float **src, float **dst, float upperBound, int width, int height, Median medianType, int iterations, int numThreads, float **buffer = nullptr);
void RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi); void RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi);
void RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, const bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope); void RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, const bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope);
void RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread = false); void RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, bool isRAW, const LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread = false);
void RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail, float * nbrwt, float * blurbuffer); //for DCT void RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail); //for DCT
void RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top); void RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top);
bool WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge); bool WaveletDenoiseAllL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge);
bool WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb); bool WaveletDenoiseAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb);
void WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a, void WaveletDenoiseAll_info(int levwav, const wavelet_decomposition &WaveletCoeffs_a,
wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float & minblueaut, int schoice, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, const wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float & minblueaut, int schoice, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc,
float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb); float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb);
bool WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3]); bool WaveletDenoiseAll_BiShrinkL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3]);
bool WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab, bool WaveletDenoiseAll_BiShrinkAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab,
const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb); const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb);
void ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir, float *noisevarlum, float * madL, float * vari, int edge); void ShrinkAllL(const wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir, float *noisevarlum, float * madL, float * vari, int edge);
void ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir, void ShrinkAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb, float * madL, float * madaab = nullptr, bool madCalculated = false); float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb, float * madL, float * madaab = nullptr, bool madCalculated = false);
void ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b, void ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b,
int W_ab, int H_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, int W_ab, int H_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc,

File diff suppressed because it is too large Load Diff

View File

@ -207,13 +207,13 @@ BENCHFUN
for (int k = 0; k < sharpenParam.deconviter; k++) { for (int k = 0; k < sharpenParam.deconviter; k++) {
if (!needdamp) { if (!needdamp) {
// apply gaussian blur and divide luminance by result of gaussian blur // apply gaussian blur and divide luminance by result of gaussian blur
gaussianBlur(tmpI, tmp, W, H, sigma, nullptr, GAUSS_DIV, luminance); gaussianBlur(tmpI, tmp, W, H, sigma, false, GAUSS_DIV, luminance);
} else { } else {
// apply gaussian blur + damping // apply gaussian blur + damping
gaussianBlur(tmpI, tmp, W, H, sigma); gaussianBlur(tmpI, tmp, W, H, sigma);
dcdamping(tmp, luminance, damping, W, H); dcdamping(tmp, luminance, damping, W, H);
} }
gaussianBlur(tmp, tmpI, W, H, sigma, nullptr, GAUSS_MULT); gaussianBlur(tmp, tmpI, W, H, sigma, false, GAUSS_MULT);
} // end for } // end for
#ifdef _OPENMP #ifdef _OPENMP

View File

@ -32,7 +32,6 @@
#include "improcfun.h" #include "improcfun.h"
#include "LUT.h" #include "LUT.h"
#include "array2D.h" #include "array2D.h"
#include "boxblur.h"
#include "rt_math.h" #include "rt_math.h"
#include "mytime.h" #include "mytime.h"
#include "sleef.c" #include "sleef.c"

View File

@ -1950,8 +1950,8 @@ void RawImageSource::retinexPrepareCurves(const RetinexParams &retinexParams, LU
CurveFactory::curveDehaContL (retinexcontlutili, retinexParams.cdcurve, cdcurve, 1, lhist16RETI, histLRETI); CurveFactory::curveDehaContL (retinexcontlutili, retinexParams.cdcurve, cdcurve, 1, lhist16RETI, histLRETI);
} }
CurveFactory::mapcurve (mapcontlutili, retinexParams.mapcurve, mapcurve, 1, lhist16RETI, histLRETI); CurveFactory::mapcurve(mapcontlutili, retinexParams.mapcurve, mapcurve, 1, lhist16RETI, histLRETI);
mapcurve *= 0.5f;
retinexParams.getCurves(retinextransmissionCurve, retinexgaintransmissionCurve); retinexParams.getCurves(retinextransmissionCurve, retinexgaintransmissionCurve);
} }

View File

@ -201,7 +201,7 @@ public:
} }
static void inverse33 (const double (*coeff)[3], double (*icoeff)[3]); static void inverse33 (const double (*coeff)[3], double (*icoeff)[3]);
void MSR(float** luminance, float **originalLuminance, float **exLuminance, LUTf & mapcurve, bool &mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax); void MSR(float** luminance, float **originalLuminance, float **exLuminance, const LUTf& mapcurve, bool mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax);
void HLRecovery_inpaint (float** red, float** green, float** blue) override; void HLRecovery_inpaint (float** red, float** green, float** blue) override;
static void HLRecovery_Luminance (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval); static void HLRecovery_Luminance (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval);
static void HLRecovery_CIELab (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval, double cam[3][3], double icam[3][3]); static void HLRecovery_CIELab (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval, double cam[3][3], double icam[3][3]);

View File

@ -21,6 +21,7 @@
#pragma once #pragma once
#include "array2D.h" #include "array2D.h"
#include "rt_math.h"
namespace rtengine { namespace rtengine {

View File

@ -85,21 +85,14 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int
if (!hq) { if (!hq) {
fillLuminance( img, map, lumi); fillLuminance( img, map, lumi);
float *buffer = nullptr; const bool useBoxBlur = radius > 40.0; // boxblur is less prone to artifacts for large radi
if(radius > 40.) {
// When we pass another buffer to gaussianBlur, it will use iterated boxblur which is less prone to artifacts
buffer = new float[W * H];
}
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel #pragma omp parallel if (!useBoxBlur)
#endif #endif
{ {
gaussianBlur (map, map, W, H, radius, buffer); gaussianBlur (map, map, W, H, radius, useBoxBlur);
} }
delete [] buffer;
} }
else { else {