OpenMP optimizations for noise reduction (NR)

This commit is contained in:
michael
2012-07-04 18:49:09 -04:00
parent 952feb48b2
commit 856ecbab68
4 changed files with 149 additions and 13 deletions

View File

@@ -80,7 +80,7 @@ Temperature=5745
Green=1.0
[Impulse Denoising]
Enabled=true
Enabled=false
Threshold=50
[Defringing]

View File

@@ -2,6 +2,10 @@
#include "boxblur.h"
#include <cstdlib>
#ifdef _OPENMP
#include <omp.h>
#endif
//#define MAX(a,b) ((a)<(b)?(b):(a))
//#define MIN(a,b) ((a)>(b)?(b):(a))
@@ -62,9 +66,12 @@ float *EdgePreserveLab::CreateBlur(float *Source, float LScale, float abScale, f
float * var = new float[w*h];
rtengine::boxvar(g, var, 1, 1, w, h);
for(y = 0; y != h1; y++){
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(y = 0; y < h1; y++){
float *rg = &g[w*y];
for(x = 0; x != w1; x++){
for(x = 0; x < w1; x++){
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
/*float gx = (fabs((rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w])));
float gy = (fabs((rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1])));
@@ -103,6 +110,8 @@ float *EdgePreserveLab::CreateBlur(float *Source, float LScale, float abScale, f
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
//TODO: can this loop be parallelized with OpenMP?
for(i = y = 0; y != h; y++){
for(x = 0; x != w; x++, i++){
float ac;

View File

@@ -98,6 +98,9 @@ namespace rtengine {
const short int imheight=src->height, imwidth=src->width;
if (dnparams.luma==0 && dnparams.chroma==0) {//nothing to do; copy src to dst
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<imheight; i++) {
for (int j=0; j<imwidth; j++) {
dst->r[i][j] = src->r[i][j];
@@ -145,6 +148,9 @@ namespace rtengine {
const int border = MAX(2,TS/16);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<TS; i++) {
float i1 = abs((i>TS/2 ? i-TS+1 : i));
float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
@@ -164,6 +170,9 @@ namespace rtengine {
//output buffer
Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n=0; n<3*imwidth*imheight; n++) {
dsttmp->data[n] = 0;
}
@@ -196,7 +205,6 @@ namespace rtengine {
//now we have tile dimensions, overlaps
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
@@ -214,6 +222,7 @@ namespace rtengine {
//pixel weight
array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
// OMP candidate?
//fill tile from image; convert RGB to "luma/chroma"
for (int i=tiletop, i1=0; i<tilebottom; i++, i1++)
for (int j=tileleft, j1=0; j<tileright; j++, j1++) {
@@ -340,6 +349,9 @@ namespace rtengine {
}//now we have a padded data row
//now fill this row of the blocks with Lab high pass data
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int hblk=0; hblk<numblox_W; hblk++) {
int left = (hblk-blkrad)*offset;
int indx = (hblk)*TS;//index of block in malloc
@@ -361,6 +373,9 @@ namespace rtengine {
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// now process the vblk row of blocks for noise reduction
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int hblk=0; hblk<numblox_W; hblk++) {
RGBtile_denoise (fLblox, vblk, hblk, numblox_H, numblox_W, noisevar_Ldetail );
@@ -395,7 +410,9 @@ namespace rtengine {
fftwf_cleanup();
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<height; i++) {
for (int j=0; j<width; j++) {
//may want to include masking threshold for large hipass data to preserve edges/detail
@@ -427,6 +444,7 @@ namespace rtengine {
if (tileright<imwidth) Hmask[width-1-i] = mask;
}
//TODO: OMP candidate?
//convert back to RGB and write to destination array
for (int i=tiletop, i1=0; i<tilebottom; i++, i1++) {
float X,Y,Z;
@@ -460,7 +478,7 @@ namespace rtengine {
}//end of tile row
}//end of tile loop
//TODO: is memcpy multithreaded? If not, should this be replaced with an OpenMP-parallelized for loop?
//copy denoised image to output
memcpy (dst->data, dsttmp->data, 3*imwidth*imheight*sizeof(float));
@@ -483,6 +501,9 @@ namespace rtengine {
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n=0; n<TS*TS; n++) { //for DCT
fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
}//output neighbor averaged result
@@ -503,6 +524,9 @@ namespace rtengine {
const int numblox_W = ceil(((float)(width))/(offset));
const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
#ifdef _OPENMP
#pragma omp parallel for
#endif
//add row of tiles to output image
for (int hblk=0; hblk < numblox_W; hblk++) {
int left = (hblk-blkrad)*offset;
@@ -544,6 +568,9 @@ namespace rtengine {
for (int i=0; i<65536; i++) histo[i]=0;
//calculate histogram of absolute values of HH wavelet coeffs
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<datalen; i++) {
histo[MAX(0,MIN(65535,abs((int)DataList[i])))]++;
}
@@ -744,6 +771,9 @@ namespace rtengine {
{
int maxlvl = WaveletCoeffs_L.maxlevel();
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int lvl=0; lvl<maxlvl; lvl++) {
int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
@@ -780,7 +810,9 @@ namespace rtengine {
int max;
printf("\n level=%d \n",level);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int dir=1; dir<4; dir++) {
float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L));
float mada = SQR(MadMax(WavCoeffs_a[dir], max, W_ab*H_ab));
@@ -795,6 +827,10 @@ namespace rtengine {
if (noisevar_ab>0.01) {
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<H_ab; i++) {
for (int j=0; j<W_ab; j++) {
@@ -818,6 +854,10 @@ namespace rtengine {
boxblur(sfavea, sfavea, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
boxblur(sfaveb, sfaveb, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
//MK
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<H_ab; i++)
for (int j=0; j<W_ab; j++) {
@@ -840,6 +880,9 @@ namespace rtengine {
if (noisevar_L>0.01) {
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<W_L*H_L; i++) {
float mag = SQR(WavCoeffs_L[dir][i]);
@@ -850,6 +893,9 @@ namespace rtengine {
}
//OpenMP here
boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<W_L*H_L; i++) {

View File

@@ -48,6 +48,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
float* temp = buffer->data;
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
temp[row*H+col] = (float)src[row][col];
@@ -55,6 +58,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = (float)src[row][0]/len;
@@ -76,6 +82,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row][col] = temp[row*W+col];
@@ -83,6 +92,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0][col] = temp[0*W+col]/len;
@@ -127,6 +139,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = (float)src[row*W+0]/len;
@@ -148,6 +163,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col];
@@ -155,6 +173,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len;
@@ -282,6 +303,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
float* tempave = buffer2->data;
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
temp[row*H+col] = src[row*W+col];
@@ -289,6 +313,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = (float)src[row*W+0]/len;
@@ -311,12 +338,18 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
if (rady==0) {
for (int row=0; row<H; row++)
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col=0; col<H; col++) {
tempave[row*W+col] = temp[row*W+col];
}
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
tempave[0*W+col] = temp[0*W+col]/len;
@@ -342,6 +375,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
temp[row*H+col] = fabs(src[row*W+col]-tempave[row*W+col]);
@@ -349,6 +385,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = fabs(src[row*W+0]-tempave[row*W+0])/len;
@@ -371,6 +410,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col];
@@ -378,6 +420,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len;
@@ -416,6 +461,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
float* temp = buffer->data;
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
temp[row*H+col] = SQR(src[row*W+col]);
@@ -423,6 +471,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = SQR((float)src[row*W+0])/len;
@@ -444,6 +495,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col];
@@ -451,6 +505,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len;
@@ -488,6 +545,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
float* temp = buffer->data;
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) {
int rr = min(H-1,max(0,row+dy));
for (int col=0; col<H; col++) {
@@ -498,6 +558,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
int rr = min(H-1,max(0,row+dy));
@@ -527,6 +590,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col];
@@ -534,6 +600,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len;
@@ -572,6 +641,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
float* temp = buffer->data;
if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
temp[row*H+col] = fabs(src[row*W+col]);
@@ -579,6 +651,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
} else {
//horizontal blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) {
int len = radx + 1;
temp[row*W+0] = fabs((float)src[row*W+0])/len;
@@ -600,6 +675,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
}
if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++)
for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col];
@@ -607,6 +685,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
} else {
//vertical blur
//OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) {
int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len;