OpenMP (OMP) optimizations for noise reduction (NR)

This commit is contained in:
michael
2012-07-04 18:49:09 -04:00
parent 952feb48b2
commit 856ecbab68
4 changed files with 149 additions and 13 deletions

View File

@@ -80,7 +80,7 @@ Temperature=5745
Green=1.0 Green=1.0
[Impulse Denoising] [Impulse Denoising]
Enabled=true Enabled=false
Threshold=50 Threshold=50
[Defringing] [Defringing]

View File

@@ -2,6 +2,10 @@
#include "boxblur.h" #include "boxblur.h"
#include <cstdlib> #include <cstdlib>
#ifdef _OPENMP
#include <omp.h>
#endif
//#define MAX(a,b) ((a)<(b)?(b):(a)) //#define MAX(a,b) ((a)<(b)?(b):(a))
//#define MIN(a,b) ((a)>(b)?(b):(a)) //#define MIN(a,b) ((a)>(b)?(b):(a))
@@ -62,9 +66,12 @@ float *EdgePreserveLab::CreateBlur(float *Source, float LScale, float abScale, f
float * var = new float[w*h]; float * var = new float[w*h];
rtengine::boxvar(g, var, 1, 1, w, h); rtengine::boxvar(g, var, 1, 1, w, h);
for(y = 0; y != h1; y++){ #ifdef _OPENMP
#pragma omp parallel for
#endif
for(y = 0; y < h1; y++){
float *rg = &g[w*y]; float *rg = &g[w*y];
for(x = 0; x != w1; x++){ for(x = 0; x < w1; x++){
//Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient. //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
/*float gx = (fabs((rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]))); /*float gx = (fabs((rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w])));
float gy = (fabs((rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]))); float gy = (fabs((rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1])));
@@ -103,6 +110,8 @@ float *EdgePreserveLab::CreateBlur(float *Source, float LScale, float abScale, f
memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float)); memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
memset(a_w, 0, A->DiagonalLength(w)*sizeof(float)); memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float)); memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
//TODO: OMP here?
for(i = y = 0; y != h; y++){ for(i = y = 0; y != h; y++){
for(x = 0; x != w; x++, i++){ for(x = 0; x != w; x++, i++){
float ac; float ac;

View File

@@ -98,6 +98,9 @@ namespace rtengine {
const short int imheight=src->height, imwidth=src->width; const short int imheight=src->height, imwidth=src->width;
if (dnparams.luma==0 && dnparams.chroma==0) {//nothing to do; copy src to dst if (dnparams.luma==0 && dnparams.chroma==0) {//nothing to do; copy src to dst
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<imheight; i++) { for (int i=0; i<imheight; i++) {
for (int j=0; j<imwidth; j++) { for (int j=0; j<imwidth; j++) {
dst->r[i][j] = src->r[i][j]; dst->r[i][j] = src->r[i][j];
@@ -144,7 +147,10 @@ namespace rtengine {
array2D<float> tilemask_out(TS,TS); array2D<float> tilemask_out(TS,TS);
const int border = MAX(2,TS/16); const int border = MAX(2,TS/16);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<TS; i++) { for (int i=0; i<TS; i++) {
float i1 = abs((i>TS/2 ? i-TS+1 : i)); float i1 = abs((i>TS/2 ? i-TS+1 : i));
float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f); float vmask = (i1<border ? SQR(sin((M_PI*i1)/(2*border))) : 1.0f);
@@ -164,6 +170,9 @@ namespace rtengine {
//output buffer //output buffer
Imagefloat * dsttmp = new Imagefloat(imwidth,imheight); Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n=0; n<3*imwidth*imheight; n++) { for (int n=0; n<3*imwidth*imheight; n++) {
dsttmp->data[n] = 0; dsttmp->data[n] = 0;
} }
@@ -196,7 +205,6 @@ namespace rtengine {
//now we have tile dimensions, overlaps //now we have tile dimensions, overlaps
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) { for (int tiletop=0; tiletop<imheight; tiletop+=tileHskip) {
for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) { for (int tileleft=0; tileleft<imwidth; tileleft+=tileWskip) {
@@ -214,6 +222,7 @@ namespace rtengine {
//pixel weight //pixel weight
array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks array2D<float> totwt(width,height,ARRAY2D_CLEAR_DATA);//weight for combining DCT blocks
// OMP candidate?
//fill tile from image; convert RGB to "luma/chroma" //fill tile from image; convert RGB to "luma/chroma"
for (int i=tiletop, i1=0; i<tilebottom; i++, i1++) for (int i=tiletop, i1=0; i<tilebottom; i++, i1++)
for (int j=tileleft, j1=0; j<tileright; j++, j1++) { for (int j=tileleft, j1=0; j<tileright; j++, j1++) {
@@ -340,6 +349,9 @@ namespace rtengine {
}//now we have a padded data row }//now we have a padded data row
//now fill this row of the blocks with Lab high pass data //now fill this row of the blocks with Lab high pass data
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int hblk=0; hblk<numblox_W; hblk++) { for (int hblk=0; hblk<numblox_W; hblk++) {
int left = (hblk-blkrad)*offset; int left = (hblk-blkrad)*offset;
int indx = (hblk)*TS;//index of block in malloc int indx = (hblk)*TS;//index of block in malloc
@@ -361,6 +373,9 @@ namespace rtengine {
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// now process the vblk row of blocks for noise reduction // now process the vblk row of blocks for noise reduction
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int hblk=0; hblk<numblox_W; hblk++) { for (int hblk=0; hblk<numblox_W; hblk++) {
RGBtile_denoise (fLblox, vblk, hblk, numblox_H, numblox_W, noisevar_Ldetail ); RGBtile_denoise (fLblox, vblk, hblk, numblox_H, numblox_W, noisevar_Ldetail );
@@ -395,7 +410,9 @@ namespace rtengine {
fftwf_cleanup(); fftwf_cleanup();
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<height; i++) { for (int i=0; i<height; i++) {
for (int j=0; j<width; j++) { for (int j=0; j<width; j++) {
//may want to include masking threshold for large hipass data to preserve edges/detail //may want to include masking threshold for large hipass data to preserve edges/detail
@@ -427,6 +444,7 @@ namespace rtengine {
if (tileright<imwidth) Hmask[width-1-i] = mask; if (tileright<imwidth) Hmask[width-1-i] = mask;
} }
//TODO: OMP candidate?
//convert back to RGB and write to destination array //convert back to RGB and write to destination array
for (int i=tiletop, i1=0; i<tilebottom; i++, i1++) { for (int i=tiletop, i1=0; i<tilebottom; i++, i1++) {
float X,Y,Z; float X,Y,Z;
@@ -460,7 +478,7 @@ namespace rtengine {
}//end of tile row }//end of tile row
}//end of tile loop }//end of tile loop
//TODO: is memcpy multithreaded - should this be replaced with the OMP-ed for loop?
//copy denoised image to output //copy denoised image to output
memcpy (dst->data, dsttmp->data, 3*imwidth*imheight*sizeof(float)); memcpy (dst->data, dsttmp->data, 3*imwidth*imheight*sizeof(float));
@@ -482,7 +500,10 @@ namespace rtengine {
int blkstart = hblproc*TS*TS; int blkstart = hblproc*TS*TS;
boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation //for DCT
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int n=0; n<TS*TS; n++) { //for DCT for (int n=0; n<TS*TS; n++) { //for DCT
fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail)); fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
}//output neighbor averaged result }//output neighbor averaged result
@@ -502,7 +523,10 @@ namespace rtengine {
{ {
const int numblox_W = ceil(((float)(width))/(offset)); const int numblox_W = ceil(((float)(width))/(offset));
const float DCTnorm = 1.0f/(4*TS*TS); //for DCT const float DCTnorm = 1.0f/(4*TS*TS); //for DCT
#ifdef _OPENMP
#pragma omp parallel for
#endif
//add row of tiles to output image //add row of tiles to output image
for (int hblk=0; hblk < numblox_W; hblk++) { for (int hblk=0; hblk < numblox_W; hblk++) {
int left = (hblk-blkrad)*offset; int left = (hblk-blkrad)*offset;
@@ -544,6 +568,9 @@ namespace rtengine {
for (int i=0; i<65536; i++) histo[i]=0; for (int i=0; i<65536; i++) histo[i]=0;
//calculate histogram of absolute values of HH wavelet coeffs //calculate histogram of absolute values of HH wavelet coeffs
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<datalen; i++) { for (int i=0; i<datalen; i++) {
histo[MAX(0,MIN(65535,abs((int)DataList[i])))]++; histo[MAX(0,MIN(65535,abs((int)DataList[i])))]++;
} }
@@ -743,7 +770,10 @@ namespace rtengine {
wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab ) wavelet_decomposition &WaveletCoeffs_b, float noisevar_L, float noisevar_ab )
{ {
int maxlvl = WaveletCoeffs_L.maxlevel(); int maxlvl = WaveletCoeffs_L.maxlevel();
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int lvl=0; lvl<maxlvl; lvl++) { for (int lvl=0; lvl<maxlvl; lvl++) {
int Wlvl_L = WaveletCoeffs_L.level_W(lvl); int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
@@ -780,7 +810,9 @@ namespace rtengine {
int max; int max;
printf("\n level=%d \n",level); printf("\n level=%d \n",level);
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int dir=1; dir<4; dir++) { for (int dir=1; dir<4; dir++) {
float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L)); float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L));
float mada = SQR(MadMax(WavCoeffs_a[dir], max, W_ab*H_ab)); float mada = SQR(MadMax(WavCoeffs_a[dir], max, W_ab*H_ab));
@@ -794,7 +826,11 @@ namespace rtengine {
float mad_b = madb*noisevar_ab; float mad_b = madb*noisevar_ab;
if (noisevar_ab>0.01) { if (noisevar_ab>0.01) {
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<H_ab; i++) { for (int i=0; i<H_ab; i++) {
for (int j=0; j<W_ab; j++) { for (int j=0; j<W_ab; j++) {
@@ -818,6 +854,10 @@ namespace rtengine {
boxblur(sfavea, sfavea, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage boxblur(sfavea, sfavea, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
boxblur(sfaveb, sfaveb, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage boxblur(sfaveb, sfaveb, level+2, level+2, W_ab, H_ab);//increase smoothness by locally averaging shrinkage
//MK
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<H_ab; i++) for (int i=0; i<H_ab; i++)
for (int j=0; j<W_ab; j++) { for (int j=0; j<W_ab; j++) {
@@ -840,6 +880,9 @@ namespace rtengine {
if (noisevar_L>0.01) { if (noisevar_L>0.01) {
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<W_L*H_L; i++) { for (int i=0; i<W_L*H_L; i++) {
float mag = SQR(WavCoeffs_L[dir][i]); float mag = SQR(WavCoeffs_L[dir][i]);
@@ -850,6 +893,9 @@ namespace rtengine {
} }
//OpenMP here //OpenMP here
boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage boxblur(sfave, sfave, level+2, level+2, W_L, H_L);//increase smoothness by locally averaging shrinkage
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i=0; i<W_L*H_L; i++) { for (int i=0; i<W_L*H_L; i++) {

View File

@@ -48,6 +48,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
float* temp = buffer->data; float* temp = buffer->data;
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
temp[row*H+col] = (float)src[row][col]; temp[row*H+col] = (float)src[row][col];
@@ -55,6 +58,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = (float)src[row][0]/len; temp[row*W+0] = (float)src[row][0]/len;
@@ -76,6 +82,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row][col] = temp[row*W+col]; dst[row][col] = temp[row*W+col];
@@ -83,6 +92,9 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0][col] = temp[0*W+col]/len; dst[0][col] = temp[0*W+col]/len;
@@ -127,6 +139,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = (float)src[row*W+0]/len; temp[row*W+0] = (float)src[row*W+0]/len;
@@ -148,6 +163,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col]; dst[row*W+col] = temp[row*W+col];
@@ -155,6 +173,9 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len; dst[0*W+col] = temp[0*W+col]/len;
@@ -282,6 +303,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
float* tempave = buffer2->data; float* tempave = buffer2->data;
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
temp[row*H+col] = src[row*W+col]; temp[row*H+col] = src[row*W+col];
@@ -289,6 +313,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = (float)src[row*W+0]/len; temp[row*W+0] = (float)src[row*W+0]/len;
@@ -311,12 +338,18 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
if (rady==0) { if (rady==0) {
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
tempave[row*W+col] = temp[row*W+col]; tempave[row*W+col] = temp[row*W+col];
} }
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
tempave[0*W+col] = temp[0*W+col]/len; tempave[0*W+col] = temp[0*W+col]/len;
@@ -342,13 +375,19 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
temp[row*H+col] = fabs(src[row*W+col]-tempave[row*W+col]); temp[row*H+col] = fabs(src[row*W+col]-tempave[row*W+col]);
} }
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = fabs(src[row*W+0]-tempave[row*W+0])/len; temp[row*W+0] = fabs(src[row*W+0]-tempave[row*W+0])/len;
@@ -371,6 +410,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col]; dst[row*W+col] = temp[row*W+col];
@@ -378,6 +420,9 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len; dst[0*W+col] = temp[0*W+col]/len;
@@ -416,6 +461,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
float* temp = buffer->data; float* temp = buffer->data;
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
temp[row*H+col] = SQR(src[row*W+col]); temp[row*H+col] = SQR(src[row*W+col]);
@@ -423,6 +471,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = SQR((float)src[row*W+0])/len; temp[row*W+0] = SQR((float)src[row*W+0])/len;
@@ -444,6 +495,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col]; dst[row*W+col] = temp[row*W+col];
@@ -451,6 +505,9 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len; dst[0*W+col] = temp[0*W+col]/len;
@@ -488,6 +545,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
float* temp = buffer->data; float* temp = buffer->data;
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) { for (int row=0; row<H; row++) {
int rr = min(H-1,max(0,row+dy)); int rr = min(H-1,max(0,row+dy));
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
@@ -498,6 +558,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
int rr = min(H-1,max(0,row+dy)); int rr = min(H-1,max(0,row+dy));
@@ -527,6 +590,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col]; dst[row*W+col] = temp[row*W+col];
@@ -534,6 +600,9 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len; dst[0*W+col] = temp[0*W+col]/len;
@@ -572,6 +641,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
float* temp = buffer->data; float* temp = buffer->data;
if (radx==0) { if (radx==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
temp[row*H+col] = fabs(src[row*W+col]); temp[row*H+col] = fabs(src[row*W+col]);
@@ -579,6 +651,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
} else { } else {
//horizontal blur //horizontal blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row = 0; row < H; row++) { for (int row = 0; row < H; row++) {
int len = radx + 1; int len = radx + 1;
temp[row*W+0] = fabs((float)src[row*W+0])/len; temp[row*W+0] = fabs((float)src[row*W+0])/len;
@@ -600,6 +675,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
} }
if (rady==0) { if (rady==0) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int row=0; row<H; row++) for (int row=0; row<H; row++)
for (int col=0; col<H; col++) { for (int col=0; col<H; col++) {
dst[row*W+col] = temp[row*W+col]; dst[row*W+col] = temp[row*W+col];
@@ -607,6 +685,9 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
} else { } else {
//vertical blur //vertical blur
//OpenMP here //OpenMP here
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int col = 0; col < W; col++) { for (int col = 0; col < W; col++) {
int len = rady + 1; int len = rady + 1;
dst[0*W+col] = temp[0*W+col]/len; dst[0*W+col] = temp[0*W+col]/len;