Fixed several OpenMP (OMP) bugs in denoise

See issue 1474.
2012-07-14 17:20:24 +02:00
parent e56b12022d
commit 64c51e5a4b
2 changed files with 57 additions and 237 deletions
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -97,17 +97,9 @@ namespace rtengine {
 		
 		const short int imheight=src->height, imwidth=src->width;
 		
-		if (dnparams.luma==0 && dnparams.chroma==0) {//nothing to do; copy src to dst
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-			for (int i=0; i<imheight; i++) {
-				for (int j=0; j<imwidth; j++) {
-					dst->r[i][j] = src->r[i][j];
-					dst->r[i][j] = src->r[i][j];
-					dst->r[i][j] = src->r[i][j];
-				}
-			}
+		if (dnparams.luma==0 && dnparams.chroma==0) {
+            //nothing to do; copy src to dst
+            memcpy(dst->data,src->data,dst->width*dst->height*3*sizeof(float));
 			return;
 		}
 		
@@ -170,12 +162,7 @@ namespace rtengine {
 		
 		//output buffer
 		Imagefloat * dsttmp = new Imagefloat(imwidth,imheight);
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-		for (int n=0; n<3*imwidth*imheight; n++) {
-			dsttmp->data[n] = 0;
-		}
+		for (int n=0; n<3*imwidth*imheight; n++) dsttmp->data[n] = 0;
 		
 		const int tilesize = 1024;
 		const int overlap = 128;
@@ -383,9 +370,6 @@ namespace rtengine {
 					
 					//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 					// now process the vblk row of blocks for noise reduction
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif	
 					for (int hblk=0; hblk<numblox_W; hblk++) {
 						
 						RGBtile_denoise (fLblox, vblk, hblk, numblox_H, numblox_W, noisevar_Ldetail );
@@ -420,10 +404,6 @@ namespace rtengine {
 				fftwf_cleanup();
 				
 				//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-//#ifdef _OPENMP
-//#pragma omp parallel for
-//#endif
-//TODO: implement using AlignedBufferMP
 				for (int i=0; i<height; i++) {
 					for (int j=0; j<width; j++) {
 						//may want to include masking threshold for large hipass data to preserve edges/detail
@@ -493,9 +473,8 @@ namespace rtengine {
 			}//end of tile row
 		}//end of tile loop
 		
-//TODO: is memcpy multithreaded - should this be replaced with the OMP-ed for loop?	
 	//copy denoised image to output
-	memcpy (dst->data, dsttmp->data, 3*imwidth*imheight*sizeof(float));
+	memcpy (dst->data, dsttmp->data, 3*dst->width*dst->height*sizeof(float));
 	
 	delete dsttmp;
 	
@@ -516,10 +495,6 @@ namespace rtengine {
 		
 		boxabsblur(fLblox+blkstart, nbrwt, 3, 3, TS, TS);//blur neighbor weights for more robust estimation	//for DCT

-//#ifdef _OPENMP
-//#pragma omp parallel for
-//#endif
-//TODO: implement using AlignedBufferMP
 		for (int n=0; n<TS*TS; n++) {		//for DCT
 			fLblox[blkstart+n] *= (1-expf(-SQR(nbrwt[n])/noisevar_Ldetail));
 		}//output neighbor averaged result
@@ -584,9 +559,6 @@ namespace rtengine {
 		for (int i=0; i<65536; i++) histo[i]=0;
 		
 		//calculate histogram of absolute values of HH wavelet coeffs
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif		
 		for (int i=0; i<datalen; i++) {
 			histo[MAX(0,MIN(65535,abs((int)DataList[i])))]++;
 		}
@@ -826,9 +798,7 @@ namespace rtengine {
 		int max;
 		
 		printf("\n level=%d  \n",level);
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif			
+	
 		for (int dir=1; dir<4; dir++) {
 			float madL = SQR(MadMax(WavCoeffs_L[dir], max, W_L*H_L));	
 			float mada = SQR(MadMax(WavCoeffs_a[dir], max, W_ab*H_ab));
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -40,29 +40,23 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady)
 	
-	AlignedBufferMP<float> buffer(W*H);
+	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
+	float* temp = buffer->data;
 	
 	if (radx==0) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = (float)src[row][col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//horizontal blur
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-
 			int len = radx + 1;
 			temp[row*W+0] = (float)src[row][0]/len;
 			for (int j=1; j<=radx; j++) {
@@ -79,7 +73,6 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
 				temp[row*W+col] = (temp[row*W+col-1]*len - src[row][col-radx-1])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
 	
@@ -87,24 +80,16 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row][col] = temp[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//vertical blur
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = rady + 1;
 			dst[0][col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -121,9 +106,11 @@ template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, i
 				dst[row][col] = (dst[(row-1)][col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
+	
+	delete buffer;
+	
 }

 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -135,29 +122,20 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBufferMP<float> buffer(W*H);
+	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
+	float* temp = buffer->data;
 		
 	if (radx==0) {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-		for (int row=0; row<H; row++){
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = src[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//horizontal blur
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = radx + 1;
 			temp[row*W+0] = (float)src[row*W+0]/len;
 			for (int j=1; j<=radx; j++) {
@@ -174,7 +152,6 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
 				temp[row*W+col] = (temp[row*W+col-1]*len - src[row*W+col-radx-1])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
 	
@@ -182,24 +159,16 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			 
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row*W+col] = temp[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//vertical blur
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -216,10 +185,11 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int
 				dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
 	
+	delete buffer;
+	
 }

 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -228,10 +198,12 @@ template<class T, class A> void boxblur (T* src, A* dst, int radx, int rady, int

 template<typename T> void boxvar (T* src, T* dst, int radx, int rady, int W, int H) {
 	
-	AlignedBufferMP<float> buffer1(W*H);
-	AlignedBufferMP<float> buffer2(W*H);
-	AlignedBufferMP<float> buffer3(W*H);
+	AlignedBuffer<float> buffer1(W*H);
+	AlignedBuffer<float> buffer2(W*H);
+	float* tempave = buffer1.data;
+	float* tempsqave = buffer2.data;
 	
+	AlignedBufferMP<float> buffer3(H);

 	//float image_ave = 0;
 	
@@ -241,15 +213,6 @@ template<typename T> void boxvar (T* src, T* dst, int radx, int rady, int W, int
 #pragma omp parallel for
 #endif
 	for (int row = 0; row < H; row++) {
-		AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-		T* tempave=(T*)pBuf1->data;
-		
-		AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-		T* tempsqave=(T*)pBuf2->data;
-		
-		AlignedBuffer<float>* pBuf3 = buffer3.acquire();
-		T* tempave2=(T*)pBuf3->data;
-			
 		int len = radx + 1;
 		tempave[row*W+0] = src[row*W+0]/len;
 		tempsqave[row*W+0] = SQR(src[row*W+0])/len;
@@ -271,9 +234,6 @@ template<typename T> void boxvar (T* src, T* dst, int radx, int rady, int W, int
 			tempsqave[row*W+col] = (tempsqave[row*W+col-1]*len - SQR(src[row*W+col-radx-1]))/(len-1);
 			len --;
 		}
-		buffer1.release(pBuf1);
-		buffer2.release(pBuf2);
-		buffer3.release(pBuf3);
 	}
 	
 	//vertical blur
@@ -281,12 +241,6 @@ template<typename T> void boxvar (T* src, T* dst, int radx, int rady, int W, int
 #pragma omp parallel for
 #endif
 	for (int col = 0; col < W; col++) {
-		AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-		T* tempave=(T*)pBuf1->data;
-		
-		AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-		T* tempsqave=(T*)pBuf2->data;
-		
 		AlignedBuffer<float>* pBuf3 = buffer3.acquire();
 		T* tempave2=(T*)pBuf3->data;
 			
@@ -316,8 +270,6 @@ template<typename T> void boxvar (T* src, T* dst, int radx, int rady, int W, int
 			dst[row*W+col] = fabs(dst[row*W+col] - SQR(tempave2[row]));
 			//image_ave += src[row*W+col];
 		}
-		buffer1.release(pBuf1);
-		buffer2.release(pBuf2);
 		buffer3.release(pBuf3);
 	}
 	
@@ -333,35 +285,26 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBufferMP<float> buffer1(W*H);
-	AlignedBufferMP<float> buffer2(W*H);	
+	AlignedBuffer<float>* buffer1 = new AlignedBuffer<float> (W*H);
+	float* temp = buffer1->data;
+	
+	AlignedBuffer<float>* buffer2 = new AlignedBuffer<float> (W*H);
+	float* tempave = buffer2->data;
 	
 	if (radx==0) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif	
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = src[row*W+col];
 			}
-			buffer1.release(pBuf1);
-		}
 	} else {
 		//horizontal blur
-//OpenMP here		
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
-			AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-			T* tempave=(T*)pBuf2->data;
-			
 			int len = radx + 1;
 			temp[row*W+0] = (float)src[row*W+0]/len;
 			for (int j=1; j<=radx; j++) {
@@ -378,8 +321,6 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 				temp[row*W+col] = (temp[row*W+col-1]*len - src[row*W+col-radx-1])/(len-1);
 				len --;
 			}
-			buffer1.release(pBuf1);
-			buffer2.release(pBuf2);
 		}
 	}
 	
@@ -388,31 +329,16 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 #pragma omp parallel for
 #endif
 		for (int row=0; row<H; row++){
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
-			AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-			T* tempave=(T*)pBuf2->data;
-			
 			for (int col=0; col<H; col++) {
 				tempave[row*W+col] = temp[row*W+col];
 			}
-			buffer1.release(pBuf1);
-			buffer2.release(pBuf2);
 		}
 	} else {
 		//vertical blur
-//OpenMP here		
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
-			AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-			T* tempave=(T*)pBuf2->data;
-			
 			int len = rady + 1;
 			tempave[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -429,8 +355,6 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 				tempave[row*W+col] = (tempave[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			buffer1.release(pBuf1);
-			buffer2.release(pBuf2);
 		}
 	}
 	
@@ -442,19 +366,10 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
-			AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-			T* tempave=(T*)pBuf2->data;
-			
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = fabs(src[row*W+col]-tempave[row*W+col]);
 			}
-			buffer1.release(pBuf1);
-			buffer2.release(pBuf2);
-		}
 	} else {
 		//horizontal blur
 //OpenMP here
@@ -462,12 +377,6 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 #pragma omp parallel for
 #endif		
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-		
-			AlignedBuffer<float>* pBuf2 = buffer2.acquire();
-			T* tempave=(T*)pBuf2->data;
-			
 			int len = radx + 1;
 			temp[row*W+0] = fabs(src[row*W+0]-tempave[row*W+0])/len;
 			for (int j=1; j<=radx; j++) {
@@ -485,8 +394,6 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 				temp[row*W+col] = (temp[row*W+col-1]*len - fabs(src[row*W+col-radx-1]-tempave[row*W+col-radx-1]))/(len-1);
 				len --;
 			}
-			buffer1.release(pBuf1);
-			buffer2.release(pBuf2);
 		}
 	}
 	
@@ -494,15 +401,10 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-			
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row*W+col] = temp[row*W+col];
 			}
-			buffer1.release(pBuf1);
-		}
 	} else {
 		//vertical blur
 //OpenMP here		
@@ -510,9 +412,6 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf1 = buffer1.acquire();
-			T* temp=(T*)pBuf1->data;
-			
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -529,10 +428,12 @@ template<typename T> void boxdev (T* src, T* dst, int radx, int rady, int W, int
 				dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			
-			buffer1.release(pBuf1);
 		}
 	}
+			
+	delete buffer1;
+	delete buffer2;
+
 }

 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -544,21 +445,17 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBufferMP<float> buffer(W*H);
+	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
+	float* temp = buffer->data;
 	
 	if (radx==0) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = SQR(src[row*W+col]);
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//horizontal blur
 //OpenMP here		
@@ -566,9 +463,6 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 #pragma omp parallel for
 #endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-		
 			int len = radx + 1;
 			temp[row*W+0] = SQR((float)src[row*W+0])/len;
 			for (int j=1; j<=radx; j++) {
@@ -585,8 +479,6 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 				temp[row*W+col] = (temp[row*W+col-1]*len - SQR(src[row*W+col-radx-1]))/(len-1);
 				len --;
 			}
-			
-			buffer.release(pBuf);
 		}
 	}
 	
@@ -594,15 +486,10 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-		
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row*W+col] = temp[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//vertical blur
 //OpenMP here		
@@ -610,9 +497,6 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -629,10 +513,11 @@ template<class T, class A> void boxsqblur (T* src, A* dst, int radx, int rady, i
 				dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			
-			buffer.release(pBuf);
 		}
 	}	
+			
+	delete buffer;
+	
 }

 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -644,33 +529,20 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBufferMP<float> buffer(W*H);
+	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
+	float* temp = buffer->data;
 	
 	if (radx==0) {
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
 		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int rr = min(H-1,max(0,row+dy));
 			for (int col=0; col<H; col++) {
 				int cc = min(W-1,max(0,col+dx));
 				temp[row*H+col] = dy>0 ? (src[row*W+col])*(src[rr*W+cc]) : 0;
 			}
-			buffer.release(pBuf);
 		}
 	} else {
 		//horizontal blur
-//OpenMP here		
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = radx + 1;
 			int rr = min(H-1,max(0,row+dy));
 			int cc = min(W-1,max(0,0+dx));
@@ -695,7 +567,6 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
 				temp[row*W+col] = (temp[row*W+col-1]*len - (src[row*W+col-radx-1])*(src[rr*W+cc1]))/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
 	
@@ -703,15 +574,10 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row*W+col] = temp[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//vertical blur
 //OpenMP here		
@@ -719,9 +585,6 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -738,9 +601,11 @@ template<class T, class A> void boxcorrelate (T* src, A* dst, int dx, int dy, in
 				dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
+	
+	delete buffer;
+	
 }


@@ -753,21 +618,17 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	//box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
 	
-	AlignedBufferMP<float> buffer(W*H);
+	AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W*H);
+	float* temp = buffer->data;
 	
 	if (radx==0) {
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			 
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				temp[row*H+col] = fabs(src[row*W+col]);
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//horizontal blur
 //OpenMP here		
@@ -775,9 +636,6 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 #pragma omp parallel for
 #endif
 		for (int row = 0; row < H; row++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = radx + 1;
 			temp[row*W+0] = fabs((float)src[row*W+0])/len;
 			for (int j=1; j<=radx; j++) {
@@ -794,7 +652,6 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 				temp[row*W+col] = (temp[row*W+col-1]*len - fabs(src[row*W+col-radx-1]))/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
 		}
 	}
 	
@@ -802,15 +659,10 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-		for (int row=0; row<H; row++)  {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
+		for (int row=0; row<H; row++) 
 			for (int col=0; col<H; col++) {
 				dst[row*W+col] = temp[row*W+col];
 			}
-			buffer.release(pBuf);
-		}
 	} else {
 		//vertical blur
 //OpenMP here		
@@ -818,9 +670,6 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 #pragma omp parallel for
 #endif
 		for (int col = 0; col < W; col++) {
-			AlignedBuffer<float>* pBuf = buffer.acquire();
-			T* temp=(T*)pBuf->data;
-			
 			int len = rady + 1;
 			dst[0*W+col] = temp[0*W+col]/len;
 			for (int i=1; i<=rady; i++) {
@@ -837,12 +686,13 @@ template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady,
 				dst[row*W+col] = (dst[(row-1)*W+col]*len - temp[(row-rady-1)*W+col])/(len-1);
 				len --;
 			}
-			buffer.release(pBuf);
-		}
 	}
 }

-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	delete buffer;
+	
+}
+
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 }