Speedup and reduced memory usage for Noise Reduction, Issue 2557 #132

2015-01-16 13:01:28 +01:00
parent 6af0133661
commit 09a07513f0
16 changed files with 952 additions and 962 deletions
--- a/rtengine/cplx_wavelet_level.h
+++ b/rtengine/cplx_wavelet_level.h
@@ -31,9 +31,6 @@ namespace rtengine {
 	class wavelet_level
 	{

-		// size of padded border
-		size_t m_pad;
-
 		// level of decomposition
 		int lvl;

@@ -49,19 +46,27 @@ namespace rtengine {

 		// load a row/column of input data, possibly with padding

-		void AnalysisFilterHaarVertical (T * srcbuffer, T * dstLo, T * dstHi, int pitch, int srclen, int row);
-		void AnalysisFilterHaarHorizontal (T * srcbuffer, T * dstLo, T * dstHi, int srclen, int row);
-		void SynthesisFilterHaarHorizontal (T * srcLo, T * srcHi, T * dst, int dstlen);
-		void SynthesisFilterHaarVertical (T * srcLo, T * srcHi, T * dst, int pitch, int dstlen);
+		void AnalysisFilterHaarVertical (const T * const srcbuffer, T * dstLo, T * dstHi, const int width, const int height, const int row);
+		void AnalysisFilterHaarHorizontal (const T * const srcbuffer, T * dstLo, T * dstHi, const int width, const int row);
+		void SynthesisFilterHaarHorizontal (const T * const srcLo, const T * const srcHi, T * dst, const int width, const int height);
+		void SynthesisFilterHaarVertical (const T * const srcLo, const T * const srcHi, T * dst, const int width, const int height);

 		void AnalysisFilterSubsampHorizontal (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen, int m_w2, int row);
+							 const int taps, const int offset, const int srcwidth, const int dstwidth, const int row);
+#ifdef __SSE2__
+		void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float (*filterLo)[4], float (*filterHi)[4],
+							 const int taps, const int offset, const int width, const int height, const int row);
+#else
 		void AnalysisFilterSubsampVertical (T * srcbuffer, T * dstLo, T * dstHi, float *filterLo, float *filterHi, 
-							 int taps, int offset, int pitch, int srclen, int row);
+							 int const taps, const int offset, const int width, const int height, const int row);
+#endif
 		void SynthesisFilterSubsampHorizontal (T * srcLo, T * srcHi, T * dst,
-							  float *filterLo, float *filterHi, int taps, int offset, int dstlen);
-		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, int taps, int offset, int pitch, int dstlen);
-
+							  float *filterLo, float *filterHi, const int taps, const int offset, const int scrwidth, const int dstwidth, const int height);
+#ifdef __SSE2__
+		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float (*filterLo)[4], float (*filterHi)[4], const int taps, const int offset, const int width, const int srcheight, const int dstheight);
+#else
+		void SynthesisFilterSubsampVertical (T * srcLo, T * srcHi, T * dst, float *filterLo, float *filterHi, const int taps, const int offset, const int width, const int srcheight, const int dstheight);
+#endif
 	public:

 		T ** wavcoeffs;
@@ -72,18 +77,20 @@ namespace rtengine {
 		size_t m_w2, m_h2;

 		template<typename E>
-		wavelet_level(E * src, E * dst, int level, int subsamp, int padding, size_t w, size_t h, float *filterV, float *filterH, int len, int offset)
-		: m_w(w), m_h(h), m_w2(w), m_h2(h), m_pad(padding), wavcoeffs(NULL), lvl(level), skip(1<<level), subsamp_out((subsamp>>level)&1)
+		wavelet_level(E * src, E * dst, int level, int subsamp, size_t w, size_t h, float *filterV, float *filterH, int len, int offset, int skipcrop)
+		: lvl(level), subsamp_out((subsamp>>level)&1), skip(1<<level), wavcoeffs(NULL), m_w(w), m_h(h), m_w2(w), m_h2(h)
 		{
 			if (subsamp) {
 				skip = 1;
 				for (int n=0; n<level; n++) {
 					skip *= 2-((subsamp>>n)&1);
-				}
+				}
+				skip /= skipcrop;
+				if(skip < 1) skip=1;
+
 			}
-			m_w2 = (subsamp_out ? ((w+1+2*skip*padding)/2) : (w+2*skip*padding));
-			m_h2 = (subsamp_out ? ((h+1+2*skip*padding)/2) : (h+2*skip*padding));
-			m_pad= skip*padding;
+			m_w2 = (subsamp_out ? (w+1)/2 : w);
+			m_h2 = (subsamp_out ? (h+1)/2 : h);
 			
 			wavcoeffs = create((m_w2)*(m_h2));
 			decompose_level(src, dst, filterV, filterH, len, offset);
@@ -115,11 +122,6 @@ namespace rtengine {
 			return m_h2;
 		}

-		size_t padding() const
-		{
-			return m_pad/skip;
-		}
-
 		size_t stride() const
 		{
 			return skip;
@@ -151,33 +153,33 @@ namespace rtengine {
 	}

 	template<typename T>
-	void wavelet_level<T>::AnalysisFilterHaarHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int srclen, int row) {
+	void wavelet_level<T>::AnalysisFilterHaarHorizontal (const T * const RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, const int width, const int row) {
 		/* Basic convolution code
 		 * Applies a Haar filter 
 		*/							
-			for(int i = 0; i < (srclen - skip); i++) {
-				dstLo[row*srclen+i] = (srcbuffer[i] + srcbuffer[i+skip]);
-				dstHi[row*srclen+i] = (srcbuffer[i] - srcbuffer[i+skip]);
+			for(int i = 0; i < (width - skip); i++) {
+				dstLo[row*width+i] = (srcbuffer[i] + srcbuffer[i+skip]);
+				dstHi[row*width+i] = (srcbuffer[i] - srcbuffer[i+skip]);
 			}
-			for(size_t i = max(srclen-skip,skip); i < (srclen); i++) {
-				dstLo[row*srclen+i] = (srcbuffer[i] + srcbuffer[i-skip]);
-				dstHi[row*srclen+i] = (srcbuffer[i] - srcbuffer[i-skip]);
+			for(size_t i = max(width-skip,skip); i < (width); i++) {
+				dstLo[row*width+i] = (srcbuffer[i] + srcbuffer[i-skip]);
+				dstHi[row*width+i] = (srcbuffer[i] - srcbuffer[i-skip]);
 			}
 	}

-	template<typename T> void wavelet_level<T>::AnalysisFilterHaarVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, int pitch, int srclen, int row) {
+	template<typename T> void wavelet_level<T>::AnalysisFilterHaarVertical (const T * const RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, const int width, const int height, const int row) {
 	/* Basic convolution code
 	 * Applies a Haar filter 
 	*/
-		if(row < (srclen - skip)) {
-			for(int j=0;j<pitch;j++) {
-				dstLo[j] = 0.25f*(srcbuffer[row*pitch+j] + srcbuffer[(row+skip)*pitch+j]);
-				dstHi[j] = 0.25f*(srcbuffer[row*pitch+j] - srcbuffer[(row+skip)*pitch+j]);
+		if(row < (height - skip)) {
+			for(int j=0;j<width;j++) {
+				dstLo[j] = 0.25f*(srcbuffer[row*width+j] + srcbuffer[(row+skip)*width+j]);
+				dstHi[j] = 0.25f*(srcbuffer[row*width+j] - srcbuffer[(row+skip)*width+j]);
 			}
-		} else if(row>=max(srclen-skip,skip)) {
-			for(int j=0;j<pitch;j++) {
-				dstLo[j] = 0.25f*(srcbuffer[row*pitch+j] + srcbuffer[(row-skip)*pitch+j]);
-				dstHi[j] = 0.25f*(srcbuffer[row*pitch+j] - srcbuffer[(row-skip)*pitch+j]);
+		} else if(row>=max(height-skip,skip)) {
+			for(int j=0;j<width;j++) {
+				dstLo[j] = 0.25f*(srcbuffer[row*width+j] + srcbuffer[(row-skip)*width+j]);
+				dstHi[j] = 0.25f*(srcbuffer[row*width+j] - srcbuffer[(row-skip)*width+j]);
 			}
 		}
 	}
@@ -185,43 +187,43 @@ namespace rtengine {
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 	// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
 	
-	template<typename T> void wavelet_level<T>::SynthesisFilterHaarHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int dstlen) {
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarHorizontal (const T * const RESTRICT srcLo, const T * const RESTRICT srcHi, T * RESTRICT dst, const int width, const int height) {

 		/* Basic convolution code
 		 * Applies a Haar filter 
 		 *
 		 */

-		for (int k=0; k<m_h2; k++) {
-			for(size_t i = (m_pad); i < (m_pad+skip); i++) {
-				dst[k*m_w+(i-m_pad)] = (srcLo[k*m_w2+i] + srcHi[k*m_w2+i]);			
+		for (int k=0; k<height; k++) {
+			for(size_t i = 0; i < skip; i++) {
+				dst[k*width+i] = (srcLo[k*width+i] + srcHi[k*width+i]);			
 			}
-			for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
-				dst[k*m_w+(i-m_pad)] = 0.5f*(srcLo[k*m_w2+i] + srcHi[k*m_w2+i] + srcLo[k*m_w2+i-skip] - srcHi[k*m_w2+i-skip]);			
+			for(size_t i = skip; i < width; i++) {
+				dst[k*width+i] = 0.5f*(srcLo[k*width+i] + srcHi[k*width+i] + srcLo[k*width+i-skip] - srcHi[k*width+i-skip]);			
 			}
 		}
 	}

-	template<typename T> void wavelet_level<T>::SynthesisFilterHaarVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, int pitch, int dstlen) {
+	template<typename T> void wavelet_level<T>::SynthesisFilterHaarVertical (const T * const RESTRICT srcLo, const T * const RESTRICT srcHi, T * RESTRICT dst, const int width, const int height) {

 		/* Basic convolution code
 		 * Applies a Haar filter 
 		 *
 		 */

-		for(size_t i = (m_pad); i < (m_pad+skip); i++) {
-			for(int j=0;j<pitch;j++)
-				dst[pitch*(i-m_pad)+j] = (srcLo[i*pitch+j] + srcHi[i*pitch+j]);			
+		for(size_t i = 0; i < skip; i++) {
+			for(int j=0;j<width;j++)
+				dst[width*i+j] = (srcLo[i*width+j] + srcHi[i*width+j]);			
 		}
-		for(size_t i = m_pad+skip; i < (dstlen+m_pad); i++) {
-			for(int j=0;j<pitch;j++)
-				dst[pitch*(i-m_pad)+j] = 0.5f*(srcLo[i*pitch+j] + srcHi[i*pitch+j] + srcLo[(i-skip)*pitch+j] - srcHi[(i-skip)*pitch+j]);			
+		for(size_t i = skip; i < height; i++) {
+			for(int j=0;j<width;j++)
+				dst[width*i+j] = 0.5f*(srcLo[i*width+j] + srcHi[i*width+j] + srcLo[(i-skip)*width+j] - srcHi[(i-skip)*width+j]);			
 		}
 	}

 	template<typename T>
-	void wavelet_level<T>::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *filterHi, 
-												int taps, int offset, int pitch, int srclen, int m_w2, int row) {
+	void wavelet_level<T>::AnalysisFilterSubsampHorizontal (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float *RESTRICT filterHi, 
+												const int taps, const int offset, const int srcwidth, const int dstwidth, const int row) {
 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
 		 * aligning the 'offset' element of the filter with
@@ -229,62 +231,125 @@ namespace rtengine {
 		 * Output is subsampled by two
 		 */
 		// calculate coefficients
-			for(int i = 0; i < srclen; i+=2) {
-				float lo = 0.f, hi = 0.f;
-				if (LIKELY(i>skip*taps && i<srclen-skip*taps)) {//bulk
-					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-						float src = srcbuffer[i-l];
-						lo += filterLo[j] * src;//lopass channel
-						hi += filterHi[j] * src;//hipass channel
-					}
-				} else {
-					for (int j=0; j<taps; j++) {
-						int arg = max(0,min(i+skip*(offset-j),srclen-1));//clamped BC's
-						lo += filterLo[j] * srcbuffer[arg];//lopass channel
-						hi += filterHi[j] * srcbuffer[arg];//hipass channel
-					}
-				}
-				dstLo[row*m_w2+((i/2))] = lo;
-				dstHi[row*m_w2+((i/2))] = hi;
-			}
-	}
-
-	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, 
-													int taps, int offset, int pitch, int srclen, int row) {
-
-			/* Basic convolution code
-			 * Applies an FIR filter 'filter' with filter length 'taps', 
-			 * aligning the 'offset' element of the filter with
-			 * the input pixel, and skipping 'skip' pixels between taps 
-			 * Output is subsampled by two
-			 */
-
-			// calculate coefficients
-			if (LIKELY(row>skip*taps && row<srclen-skip*taps)) {//bulk
-				for (int k=0; k<pitch; k++) {
-					float lo = 0.f, hi = 0.f;
-					for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
-						lo += filterLo[j] * srcbuffer[(row-l)*pitch+k];//lopass channel
-						hi += filterHi[j] * srcbuffer[(row-l)*pitch+k];//hipass channel
-					}
-					dstLo[k] = lo;
-					dstHi[k] = hi;
-				}
-			} else {//boundary
-				for (int k=0; k<pitch; k++) {
-					float lo = 0.f, hi = 0.f;
-					for (int j=0; j<taps; j++) {
-						int arg = max(0,min(row+skip*(offset-j),srclen-1))*pitch+k;//clamped BC's
-						lo += filterLo[j] * srcbuffer[arg];//lopass channel
-						hi += filterHi[j] * srcbuffer[arg];//hipass channel
-					}
-					dstLo[k] = lo;
-					dstHi[k] = hi;
+		for(int i = 0; i < srcwidth; i+=2) {
+			float lo = 0.f, hi = 0.f;
+			if (LIKELY(i>skip*taps && i<srcwidth-skip*taps)) {//bulk
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					float src = srcbuffer[i-l];
+					lo += filterLo[j] * src;//lopass channel
+					hi += filterHi[j] * src;//hipass channel
+				}
+			} else {
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(i+skip*(offset-j),srcwidth-1));//clamped BC's
+					lo += filterLo[j] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j] * srcbuffer[arg];//hipass channel
 				}
 			}
+			dstLo[row*dstwidth+((i/2))] = lo;
+			dstHi[row*dstwidth+((i/2))] = hi;
+		}
 	}

-	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int dstlen) {
+#ifdef __SSE2__
+	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float (* RESTRICT filterLo)[4], float (* RESTRICT filterHi)[4],
+													const int taps, const int offset, const int width, const int height, const int row) {
+
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		if (LIKELY(row>skip*taps && row<height-skip*taps)) {//bulk
+			int k;
+			for (k=0; k<width-3; k+=4) {
+				__m128 lov = _mm_setzero_ps();
+				__m128 hiv = _mm_setzero_ps();
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					__m128 srcv = LVFU(srcbuffer[(row-l)*width+k]);
+					lov += LVF(filterLo[j][0]) * srcv;//lopass channel
+					hiv += LVF(filterHi[j][0]) * srcv;//hipass channel
+				}
+				STVF(dstLo[k], lov);
+				STVF(dstHi[k], hiv);
+			}
+			for (; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					lo += filterLo[j][0] * srcbuffer[(row-l)*width+k];//lopass channel
+					hi += filterHi[j][0] * srcbuffer[(row-l)*width+k];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		} else {//boundary
+			int k;
+			for (k=0; k<width-3; k+=4) {
+				__m128 lov = _mm_setzero_ps();
+				__m128 hiv = _mm_setzero_ps();
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					__m128 srcv = LVFU(srcbuffer[arg]);
+					lov += LVF(filterLo[j][0]) * srcv;//lopass channel
+					hiv += LVF(filterHi[j][0]) * srcv;//hipass channel
+				}
+				STVF(dstLo[k], lov);
+				STVF(dstHi[k], hiv);
+			}
+			for (; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					lo += filterLo[j][0] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j][0] * srcbuffer[arg];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		}
+	}
+#else
+	template<typename T> void wavelet_level<T>::AnalysisFilterSubsampVertical (T * RESTRICT srcbuffer, T * RESTRICT dstLo, T * RESTRICT dstHi, float * RESTRICT filterLo, float * RESTRICT filterHi, 
+													const int taps, const int offset, const int width, const int height, const int row) {
+
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		if (LIKELY(row>skip*taps && row<height-skip*taps)) {//bulk
+			for (int k=0; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0, l=-skip*offset; j<taps; j++, l+=skip) {
+					lo += filterLo[j] * srcbuffer[(row-l)*width+k];//lopass channel
+					hi += filterHi[j] * srcbuffer[(row-l)*width+k];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		} else {//boundary
+			for (int k=0; k<width; k++) {
+				float lo = 0.f, hi = 0.f;
+				for (int j=0; j<taps; j++) {
+					int arg = max(0,min(row+skip*(offset-j),height-1))*width+k;//clamped BC's
+					lo += filterLo[j] * srcbuffer[arg];//lopass channel
+					hi += filterHi[j] * srcbuffer[arg];//hipass channel
+				}
+				dstLo[k] = lo;
+				dstHi[k] = hi;
+			}
+		}
+	}
+#endif
+
+
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampHorizontal (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, const int taps, const int offset, const int srcwidth, const int dstwidth, const int height) {

 		/* Basic convolution code
 		 * Applies an FIR filter 'filter' with filter length 'taps', 
@@ -294,78 +359,157 @@ namespace rtengine {
 		 */

 		// calculate coefficients
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
 		int shift = skip*(taps-offset-1);//align filter with data

-		for (int k=0; k<m_h2; k++) {
-			for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
+		for (int k=0; k<height; k++) {
+			int i;	
+			for(i=0; i<=min(skip*taps,dstwidth); i++) {
 				float tot=0.f;
 				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
 				int i_src = (i+shift)/2;
 				int begin = (i+shift)%2;
-				if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
-					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						tot += ((filterLo[j] * srcLo[k*m_w2+i_src-l] + filterHi[j] * srcHi[k*m_w2+i_src-l]));
-					}
-				} else {//boundary
-					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						int arg = max(0,min((i_src-l),srclen-1));//clamped BC's
-						tot += ((filterLo[j] * srcLo[k*m_w2+arg] + filterHi[j] * srcHi[k*m_w2+arg]));
-					}
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					int arg = max(0,min((i_src-l),srcwidth-1));//clamped BC's
+					tot += ((filterLo[j] * srcLo[k*srcwidth+arg] + filterHi[j] * srcHi[k*srcwidth+arg]));
 				}
-				dst[k*m_w+(i-m_pad)] = tot;
+				dst[k*dstwidth+i] = tot;
+			}
+			for(; i<min(dstwidth-skip*taps,dstwidth); i++) {
+				float tot=0.f;
+				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+				int i_src = (i+shift)/2;
+				int begin = (i+shift)%2;
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					tot += ((filterLo[j] * srcLo[k*srcwidth+i_src-l] + filterHi[j] * srcHi[k*srcwidth+i_src-l]));
+				}
+				dst[k*dstwidth+i] = tot;
+			}
+			for(; i < dstwidth; i++) {
+				float tot=0.f;
+				//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+				int i_src = (i+shift)/2;
+				int begin = (i+shift)%2;
+				for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+					int arg = max(0,min((i_src-l),srcwidth-1));//clamped BC's
+					tot += ((filterLo[j] * srcLo[k*srcwidth+arg] + filterHi[j] * srcHi[k*srcwidth+arg]));
+				}
+				dst[k*dstwidth+i] = tot;
 			}
 		}
 	}
+
+#ifdef __SSE2__
+	template<typename T> SSEFUNCTION void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float (* RESTRICT filterLo)[4], float (* RESTRICT filterHi)[4], const int taps, const int offset, const int width, const int srcheight, const int dstheight)
+	 {

-	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, int taps, int offset, int pitch, int dstlen) {
-
-			/* Basic convolution code
-			 * Applies an FIR filter 'filter' with filter length 'taps', 
-			 * aligning the 'offset' element of the filter with
-			 * the input pixel, and skipping 'skip' pixels between taps 
-			 * Output is subsampled by two
-			 */
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */

 		// calculate coefficients
-		int srclen = (dstlen==m_w ? m_w2 : m_h2);//length of row/col in src (coarser level)
 		int shift=skip*(taps-offset-1);//align filter with data
-
-		for(size_t i = m_pad; i < (dstlen+m_pad); i++) {
+		__m128 fourv = _mm_set1_ps(4.f);
+		for(size_t i = 0; i < dstheight; i++) {
 			int i_src = (i+shift)/2;
 			int begin = (i+shift)%2;
 			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
-			if (LIKELY(i>skip*taps && i<(srclen-skip*taps))) {//bulk
-				for (int k=0; k<pitch; k++) {
-					float tot = 0.f;
+			if (LIKELY(i>skip*taps && i<(dstheight-skip*taps))) {//bulk
+				int k;
+				for (k=0; k<width-3; k+=4) {
+					__m128 totv = _mm_setzero_ps();
 					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						tot += ((filterLo[j] * srcLo[(i_src-l)*pitch+k] + filterHi[j] * srcHi[(i_src-l)*pitch+k]));
+						totv += ((LVF(filterLo[j][0]) * LVFU(srcLo[(i_src-l)*width+k]) + LVF(filterHi[j][0]) * LVFU(srcHi[(i_src-l)*width+k])));
 					}
-					dst[pitch*(i-m_pad)+k] = 4.f * tot;
+					_mm_storeu_ps(&dst[width*i+k], fourv * totv);
 				}
-			} else {//boundary
-				for (int k=0; k<pitch; k++) {
+				for (; k<width; k++) {
 					float tot = 0.f;
 					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
-						int arg = max(0,min((i_src-l),srclen-1))*pitch+k;//clamped BC's
-						tot += ((filterLo[j] * srcLo[arg] + filterHi[j] * srcHi[arg]));
+						tot += ((filterLo[j][0] * srcLo[(i_src-l)*width+k] + filterHi[j][0] * srcHi[(i_src-l)*width+k]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			} else {//boundary
+				int k;
+				for (k=0; k<width-3; k+=4) {
+					__m128 totv = _mm_setzero_ps();
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						totv += ((LVF(filterLo[j][0]) * LVFU(srcLo[arg]) + LVF(filterHi[j][0]) * LVFU(srcHi[arg])));
 					}
-					dst[pitch*(i-m_pad)+k] = 4.f * tot;
+					_mm_storeu_ps(&dst[width*i+k], fourv * totv);
+				}
+				for (; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						tot += ((filterLo[j][0] * srcLo[arg] + filterHi[j][0] * srcHi[arg]));
+					}
+					dst[width*i+k] = 4.f * tot;
 				}
 			}
 		}
 	}
+#else
+	template<typename T> void wavelet_level<T>::SynthesisFilterSubsampVertical (T * RESTRICT srcLo, T * RESTRICT srcHi, T * RESTRICT dst, float * RESTRICT filterLo, float * RESTRICT filterHi, const int taps, const int offset, const int width, const int srcheight, const int dstheight)
+	 {

-	template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
+		/* Basic convolution code
+		 * Applies an FIR filter 'filter' with filter length 'taps', 
+		 * aligning the 'offset' element of the filter with
+		 * the input pixel, and skipping 'skip' pixels between taps 
+		 * Output is subsampled by two
+		 */
+
+		// calculate coefficients
+		int shift=skip*(taps-offset-1);//align filter with data
+
+		for(size_t i = 0; i < dstheight; i++) {
+			int i_src = (i+shift)/2;
+			int begin = (i+shift)%2;
+			//TODO: this is correct only if skip=1; otherwise, want to work with cosets of length 'skip'
+			if (LIKELY(i>skip*taps && i<(dstheight-skip*taps))) {//bulk
+				for (int k=0; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						tot += ((filterLo[j] * srcLo[(i_src-l)*width+k] + filterHi[j] * srcHi[(i_src-l)*width+k]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			} else {//boundary
+				for (int k=0; k<width; k++) {
+					float tot = 0.f;
+					for (int j=begin, l=0; j<taps; j+=2, l+=skip) {
+						int arg = max(0,min((i_src-l),srcheight-1))*width+k;//clamped BC's
+						tot += ((filterLo[j] * srcLo[arg] + filterHi[j] * srcHi[arg]));
+					}
+					dst[width*i+k] = 4.f * tot;
+				}
+			}
+		}
+	}
+#endif
+
+#ifdef __SSE2__
+	template<typename T> template<typename E> SSEFUNCTION void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 

 		T tmpLo[m_w] ALIGNED64;
 		T tmpHi[m_w] ALIGNED64;
 		/* filter along rows and columns */
 		if(subsamp_out) {
+			float filterVarray[2*taps][4] ALIGNED64;
+			for(int i=0;i<2*taps;i++) {
+				for(int j=0;j<4;j++) {
+					filterVarray[i][j] = filterV[i];
+				}
+			}
 			for(int row=0;row<m_h;row+=2) {
-				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w/*output_pitch*/, m_h/*srclen*/, row);
-				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2, row/2);
-				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_h2/*output_pitch*/, m_w/*srclen*/, m_w2, row/2);
+				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterVarray, filterVarray+taps, taps, offset, m_w, m_h, row);
+				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
 			}
 		} else {
 			for(int row=0;row<m_h;row++) {
@@ -375,21 +519,64 @@ namespace rtengine {
 			}
 		}
 	}
+#else
+	template<typename T> template<typename E> void wavelet_level<T>::decompose_level(E *src, E *dst, float *filterV, float *filterH, int taps, int offset) { 

+		T tmpLo[m_w] ALIGNED64;
+		T tmpHi[m_w] ALIGNED64;
+		/* filter along rows and columns */
+		if(subsamp_out) {
+			for(int row=0;row<m_h;row+=2) {
+				AnalysisFilterSubsampVertical (src, tmpLo, tmpHi, filterV, filterV+taps, taps, offset, m_w, m_h, row);
+				AnalysisFilterSubsampHorizontal (tmpLo, dst, wavcoeffs[1], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+				AnalysisFilterSubsampHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], filterH, filterH+taps, taps, offset, m_w, m_w2, row/2);
+			}
+		} else {
+			for(int row=0;row<m_h;row++) {
+				AnalysisFilterHaarVertical (src, tmpLo, tmpHi, m_w, m_h, row);
+				AnalysisFilterHaarHorizontal (tmpLo, dst, wavcoeffs[1], m_w, row);
+				AnalysisFilterHaarHorizontal (tmpHi, wavcoeffs[2], wavcoeffs[3], m_w, row);
+			}
+		}
+	}
+#endif
+
+#ifdef __SSE2__
+
+	template<typename T> template<typename E> SSEFUNCTION void wavelet_level<T>::reconstruct_level(E* tmpLo, E* tmpHi, E * src, E *dst, float *filterV, float *filterH, int taps, int offset) { 
+
+		/* filter along rows and columns */
+		if (subsamp_out) {
+			float filterVarray[2*taps][4] ALIGNED64;
+			for(int i=0;i<2*taps;i++) {
+				for(int j=0;j<4;j++) {
+					filterVarray[i][j] = filterV[i];
+				}
+			}
+			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterVarray, filterVarray+taps, taps, offset, m_w, m_h2, m_h);
+		} else {
+			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w, m_h2);
+			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w, m_h2);
+			SynthesisFilterHaarVertical (tmpLo, tmpHi, dst, m_w, m_h);
+		}
+	}
+#else
 	template<typename T> template<typename E> void wavelet_level<T>::reconstruct_level(E* tmpLo, E* tmpHi, E * src, E *dst, float *filterV, float *filterH, int taps, int offset) { 

 		/* filter along rows and columns */
 		if (subsamp_out) {
-			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
-			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w/*dstlen*/);
-			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterV, filterV+taps, taps, offset, m_w/*pitch*/, m_h/*dstlen*/);
+			SynthesisFilterSubsampHorizontal (src, wavcoeffs[1], tmpLo, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, filterH, filterH+taps, taps, offset, m_w2, m_w, m_h2);
+			SynthesisFilterSubsampVertical (tmpLo, tmpHi, dst, filterV, filterV+taps, taps, offset, m_w, m_h2, m_h);
 		} else {
-			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w);
-			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w);
+			SynthesisFilterHaarHorizontal (src, wavcoeffs[1], tmpLo, m_w, m_h2);
+			SynthesisFilterHaarHorizontal (wavcoeffs[2], wavcoeffs[3], tmpHi, m_w, m_h2);
 			SynthesisFilterHaarVertical (tmpLo, tmpHi, dst, m_w, m_h);
 		}
 	}
-	
+#endif
 };

 #endif