Speedup and reduced memory usage for lmmse demosaic, Issue 2665

This commit is contained in:
Ingo
2015-02-22 22:00:06 +01:00
parent 9dd4da2547
commit 4e8a326645
4 changed files with 382 additions and 213 deletions

View File

@@ -901,17 +901,25 @@ static INLINE vdouble xlog1p(vdouble a) {
// A pair of vfloat values, held in the members x and y.
typedef struct {
    vfloat x;
    vfloat y;
} vfloat2;
// vmlaf(x, y, z): multiply-add, returning x * y + z.
// When the FMA guard holds, the single intrinsic _mm_fmadd_ps is used; otherwise the
// result is composed from vmulf followed by vaddf (both defined outside this excerpt).
// NOTE(review): a fused multiply-add rounds once, while multiply-then-add rounds twice,
// so the two paths may differ in the last bit of the result.
// NOTE(review): this span appears to interleave the pre-change and post-change lines of
// the commit (the guard with and without WIN32, plus repeated definitions). Read literally
// as C, the first #if has no matching #endif within this excerpt - confirm against the
// actual header before compiling.
#if defined( __FMA__ ) && defined( __x86_64__ ) && defined(WIN32) // experimental
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); }
#if defined( __FMA__ ) && defined( __x86_64__ )
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x,y,z); }
#else
// Fallback without FMA: separate multiply and add.
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); }
static INLINE vfloat vmlaf(vfloat x, vfloat y, vfloat z) { return vaddf(vmulf(x, y), z); }
#endif
// Absolute value of f, computed on the raw bits: f is combined via vandnotm with the
// bit pattern of -0.0f (sign bit only), which clears the sign bit.
// Assumes vandnotm(a, b) yields (~a) & b and vcast_vf_f fills the vector with its
// scalar argument; both are defined outside this excerpt - TODO confirm.
static INLINE vfloat vabsf(vfloat f) {
    const vmask sign_bits = (vmask)vcast_vf_f(-0.0f);
    return (vfloat)vandnotm(sign_bits, (vmask)f);
}
// Negation of f, computed on the raw bits: f is combined via vxorm with the bit
// pattern of -0.0f (sign bit only), which flips the sign bit.
// Assumes vxorm is a bitwise XOR and vcast_vf_f fills the vector with its scalar
// argument; both are defined outside this excerpt - TODO confirm.
static INLINE vfloat vnegf(vfloat f) {
    const vmask sign_bits = (vmask)vcast_vf_f(-0.0f);
    return (vfloat)vxorm((vmask)f, sign_bits);
}
// Bitwise select between x and y under mask: the result is
// vorm(vandm(mask, x), vandnotm(mask, y)).
// Presumably this keeps the bits of x where mask is set and the bits of y elsewhere
// (assuming vandnotm(a, b) is (~a) & b; the helpers are defined outside this excerpt).
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    const vmask kept_from_x = vandm(mask, (vmask)x);
    const vmask kept_from_y = vandnotm(mask, (vmask)y);
    return (vfloat)vorm(kept_from_x, kept_from_y);
}
// vself(mask, x, y): select between x and y under mask.
#if defined( __SSE4_1__ ) && defined( __x86_64__ )
// SSE4.1 path: the whole select is a single blend instruction.
// NOTE(review): _mm_blendv_ps inspects only the most significant bit of each 32-bit
// mask lane, whereas the fallback below selects bit by bit. The two paths agree when
// every mask lane is all-ones or all-zeros - confirm that callers only pass such masks.
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    const vfloat lane_selector = (vfloat)mask;
    return _mm_blendv_ps(y, x, lane_selector);
}
#else
// SSE2 path: three instructions (and, andnot, or).
// Presumably keeps the bits of x where mask is set and the bits of y elsewhere
// (assuming vandnotm(a, b) is (~a) & b; the helpers are defined outside this excerpt).
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
    const vmask kept_from_x = vandm(mask, (vmask)x);
    const vmask kept_from_y = vandnotm(mask, (vmask)y);
    return (vfloat)vorm(kept_from_x, kept_from_y);
}
#endif
static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));