Cleaned gauss code and included some speedups

This commit is contained in:
heckflosse
2016-01-18 23:56:02 +01:00
parent 33ea7156b8
commit a3c20daa46
12 changed files with 2117 additions and 2177 deletions

View File

@@ -65,40 +65,12 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#pragma omp parallel
#endif
{
gaussianBlur<float> (src->a, tmp1->a, src->W, src->H, radius);
gaussianBlur<float> (src->b, tmp1->b, src->W, src->H, radius);
gaussianBlur (src->a, tmp1->a, src->W, src->H, radius);
gaussianBlur (src->b, tmp1->b, src->W, src->H, radius);
}
float chromave = 0.0f;
#ifdef __SSE2__
if( chCurve ) {
// vectorized precalculation of the atan2 values
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ )
{
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(src->b[i][j]), LVFU(src->a[i][j])));
}
for(; j < width; j++) {
fringe[i * width + j] = xatan2f(src->b[i][j], src->a[i][j]);
}
}
}
}
#endif
#ifdef _OPENMP
#pragma omp parallel
#endif
@@ -109,6 +81,23 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#endif
for(int i = 0; i < height; i++ ) {
#ifdef __SSE2__
// vectorized per row precalculation of the atan2 values
if (chCurve) {
int k = 0;
for(; k < width - 3; k += 4) {
STVFU(fringe[i * width + k], xatan2f(LVFU(src->b[i][k]), LVFU(src->a[i][k])));
}
for(; k < width; k++) {
fringe[i * width + k] = xatan2f(src->b[i][k], src->a[i][k]);
}
}
#endif // __SSE2__
for(int j = 0; j < width; j++) {
if (chCurve) {
#ifdef __SSE2__
@@ -144,19 +133,21 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chromave );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#pragma omp for nowait
#endif
for(int j = 0; j < width * height - 3; j += 4) {
_mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv));
STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv));
}
}
for(int j = width * height - (width * height) % 4; j < width * height; j++) {
fringe[j] = 1.f / (fringe[j] + chromave);
#pragma omp single
for(int j = width * height - (width * height) % 4; j < width * height; j++) {
fringe[j] = 1.f / (fringe[j] + chromave);
}
}
#else
@@ -191,8 +182,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@@ -218,8 +207,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@@ -245,8 +232,6 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst,
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i * width + j] < threshfactor) {
float atot = 0.f;
float btot = 0.f;
@@ -355,7 +340,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@@ -366,8 +351,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
for (j = 0; j < width - 3; j += 4) {
sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j < width; j++) {
@@ -392,8 +377,8 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#pragma omp parallel
#endif
{
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
float chromave = 0.0f;
@@ -414,7 +399,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
for(int i = 0; i < height; i++ )
{
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j])));
STVFU(fringe[i * width + j], xatan2f(LVFU(srbb[i][j]), LVFU(sraa[i][j])));
}
for(; j < width; j++) {
@@ -470,14 +455,14 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chromave + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int j = 0; j < width * height - 3; j += 4) {
_mm_storeu_ps( &fringe[j], onev / (LVFU(fringe[j]) + sumv));
STVFU(fringe[j], onev / (LVFU(fringe[j]) + sumv));
}
}
@@ -599,7 +584,7 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
int j;
__m128 interav, interbv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif
#ifdef _OPENMP
#pragma omp for
@@ -609,11 +594,11 @@ SSEFUNCTION void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * ds
#ifdef __SSE2__
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
STVFU(dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
interav = LVFU(tmaa[i][j]);
interbv = LVFU(tmbb[i][j]);
_mm_storeu_ps(&dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
_mm_storeu_ps(&dst->C_p[i][j], _mm_sqrt_ps(SQRV(interbv) + SQRV(interav)));
STVFU(dst->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
STVFU(dst->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav)));
}
for(; j < width; j++) {
@@ -730,7 +715,7 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
__m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@@ -741,8 +726,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
for (j = 0; j < width - 3; j += 4) {
sincosvalv = xsincosf(piidv * LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
STVFU(sraa[i][j], LVFU(src->C_p[i][j])*sincosvalv.y);
STVFU(srbb[i][j], LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j < width; j++) {
@@ -769,12 +754,12 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
{
//chroma a and b
if(mode == 2) { //choice of gaussian blur
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
//luma sh_p
gaussianBlur<float> (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
gaussianBlur (src->sh_p, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
}
if(mode == 1) { //choice of median
@@ -859,8 +844,8 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
int j;
#ifdef __SSE2__
__m128 shfabsv, shmedv;
__m128 shthrv = _mm_set1_ps(shthr);
__m128 onev = _mm_set1_ps(1.0f);
__m128 shthrv = F2V(shthr);
__m128 onev = F2V(1.0f);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for private(shfabs, shmed,i1,j1)
@@ -883,14 +868,14 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
for (; j < width - 5; j += 4) {
shfabsv = vabsf(LVFU(src->sh_p[i][j]) - LVFU(tmL[i][j]));
shmedv = _mm_setzero_ps();
shmedv = ZEROV;
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
shmedv += vabsf(LVFU(src->sh_p[i1][j1]) - LVFU(tmL[i1][j1]));
}
_mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV));
}
for (; j < width - 2; j++) {
@@ -1082,15 +1067,15 @@ SSEFUNCTION void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, d
#endif
{
int j;
__m128 sumv = _mm_set1_ps( chrommed + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chrommed + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++) {
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
}
for(; j < width; j++) {
@@ -1341,7 +1326,7 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#ifdef __SSE2__
int j;
// vfloat2 sincosvalv;
// __m128 piidv = _mm_set1_ps(piid);
// __m128 piidv = F2V(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
@@ -1351,8 +1336,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#ifdef __SSE2__
for (j = 0; j < width - 3; j += 4) {
_mm_storeu_ps(&sraa[i][j], LVFU(src->a[i][j]));
_mm_storeu_ps(&srbb[i][j], LVFU(src->b[i][j]));
STVFU(sraa[i][j], LVFU(src->a[i][j]));
STVFU(srbb[i][j], LVFU(src->b[i][j]));
}
for (; j < width; j++) {
@@ -1377,12 +1362,12 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
{
//chroma a and b
if(mode >= 2) { //choice of gaussian blur
gaussianBlur<float> (sraa, tmaa, src->W, src->H, radius);
gaussianBlur<float> (srbb, tmbb, src->W, src->H, radius);
gaussianBlur (sraa, tmaa, src->W, src->H, radius);
gaussianBlur (srbb, tmbb, src->W, src->H, radius);
}
//luma sh_p
gaussianBlur<float> (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
gaussianBlur (src->L, tmL, src->W, src->H, 2.0);//low value to avoid artifacts
}
if(mode == 1) { //choice of median
@@ -1467,8 +1452,8 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
int j;
#ifdef __SSE2__
__m128 shfabsv, shmedv;
__m128 shthrv = _mm_set1_ps(shthr);
__m128 onev = _mm_set1_ps(1.0f);
__m128 shthrv = F2V(shthr);
__m128 onev = F2V(1.0f);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for private(shfabs, shmed,i1,j1)
@@ -1491,14 +1476,14 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
for (; j < width - 5; j += 4) {
shfabsv = vabsf(LVFU(src->L[i][j]) - LVFU(tmL[i][j]));
shmedv = _mm_setzero_ps();
shmedv = ZEROV;
for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
for (j1 = j - 2; j1 <= j + 2; j1++ ) {
shmedv += vabsf(LVFU(src->L[i1][j1]) - LVFU(tmL[i1][j1]));
}
_mm_storeu_ps( &badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
STVFU(badpix[i * width + j], vself(vmaskf_gt(shfabsv, (shmedv - shfabsv)*shthrv), onev, ZEROV));
}
for (; j < width - 2; j++) {
@@ -1690,15 +1675,15 @@ SSEFUNCTION void ImProcFunctions::BadpixelsLab(LabImage * src, LabImage * dst, d
#endif
{
int j;
__m128 sumv = _mm_set1_ps( chrommed + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
__m128 sumv = F2V( chrommed + eps2 );
__m128 onev = F2V( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++) {
for(j = 0; j < width - 3; j += 4) {
_mm_storeu_ps( &badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
STVFU(badpix[i * width + j], onev / (LVFU(badpix[i * width + j]) + sumv));
}
for(; j < width; j++) {