Files
rawTherapee/rtengine/PF_correct_RT.cc
Hombre 8b2eac9a3d Pipette and "On Preview Widgets" branch. See issue 227
The pipette part is already working quite nice but need to be finished. The widgets part needs more work...
2014-01-21 23:37:36 +01:00

1076 lines
29 KiB
C++

////////////////////////////////////////////////////////////////
//
// Chromatic Aberration Auto-correction
//
// copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
//
//
// code dated: November 24, 2010
// optimized: September 2013, Ingo Weyrich
//
// PF_correct_RT.cc is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#include "gauss.h"
#include "improcfun.h"
#include "sleef.c"
#include "mytime.h"
#include "../rtgui/myflatcurve.h"
#include "rt_math.h"
#ifdef __SSE2__
#include "sleefsseavx.c"
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
using namespace std;
namespace rtengine {
extern const Settings* settings;
#if defined( __SSE2__ ) && defined( WIN32 )
__attribute__((force_align_arg_pointer)) void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radius, int thresh)
#else
void ImProcFunctions::PF_correct_RT(LabImage * src, LabImage * dst, double radius, int thresh)
#endif
{
const int halfwin = ceil(2*radius)+1;
FlatCurve* chCurve = NULL;
if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
chCurve = new FlatCurve(params->defringe.huecurve);
// local variables
const int width=src->W, height=src->H;
//temporary array to store chromaticity
float (*fringe);
fringe = (float (*)) malloc (height * width * sizeof(*fringe));
LabImage * tmp1;
tmp1 = new LabImage(width, height);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
AlignedBufferMP<double> buffer(max(src->W,src->H));
gaussHorizontal<float> (src->a, tmp1->a, buffer, src->W, src->H, radius);
gaussHorizontal<float> (src->b, tmp1->b, buffer, src->W, src->H, radius);
gaussVertical<float> (tmp1->a, tmp1->a, buffer, src->W, src->H, radius);
gaussVertical<float> (tmp1->b, tmp1->b, buffer, src->W, src->H, radius);
}
float chromave=0.0f;
#ifdef __SSE2__
if( chCurve ) {
// vectorized precalculation of the atan2 values
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ ) {
for(j = 0; j < width-3; j+=4)
_mm_storeu_ps(&fringe[i*width+j], xatan2f(LVFU(src->b[i][j]),LVFU(src->a[i][j])));
for(; j < width; j++)
fringe[i*width+j]=xatan2f(src->b[i][j],src->a[i][j]);
}
}
}
#endif
#ifdef _OPENMP
#pragma omp parallel
#endif
{
float chromaChfactor = 1.0f;
#ifdef _OPENMP
#pragma omp for reduction(+:chromave)
#endif
for(int i = 0; i < height; i++ ) {
for(int j = 0; j < width; j++) {
if (chCurve) {
#ifdef __SSE2__
// use the precalculated atan values
float HH=fringe[i*width+j];
#else
// no precalculated values without SSE => calculate
float HH=xatan2f(src->b[i][j],src->a[i][j]);
#endif
double hr;
float chparam = float((chCurve->getVal((hr=Color::huelab_to_huehsv2(HH)))-0.5f) * 2.0f);//get C=f(H)
if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
chromaChfactor=1.0f+chparam;
}
float chroma = SQR(chromaChfactor*(src->a[i][j]-tmp1->a[i][j]))+SQR(chromaChfactor*(src->b[i][j]-tmp1->b[i][j]));//modulate chroma function hue
chromave += chroma;
fringe[i*width+j]=chroma;
}
}
}
chromave /= (height*width);
float threshfactor = SQR(thresh/33.f)*chromave*5.0f;
// now chromave is calculated, so we postprocess fringe to reduce the number of divisions in future
#ifdef __SSE2__
#ifdef _OPENMP
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave );
__m128 onev = _mm_set1_ps( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int j=0; j < width*height-3; j+=4)
_mm_storeu_ps( &fringe[j], onev/(LVFU(fringe[j])+sumv));
}
for(int j=width*height - (width*height)%4; j < width*height; j++)
fringe[j] = 1.f/(fringe[j]+chromave);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int j = 0; j < width*height; j++)
fringe[j] = 1.f/(fringe[j]+chromave);
#endif
// because we changed the values of fringe we also have to recalculate threshfactor
threshfactor = 1.0f/(threshfactor + chromave);
// Issue 1674:
// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
// choice for the chunk_size than 16
// Issue 1972: Split this loop in three parts to avoid most of the min and max-operations
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16)
#endif
for(int i = 0; i < height; i++ ) {
int j;
for(j = 0; j < halfwin-1; j++) {
tmp1->a[i][j] = src->a[i][j];
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=0; j1<j+halfwin; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*src->a[i1][j1];
btot += wt*src->b[i1][j1];
norm += wt;
}
tmp1->a[i][j] = atot/norm;
tmp1->b[i][j] = btot/norm;
}
}
for(; j < width-halfwin+1; j++) {
tmp1->a[i][j] = src->a[i][j];
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*src->a[i1][j1];
btot += wt*src->b[i1][j1];
norm += wt;
}
tmp1->a[i][j] = atot/norm;
tmp1->b[i][j] = btot/norm;
}
}
for(; j < width; j++) {
tmp1->a[i][j] = src->a[i][j];
tmp1->b[i][j] = src->b[i][j];
//test for pixel darker than some fraction of neighborhood ave, near an edge, more saturated than average
/*if (100*tmp1->L[i][j]>50*src->L[i][j] && \*/
/*1000*abs(tmp1->L[i][j]-src->L[i][j])>thresh*(tmp1->L[i][j]+src->L[i][j]) && \*/
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<width; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*src->a[i1][j1];
btot += wt*src->b[i1][j1];
norm += wt;
}
tmp1->a[i][j] = atot/norm;
tmp1->b[i][j] = btot/norm;
}
}
}//end of ab channel averaging
if(src != dst)
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < height; i++ ) {
for(int j = 0; j < width; j++) {
dst->L[i][j] = src->L[i][j];
}
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < height; i++ ) {
for(int j = 0; j < width; j++) {
dst->a[i][j] = tmp1->a[i][j];
dst->b[i][j] = tmp1->b[i][j];
}
}
delete tmp1;
if(chCurve) delete chCurve;
free(fringe);
}
#if defined( __SSE2__ ) && defined( WIN32 )
__attribute__((force_align_arg_pointer)) void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * dst, double radius, int thresh)
#else
void ImProcFunctions::PF_correct_RTcam(CieImage * src, CieImage * dst, double radius, int thresh)
#endif
{
const int halfwin = ceil(2*radius)+1;
FlatCurve* chCurve = NULL;
if (params->defringe.huecurve.size() && FlatCurveType(params->defringe.huecurve.at(0)) > FCT_Linear)
chCurve = new FlatCurve(params->defringe.huecurve);
// local variables
const int width=src->W, height=src->H;
const float piid=3.14159265f/180.f;
const float eps2=0.01f;
//temporary array to store chromaticity
float (*fringe);
fringe = (float (*)) malloc (height * width * sizeof(*fringe));
float** sraa;
sraa = new float*[height];
for (int i=0; i<height; i++)
sraa[i] = new float[width];
float** srbb;
srbb = new float*[height];
for (int i=0; i<height; i++)
srbb[i] = new float[width];
float** tmaa;
tmaa = new float*[height];
for (int i=0; i<height; i++)
tmaa[i] = new float[width];
float** tmbb;
tmbb = new float*[height];
for (int i=0; i<height; i++)
tmbb[i] = new float[width];
#ifdef _OPENMP
#pragma omp parallel
#endif
{
float2 sincosval;
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
#endif
for (int i=0; i<height; i++) {
#ifdef __SSE2__
for (j=0; j<width-3; j+=4) {
sincosvalv = xsincosf(piidv*LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j],LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j],LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j<width; j++) {
sincosval = xsincosf(piid*src->h_p[i][j]);
sraa[i][j]=src->C_p[i][j]*sincosval.y;
srbb[i][j]=src->C_p[i][j]*sincosval.x;
}
#else
for (int j=0; j<width; j++) {
sincosval = xsincosf(piid*src->h_p[i][j]);
sraa[i][j]=src->C_p[i][j]*sincosval.y;
srbb[i][j]=src->C_p[i][j]*sincosval.x;
}
#endif
}
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
AlignedBufferMP<double> buffer(max(src->W,src->H));
gaussHorizontal<float> (sraa, tmaa, buffer, src->W, src->H, radius);
gaussHorizontal<float> (srbb, tmbb, buffer, src->W, src->H, radius);
gaussVertical<float> (tmaa, tmaa, buffer, src->W, src->H, radius);
gaussVertical<float> (tmbb, tmbb, buffer, src->W, src->H, radius);
}
float chromave=0.0f;
#ifdef __SSE2__
if( chCurve ) {
// vectorized precalculation of the atan2 values
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ ) {
for(j = 0; j < width-3; j+=4)
_mm_storeu_ps(&fringe[i*width+j], xatan2f(LVFU(srbb[i][j]),LVFU(sraa[i][j])));
for(; j < width; j++)
fringe[i*width+j]=xatan2f(srbb[i][j],sraa[i][j]);
}
}
}
#endif
#ifdef _OPENMP
#pragma omp parallel
#endif
{
float chromaChfactor = 1.0f;
#ifdef _OPENMP
#pragma omp for reduction(+:chromave)
#endif
for(int i = 0; i < height; i++ ) {
for(int j = 0; j < width; j++) {
if (chCurve) {
#ifdef __SSE2__
// use the precalculated atan values
float HH=fringe[i*width+j];
#else
// no precalculated values without SSE => calculate
float HH=xatan2f(srbb[i][j],sraa[i][j]);
#endif
double hr;
float chparam = float((chCurve->getVal((hr=Color::huelab_to_huehsv2(HH)))-0.5f) * 2.0f);//get C=f(H)
if(chparam > 0.f) chparam /=2.f; // reduced action if chparam > 0
chromaChfactor=1.0f+chparam;
}
float chroma = SQR(chromaChfactor*(sraa[i][j]-tmaa[i][j]))+SQR(chromaChfactor*(srbb[i][j]-tmbb[i][j]));//modulate chroma function hue
chromave += chroma;
fringe[i*width+j]=chroma;
}
}
}
chromave /= (height*width);
float threshfactor = SQR(thresh/33.f)*chromave*5.0f; // Calculated once to eliminate mult inside the next loop
// now chromave is calculated, so we postprocess fringe to reduce the number of divisions in future
#ifdef __SSE2__
#ifdef _OPENMP
#pragma omp parallel
#endif
{
__m128 sumv = _mm_set1_ps( chromave + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int j=0; j < width*height-3; j+=4)
_mm_storeu_ps( &fringe[j], onev/(LVFU(fringe[j])+sumv));
}
for(int j=width*height - (width*height)%4; j < width*height; j++)
fringe[j] = 1.f/(fringe[j]+chromave+eps2);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int j = 0; j < width*height; j++)
fringe[j] = 1.f/(fringe[j]+chromave+eps2);
#endif
// because we changed the values of fringe we also have to recalculate threshfactor
threshfactor = 1.0f/(threshfactor + chromave + eps2);
// Issue 1674:
// often, CA isn't evenly distributed, e.g. a lot in contrasty regions and none in the sky.
// so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
// Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
// choice for the chunk_size than 16
// Issue 1972: Split this loop in three parts to avoid most of the min and max-operations
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16)
#endif
for(int i = 0; i < height; i++ ) {
int j;
for(j = 0; j < halfwin-1; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=0; j1<j+halfwin; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
for(; j < width-halfwin+1; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
for(; j < width; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (fringe[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<width; j1++) {
//neighborhood average of pixels weighted by chrominance
wt = fringe[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
} //end of ab channel averaging
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef __SSE2__
int j;
__m128 interav, interbv;
__m128 piidv = _mm_set1_ps(piid);
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ ) {
#ifdef __SSE2__
for(j = 0; j < width-3; j+=4) {
_mm_storeu_ps( &dst->sh_p[i][j], LVFU(src->sh_p[i][j]));
interav = LVFU(tmaa[i][j]);
interbv = LVFU(tmbb[i][j]);
_mm_storeu_ps(&dst->h_p[i][j],(xatan2f(interbv,interav))/piidv);
_mm_storeu_ps(&dst->C_p[i][j],_mm_sqrt_ps(SQRV(interbv)+SQRV(interav)));
}
for(; j < width; j++) {
dst->sh_p[i][j] = src->sh_p[i][j];
float intera = tmaa[i][j];
float interb = tmbb[i][j];
dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
}
#else
for(int j = 0; j < width; j++) {
dst->sh_p[i][j] = src->sh_p[i][j];
float intera = tmaa[i][j];
float interb = tmbb[i][j];
dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
}
#endif
}
}
for (int i=0; i<height; i++)
delete [] sraa[i];
delete [] sraa;
for (int i=0; i<height; i++)
delete [] srbb[i];
delete [] srbb;
for (int i=0; i<height; i++)
delete [] tmaa[i];
delete [] tmaa;
for (int i=0; i<height; i++)
delete [] tmbb[i];
delete [] tmbb;
if(chCurve) delete chCurve;
free(fringe);
}
#define PIX_SORT(a,b) { if ((a)>(b)) {temp=(a);(a)=(b);(b)=temp;} }
#define med3(a0,a1,a2,a3,a4,a5,a6,a7,a8,median) { \
pp[0]=a0; pp[1]=a1; pp[2]=a2; pp[3]=a3; pp[4]=a4; pp[5]=a5; pp[6]=a6; pp[7]=a7; pp[8]=a8; \
PIX_SORT(pp[1],pp[2]); PIX_SORT(pp[4],pp[5]); PIX_SORT(pp[7],pp[8]); \
PIX_SORT(pp[0],pp[1]); PIX_SORT(pp[3],pp[4]); PIX_SORT(pp[6],pp[7]); \
PIX_SORT(pp[1],pp[2]); PIX_SORT(pp[4],pp[5]); PIX_SORT(pp[7],pp[8]); \
PIX_SORT(pp[0],pp[3]); PIX_SORT(pp[5],pp[8]); PIX_SORT(pp[4],pp[7]); \
PIX_SORT(pp[3],pp[6]); PIX_SORT(pp[1],pp[4]); PIX_SORT(pp[2],pp[5]); \
PIX_SORT(pp[4],pp[7]); PIX_SORT(pp[4],pp[2]); PIX_SORT(pp[6],pp[4]); \
PIX_SORT(pp[4],pp[2]); median=pp[4];} //pp4 = median
#if defined( __SSE2__ ) && defined( WIN32 )
__attribute__((force_align_arg_pointer)) void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, double radius, int thresh, int mode)
#else
void ImProcFunctions::Badpixelscam(CieImage * src, CieImage * dst, double radius, int thresh, int mode)
#endif
{
const int halfwin = ceil(2*radius)+1;
MyTime t1,t2;
t1.set();
const int width=src->W, height=src->H;
const float piid=3.14159265f/180.f;
float shfabs, shmed;
int i1, j1, tot;
const float eps = 1.0f;
const float eps2 = 0.01f;
float shsum, dirsh, norm, sum;
float** sraa;
sraa = new float*[height];
for (int i=0; i<height; i++)
sraa[i] = new float[width];
float** srbb;
srbb = new float*[height];
for (int i=0; i<height; i++)
srbb[i] = new float[width];
float** tmaa;
tmaa = new float*[height];
for (int i=0; i<height; i++)
tmaa[i] = new float[width];
float** tmbb;
tmbb = new float*[height];
for (int i=0; i<height; i++)
tmbb[i] = new float[width];
float* badpix = (float*)malloc(width*height*sizeof(float));
float** tmL;
tmL = new float*[height];
for (int i=0; i<height; i++) {
tmL[i] = new float[width];
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
float2 sincosval;
#ifdef __SSE2__
int j;
vfloat2 sincosvalv;
__m128 piidv = _mm_set1_ps(piid);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for
#endif
for (int i=0; i<height; i++) {
#ifdef __SSE2__
for (j=0; j<width-3; j+=4) {
sincosvalv = xsincosf(piidv*LVFU(src->h_p[i][j]));
_mm_storeu_ps(&sraa[i][j],LVFU(src->C_p[i][j])*sincosvalv.y);
_mm_storeu_ps(&srbb[i][j],LVFU(src->C_p[i][j])*sincosvalv.x);
}
for (; j<width; j++) {
sincosval = xsincosf(piid*src->h_p[i][j]);
sraa[i][j]=src->C_p[i][j]*sincosval.y;
srbb[i][j]=src->C_p[i][j]*sincosval.x;
}
#else
for (int j=0; j<width; j++) {
sincosval = xsincosf(piid*src->h_p[i][j]);
sraa[i][j]=src->C_p[i][j]*sincosval.y;
srbb[i][j]=src->C_p[i][j]*sincosval.x;
}
#endif
}
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
AlignedBufferMP<double> buffer(max(src->W,src->H));
//chroma a and b
if(mode==2) {//choice of gaussian blur
gaussHorizontal<float> (sraa, tmaa, buffer, src->W, src->H, radius);
gaussHorizontal<float> (srbb, tmbb, buffer, src->W, src->H, radius);
gaussVertical<float> (tmaa, tmaa, buffer, src->W, src->H, radius);
gaussVertical<float> (tmbb, tmbb, buffer, src->W, src->H, radius);
}
//luma sh_p
gaussHorizontal<float> (src->sh_p, tmL, buffer, src->W, src->H, 2.0);//low value to avoid artifacts
gaussVertical<float> (tmL, tmL, buffer, src->W, src->H, 2.0);
}
if(mode==1){ //choice of median
#pragma omp parallel
{
int ip,in,jp,jn;
float pp[9],temp;
#pragma omp for nowait //nowait because next loop inside this parallel region is independent on this one
for (int i=0; i<height; i++) {
if (i<2) {ip=i+2;} else {ip=i-2;}
if (i>height-3) {in=i-2;} else {in=i+2;}
for (int j=0; j<width; j++) {
if (j<2) {jp=j+2;} else {jp=j-2;}
if (j>width-3) {jn=j-2;} else {jn=j+2;}
med3(sraa[ip][jp],sraa[ip][j],sraa[ip][jn],sraa[i][jp],sraa[i][j],sraa[i][jn],sraa[in][jp],sraa[in][j],sraa[in][jn],tmaa[i][j]);
}
}
#pragma omp for
for (int i=0; i<height; i++) {
if (i<2) {ip=i+2;} else {ip=i-2;}
if (i>height-3) {in=i-2;} else {in=i+2;}
for (int j=0; j<width; j++) {
if (j<2) {jp=j+2;} else {jp=j-2;}
if (j>width-3) {jn=j-2;} else {jn=j+2;}
med3(srbb[ip][jp],srbb[ip][j],srbb[ip][jn],srbb[i][jp],srbb[i][j],srbb[i][jn],srbb[in][jp],srbb[in][j],srbb[in][jn],tmbb[i][j]);
}
}
}
}
//luma badpixels
const float sh_thr = 4.5f;//low value for luma sh_p to avoid artifacts
const float shthr = sh_thr / 24.0f;
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef __SSE2__
__m128 shfabsv, shmedv;
__m128 shthrv = _mm_set1_ps(shthr);
__m128 onev = _mm_set1_ps(1.0f);
#endif // __SSE2__
#ifdef _OPENMP
#pragma omp for private(shfabs, shmed,i1,j1)
#endif
for (int i=0; i < height; i++) {
for (j=0; j < 2; j++) {
shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
shmed=0.0f;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=0; j1<=j+2; j1++ ) {
shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
}
badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
}
#ifdef __SSE2__
for (; j < width-5; j+=4) {
shfabsv = vabsf(LVFU(src->sh_p[i][j])-LVFU(tmL[i][j]));
shmedv = _mm_setzero_ps();
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<=j+2; j1++ ) {
shmedv += vabsf(LVFU(src->sh_p[i1][j1])-LVFU(tmL[i1][j1]));
}
_mm_storeu_ps( &badpix[i*width+j], vself(vmaskf_gt(shfabsv,(shmedv - shfabsv)*shthrv), onev, _mm_setzero_ps()));
}
for (; j < width-2; j++) {
shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
shmed=0.0f;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<=j+2; j1++ ) {
shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
}
badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
}
#else
for (; j < width-2; j++) {
shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
shmed=0.0f;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<=j+2; j1++ ) {
shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
}
badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
}
#endif
for (; j < width; j++) {
shfabs = fabs(src->sh_p[i][j]-tmL[i][j]);
shmed=0.0f;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<width; j1++ ) {
shmed += fabs(src->sh_p[i1][j1]-tmL[i1][j1]);
}
badpix[i*width+j] = (shfabs>((shmed-shfabs)*shthr));
}
}
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for private(shsum,norm,dirsh,sum,i1,j1) schedule(dynamic,16)
#endif
for (int i=0; i < height; i++) {
for (j=0; j < 2; j++) {
if (!badpix[i*width+j]) continue;
norm=0.0f;
shsum=0.0f;
sum=0.0f;
tot=0;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=0; j1<=j+2; j1++ ) {
if (i1==i && j1==j) continue;
if (badpix[i1*width+j1]) continue;
sum += src->sh_p[i1][j1];
tot++;
dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
shsum += dirsh*src->sh_p[i1][j1];
norm += dirsh;
}
if (norm > 0.f) {
src->sh_p[i][j]=shsum/norm;
}
else {
if(tot > 0) src->sh_p[i][j]=sum / tot;
}
}
for (; j < width-2; j++) {
if (!badpix[i*width+j]) continue;
norm=0.0f;
shsum=0.0f;
sum=0.0f;
tot=0;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<=j+2; j1++ ) {
if (i1==i && j1==j) continue;
if (badpix[i1*width+j1]) continue;
sum += src->sh_p[i1][j1];
tot++;
dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
shsum += dirsh*src->sh_p[i1][j1];
norm += dirsh;
}
if (norm > 0.f) {
src->sh_p[i][j]=shsum/norm;
}
else {
if(tot > 0) src->sh_p[i][j]=sum / tot;
}
}
for (; j < width; j++) {
if (!badpix[i*width+j]) continue;
norm=0.0f;
shsum=0.0f;
sum=0.0f;
tot=0;
for (i1=max(0,i-2); i1<=min(i+2,height-1); i1++ )
for (j1=j-2; j1<width; j1++ ) {
if (i1==i && j1==j) continue;
if (badpix[i1*width+j1]) continue;
sum += src->sh_p[i1][j1];
tot++;
dirsh = 1.f/(SQR(src->sh_p[i1][j1]-src->sh_p[i][j])+eps);
shsum += dirsh*src->sh_p[i1][j1];
norm += dirsh;
}
if (norm > 0.f) {
src->sh_p[i][j]=shsum/norm;
}
else {
if(tot > 0) src->sh_p[i][j]=sum / tot;
}
}
}
}
// end luma badpixels
// begin chroma badpixels
float chrommed=0.f;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:chrommed)
#endif
for(int i = 0; i < height; i++ ) {
for(int j = 0; j < width; j++) {
float chroma =SQR(sraa[i][j]-tmaa[i][j])+SQR(srbb[i][j]-tmbb[i][j]);
chrommed += chroma;
badpix[i*width+j]=chroma;
}
}
chrommed /= (height*width);
float threshfactor = (thresh*chrommed)/33.f;
// now chrommed is calculated, so we postprocess badpix to reduce the number of divisions in future
#ifdef __SSE2__
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
__m128 sumv = _mm_set1_ps( chrommed + eps2 );
__m128 onev = _mm_set1_ps( 1.0f );
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i<height; i++) {
for(j=0; j < width-3; j+=4)
_mm_storeu_ps( &badpix[i*width+j], onev/(LVFU(badpix[i*width+j])+sumv));
for(; j < width; j++)
badpix[i*width+j] = 1.f/(badpix[i*width+j]+chrommed+eps2);
}
}
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i<height; i++)
for(int j = 0; j < width; j++)
badpix[i*width+j] = 1.f/(badpix[i*width+j]+chrommed+eps2);
#endif
// because we changed the values of badpix we also have to recalculate threshfactor
threshfactor = 1.0f/(threshfactor + chrommed + eps2);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
int j;
#ifdef _OPENMP
#pragma omp for schedule(dynamic,16)
#endif
for(int i = 0; i < height; i++ ) {
for(j = 0; j < halfwin; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (badpix[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=0; j1<j+halfwin; j1++) {
wt = badpix[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
for(; j < width-halfwin; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (badpix[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<j+halfwin; j1++) {
wt = badpix[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
for(; j < width; j++) {
tmaa[i][j] = sraa[i][j];
tmbb[i][j] = srbb[i][j];
if (badpix[i*width+j]<threshfactor) {
float atot=0.f;
float btot=0.f;
float norm=0.f;
float wt;
for (int i1=max(0,i-halfwin+1); i1<min(height,i+halfwin); i1++)
for (int j1=j-halfwin+1; j1<width; j1++) {
wt = badpix[i1*width+j1];
atot += wt*sraa[i1][j1];
btot += wt*srbb[i1][j1];
norm += wt;
}
if(norm > 0.f){
tmaa[i][j] = (atot/norm);
tmbb[i][j] = (btot/norm);
}
}
}
}
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef __SSE2__
int j;
__m128 interav, interbv;
__m128 piidv = _mm_set1_ps(piid);
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for(int i = 0; i < height; i++ ) {
#ifdef __SSE2__
for(j = 0; j < width-3; j+=4) {
interav = LVFU(tmaa[i][j]);
interbv = LVFU(tmbb[i][j]);
_mm_storeu_ps(&dst->h_p[i][j],(xatan2f(interbv,interav))/piidv);
_mm_storeu_ps(&dst->C_p[i][j],_mm_sqrt_ps(SQRV(interbv)+SQRV(interav)));
}
for(; j < width; j++) {
float intera = tmaa[i][j];
float interb = tmbb[i][j];
dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
}
#else
for(int j = 0; j < width; j++) {
float intera = tmaa[i][j];
float interb = tmbb[i][j];
dst->h_p[i][j]=(xatan2f(interb,intera))/piid;
dst->C_p[i][j]=sqrt(SQR(interb)+SQR(intera));
}
#endif
}
}
if(src != dst) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int i = 0; i < height; i++ )
for(int j = 0; j < width; j++)
dst->sh_p[i][j] = src->sh_p[i][j];
}
for (int i=0; i<height; i++)
delete [] sraa[i];
delete [] sraa;
for (int i=0; i<height; i++)
delete [] srbb[i];
delete [] srbb;
for (int i=0; i<height; i++)
delete [] tmaa[i];
delete [] tmaa;
for (int i=0; i<height; i++)
delete [] tmbb[i];
delete [] tmbb;
for (int i=0; i<height; i++){
delete [] tmL[i];
}
delete [] tmL;
free(badpix);
t2.set();
if( settings->verbose )
printf("Ciecam badpixels:- %d usec\n", t2.etime(t1));
}
}
#undef PIX_SORT
#undef med3