CIECAM02 speedup

This commit is contained in:
Ingo 2015-07-07 16:34:05 +02:00
parent b6f8bc675b
commit 1008e0e98d
7 changed files with 555 additions and 86 deletions

View File

@ -20,10 +20,12 @@
#include "rtengine.h"
#include "curves.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "sleef.c"
#ifdef _DEBUG
#include "settings.h"
#include <stdio.h>
#endif
#undef CLIPD
#define CLIPD(a) ((a)>0.0?((a)<1.0?(a):1.0):0.0)
@ -33,7 +35,9 @@
namespace rtengine
{
#ifdef _DEBUG
extern const Settings* settings;
#endif
void Ciecam02::curvecolor(double satind, double satval, double &sres, double parsat)
{
@ -407,6 +411,15 @@ void Ciecam02::xyz_to_cat02float( float &r, float &g, float &b, float x, float y
b = ( 0.0000f * x) + (0.0000f * y) + (1.0000f * z);
}
}
#ifdef __SSE2__
void Ciecam02::xyz_to_cat02float( vfloat &r, vfloat &g, vfloat &b, vfloat x, vfloat y, vfloat z )
{
//gamut correction M.H.Brill S.Susstrunk
r = ( F2V(1.007245f) * x) + (F2V(0.011136f) * y) - (F2V(0.018381f) * z);//Changjun Li
g = (F2V(-0.318061f) * x) + (F2V(1.314589f) * y) + (F2V(0.003471f) * z);
b = z;
}
#endif
void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g, double b, int gamu )
{
@ -425,6 +438,7 @@ void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g
}
}
#ifndef __SSE2__
void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b, int gamu )
{
gamu=1;
@ -441,7 +455,15 @@ void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g
z = ( 0.000000f * r) - (0.000000f * g) + (1.000000f * b);
}
}
#else
void Ciecam02::cat02_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
{
//gamut correction M.H.Brill S.Susstrunk
x = ( F2V(0.99015849f) * r) - (F2V(0.00838772f)* g) + (F2V(0.018229217f) * b);//Changjun Li
y = ( F2V(0.239565979f) * r) + (F2V(0.758664642f) * g) + (F2V(0.001770137f)* b);
z = b;
}
#endif
void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g, double b )
{
@ -450,12 +472,21 @@ void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g,
z = b;
}
#ifndef __SSE2__
void Ciecam02::hpe_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b )
{
x = (1.910197f * r) - (1.112124f * g) + (0.201908f * b);
y = (0.370950f * r) + (0.629054f * g) - (0.000008f * b);
z = b;
}
#else
void Ciecam02::hpe_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
{
x = (F2V(1.910197f) * r) - (F2V(1.112124f) * g) + (F2V(0.201908f) * b);
y = (F2V(0.370950f) * r) + (F2V(0.629054f) * g) - (F2V(0.000008f) * b);
z = b;
}
#endif
void Ciecam02::cat02_to_hpe( double &rh, double &gh, double &bh, double r, double g, double b, int gamu )
{
@ -485,6 +516,16 @@ void Ciecam02::cat02_to_hpefloat( float &rh, float &gh, float &bh, float r, floa
}
}
#ifdef __SSE2__
void Ciecam02::cat02_to_hpefloat( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b)
{
//Changjun Li
rh = ( F2V(0.550930835f) * r) + (F2V(0.519435987f)* g) - ( F2V(0.070356303f)* b);
gh = ( F2V(0.055954056f) * r) + (F2V(0.89973132f) * g) + (F2V(0.044315524f) * b);
bh = b;
}
#endif
void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb )
{
double x = (A / nbb) + 0.305;
@ -496,6 +537,8 @@ void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa,
/* c1 c6 c7 */
b = (0.32787 * x) - (0.15681 * aa) - (4.49038 * bb);
}
#ifndef __SSE2__
void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb )
{
float x = (A / nbb) + 0.305f;
@ -507,6 +550,19 @@ void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa,
/* c1 c6 c7 */
b = (0.32787f * x) - (0.15681f * aa) - (4.49038f * bb);
}
#else
void Ciecam02::Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb )
{
vfloat c1 = F2V(0.32787f) * ((A / nbb) + F2V(0.305f));
/* c1 c2 c3 */
r = c1 + (F2V(0.32145f) * aa) + (F2V(0.20527f) * bb);
/* c1 c4 c5 */
g = c1 - (F2V(0.63507f) * aa) - (F2V(0.18603f) * bb);
/* c1 c6 c7 */
b = c1 - (F2V(0.15681f) * aa) - (F2V(4.49038f) * bb);
}
#endif
void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a )
{
@ -535,7 +591,7 @@ void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double
bb = (aa * sinh) / cosh;
}
}
#ifndef __SSE2__
void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a )
{
float2 sincosval = xsincosf((h * M_PI) / 180.0f);
@ -565,6 +621,34 @@ void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float
std::swap(aa,bb);
}
}
#else
void Ciecam02::calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a )
{
vfloat2 sincosval = xsincosf((h * F2V(M_PI)) / F2V(180.0f));
vfloat sinh = sincosval.x;
vfloat cosh = sincosval.y;
vfloat x = (a / nbb) + F2V(0.305f);
vfloat p3 = F2V(1.05f);
vmask swapMask = vmaskf_gt(vabsf(sinh), vabsf(cosh));
vswap(swapMask, sinh, cosh);
vfloat div = ((e / (t * cosh)) - (F2V(-0.31362f) - (p3 * F2V(0.15681f))) - ((F2V(0.01924f) - (p3 * F2V(4.49038f))) * (sinh / cosh)));
// for large values of t the above calculation can change its sign which results in a hue shift of 180 degree
// so we have to check the sign to avoid this shift.
// Additionally it seems useful to limit the minimum value of div
// I limited it, but I'm sure the actual limit is not the best one
vmask limitMask = vmaskf_neq(vsignf(div), vsignf(cosh));
limitMask = vorm(limitMask, vmaskf_le(vabsf(div), vabsf(cosh) * F2V(2.f)));
div = vself(limitMask, cosh * F2V(2.f), div);
aa = ((F2V(0.32787f) * x) * (F2V(2.0f) + p3)) / div;
bb = (aa * sinh) / cosh;
vswap(swapMask, aa, bb);
}
#endif
void Ciecam02::initcam1(double gamu, double yb, double pilotd, double f, double la, double xw, double yw, double zw, double &n, double &d, double &nbb, double &ncb,
double &cz, double &aw, double &wh, double &pfl, double &fl, double &c)
@ -758,6 +842,63 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
s = 100.0f * sqrtf( M / Q );
h = (myh * 180.f) / (float)M_PI;
}
#ifdef __SSE2__
void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
vfloat x, vfloat y, vfloat z, vfloat xw, vfloat yw, vfloat zw,
vfloat yb, vfloat la, vfloat f, vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)
{
vfloat r, g, b;
vfloat rw, gw, bw;
vfloat rc, gc, bc;
vfloat rp, gp, bp;
vfloat rpa, gpa, bpa;
vfloat a, ca, cb;
vfloat e, t;
xyz_to_cat02float( r, g, b, x, y, z);
xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
vfloat onev = F2V(1.f);
rc = r * (((yw * d) / rw) + (onev - d));
gc = g * (((yw * d) / gw) + (onev - d));
bc = b * (((yw * d) / bw) + (onev - d));
cat02_to_hpefloat( rp, gp, bp, rc, gc, bc);
//gamut correction M.H.Brill S.Susstrunk
rp = _mm_max_ps(rp,ZEROV);
gp = _mm_max_ps(gp,ZEROV);
bp = _mm_max_ps(bp,ZEROV);
rpa = nonlinear_adaptationfloat( rp, fl );
gpa = nonlinear_adaptationfloat( gp, fl );
bpa = nonlinear_adaptationfloat( bp, fl );
ca = rpa - ((F2V(12.0f) * gpa) - bpa) / F2V(11.0f);
cb = F2V(0.11111111f) * (rpa + gpa - (bpa + bpa));
vfloat myh = xatan2f( cb, ca );
vfloat temp = F2V(M_PI);
temp += temp;
temp += myh;
myh = vself(vmaskf_lt(myh, ZEROV), temp, myh);
a = ((rpa + rpa) + gpa + (F2V(0.05f) * bpa) - F2V(0.305f)) * nbb;
a = _mm_max_ps(a,ZEROV); //gamut correction M.H.Brill S.Susstrunk
J = pow_F( a / aw, c * cz * F2V(0.5f));
e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( myh + F2V(2.0f) ) + F2V(3.8f));
t = (e * _mm_sqrt_ps( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V(1.05f) * bpa));
C = pow_F( t, F2V(0.9f) ) * J * pow1;
Q = wh * J;
J *= J * F2V(100.0f);
M = C * pfl;
Q = _mm_max_ps(Q,F2V(0.0001f)); // avoid division by zero
s = F2V(100.0f) * _mm_sqrt_ps( M / Q );
h = (myh * F2V(180.f)) / F2V(M_PI);
}
#endif
void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, double C, double h,
double xw, double yw, double zw, double yb, double la,
@ -792,7 +933,7 @@ void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, doub
cat02_to_xyz( x, y, z, r, g, b, gamu );
}
#ifndef __SSE2__
void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, float C, float h,
float xw, float yw, float zw, float yb, float la,
float f, float c, float nc , int gamu, float pow1, float nbb, float ncb, float fl, float cz, float d, float aw)
@ -827,6 +968,41 @@ void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, flo
cat02_to_xyzfloat( x, y, z, r, g, b, gamu );
}
#else
void Ciecam02::jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z, vfloat J, vfloat C, vfloat h,
vfloat xw, vfloat yw, vfloat zw, vfloat yb, vfloat la,
vfloat f, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz)
{
vfloat r, g, b;
vfloat rc, gc, bc;
vfloat rp, gp, bp;
vfloat rpa, gpa, bpa;
vfloat rw, gw, bw;
vfloat a, ca, cb;
vfloat e, t;
xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( ((h * F2V(M_PI)) / F2V(180.0f)) + F2V(2.0f) ) + F2V(3.8f));
a = pow_F( J / F2V(100.0f), reccmcz ) * aw;
t = pow_F( F2V(10.f) * C / (_mm_sqrt_ps( J ) * pow1), F2V(1.1111111f) );
calculate_abfloat( ca, cb, h, e, t, nbb, a );
Aab_to_rgbfloat( rpa, gpa, bpa, a, ca, cb, nbb );
rp = inverse_nonlinear_adaptationfloat( rpa, fl );
gp = inverse_nonlinear_adaptationfloat( gpa, fl );
bp = inverse_nonlinear_adaptationfloat( bpa, fl );
hpe_to_xyzfloat( x, y, z, rp, gp, bp );
xyz_to_cat02float( rc, gc, bc, x, y, z );
r = rc / (((yw * d) / rw) + (F2V(1.0f) - d));
g = gc / (((yw * d) / gw) + (F2V(1.0f) - d));
b = bc / (((yw * d) / bw) + (F2V(1.0f) - d));
cat02_to_xyzfloat( x, y, z, r, g, b );
}
#endif
double Ciecam02::nonlinear_adaptation( double c, double fl )
{
double p;
@ -841,6 +1017,20 @@ float Ciecam02::nonlinear_adaptationfloat( float c, float fl )
else {p = pow_F( (fl * c) / 100.0f, 0.42f ); return ((400.0f * p) / (27.13f + p)) + 0.1f;}
}
#ifdef __SSE2__
vfloat Ciecam02::nonlinear_adaptationfloat( vfloat c, vfloat fl )
{
vfloat c100 = F2V(100.f);
vfloat czd42 = F2V(0.42f);
vfloat c400 = vmulsignf(F2V(400.f),c);
fl = vmulsignf(fl,c);
vfloat p = pow_F( (fl * c) / c100, czd42 );
vfloat c27d13 = F2V(27.13);
vfloat czd1 = F2V(0.1f);
return ((c400 * p) / (c27d13 + p)) + czd1;
}
#endif
double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
{
int c1;
@ -849,6 +1039,7 @@ double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
return c1*(100.0 / fl) * pow( (27.13 * fabs( c - 0.1 )) / (400.0 - fabs( c - 0.1 )), 1.0 / 0.42 );
}
#ifndef __SSE2__
float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
{
c -= 0.1f;
@ -863,6 +1054,16 @@ float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
return (100.0f / fl) * pow_F( (27.13f * fabsf( c )) / (400.0f - fabsf( c )), 2.38095238f );
}
#else
vfloat Ciecam02::inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl )
{
c -= F2V(0.1f);
fl = vmulsignf(fl,c);
c = vabsf(c);
c = _mm_min_ps( c, F2V(399.99f));
return (F2V(100.0f) / fl) * pow_F( (F2V(27.13f) * c) / (F2V(400.0f) - c), F2V(2.38095238f) );
}
#endif
//end CIECAM Billy Bigg
}

View File

@ -20,6 +20,7 @@
#define _CIECAM02_
#include <cmath>
#include "LUT.h"
#include "opthelper.h"
namespace rtengine
{
@ -40,18 +41,33 @@ private:
static void xyz_to_cat02float ( float &r, float &g, float &b, float x, float y, float z, int gamu );
static void cat02_to_hpefloat ( float &rh, float &gh, float &bh, float r, float g, float b, int gamu );
static void cat02_to_xyzfloat ( float &x, float &y, float &z, float r, float g, float b, int gamu );
static void hpe_to_xyzfloat ( float &x, float &y, float &z, float r, float g, float b );
#ifdef __SSE2__
static void xyz_to_cat02float ( vfloat &r, vfloat &g, vfloat &b, vfloat x, vfloat y, vfloat z );
static void cat02_to_hpefloat ( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b );
static vfloat nonlinear_adaptationfloat( vfloat c, vfloat fl );
#endif
static void Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb );
static void Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb );
static void calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a );
static void calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a );
static double nonlinear_adaptation( double c, double fl );
static float nonlinear_adaptationfloat( float c, float fl );
static double inverse_nonlinear_adaptation( double c, double fl );
#ifndef __SSE2__
static float inverse_nonlinear_adaptationfloat( float c, float fl );
static void calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a );
static void Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb );
static void hpe_to_xyzfloat ( float &x, float &y, float &z, float r, float g, float b );
static void cat02_to_xyzfloat ( float &x, float &y, float &z, float r, float g, float b, int gamu );
#else
static vfloat inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl );
static void calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a );
static void Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb );
static void hpe_to_xyzfloat ( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b );
static void cat02_to_xyzfloat ( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b );
#endif
public:
Ciecam02 () {}
@ -69,12 +85,19 @@ public:
double yb, double la,
double f, double c, double nc, int gamu, double n, double nbb, double ncb, double fl, double cz, double d, double aw);
#ifndef __SSE2__
static void jch2xyz_ciecam02float( float &x, float &y, float &z,
float J, float C, float h,
float xw, float yw, float zw,
float yb, float la,
float f, float c, float nc,int gamu,float n, float nbb, float ncb, float fl, float cz, float d, float aw );
#else
static void jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z,
vfloat J, vfloat C, vfloat h,
vfloat xw, vfloat yw, vfloat zw,
vfloat yb, vfloat la,
vfloat f, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz );
#endif
/**
* Forward transform from XYZ to CIECAM02 JCh.
*/
@ -104,6 +127,16 @@ public:
float yb, float la,
float f, float c, float nc, float pilotd, int gamu, float n, float nbb, float ncb, float pfl, float cz, float d );
#ifdef __SSE2__
static void xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h,
vfloat &Q, vfloat &M, vfloat &s,vfloat aw, vfloat fl, vfloat wh,
vfloat x, vfloat y, vfloat z,
vfloat xw, vfloat yw, vfloat zw,
vfloat yb, vfloat la,
vfloat f, vfloat c, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d );
#endif
};
}

View File

@ -859,6 +859,25 @@ namespace rtengine {
y=(LL>epskap) ? 65535.0f*fy*fy*fy : 65535.0f*LL/kappa;
}
#ifdef __SSE2__
void Color::Lab2XYZ(vfloat L, vfloat a, vfloat b, vfloat &x, vfloat &y, vfloat &z) {
vfloat c327d68 = F2V(327.68f);
L /= c327d68;
a /= c327d68;
b /= c327d68;
vfloat fy = F2V(0.00862069f) * L + F2V(0.137932f);
vfloat fx = F2V(0.002f) * a + fy;
vfloat fz = fy - (F2V(0.005f) * b);
vfloat c65535 = F2V(65535.f);
x = c65535*f2xyz(fx)*F2V(D50x);
z = c65535*f2xyz(fz)*F2V(D50z);
vfloat res1 = fy*fy*fy;
vfloat res2 = L / F2V(kappa);
y = vself(vmaskf_gt(L, F2V(epskap)), res1, res2);
y *= c65535;
}
#endif // __SSE2__
void Color::XYZ2Lab(float X, float Y, float Z, float &L, float &a, float &b) {
float x = X/D50x;

View File

@ -316,6 +316,9 @@ public:
*/
static void Lab2XYZ(float L, float a, float b, float &x, float &y, float &z);
#ifdef __SSE2__
static void Lab2XYZ(vfloat L, vfloat a, vfloat b, vfloat &x, vfloat &y, vfloat &z);
#endif // __SSE2__
/**
* @brief Convert xyz in Lab
@ -439,7 +442,15 @@ public:
return (f > epsilonExpInv3) ? f*f*f : (116.f * f - 16.f) * kappaInv;
}
#ifdef __SSE2__
static inline vfloat f2xyz(vfloat f) {
const vfloat epsilonExpInv3 = F2V(0.20689655f); // 6.0f/29.0f;
const vfloat kappaInv = F2V(0.0011070565f); // 27.0f/24389.0f; // inverse of kappa
vfloat res1 = f*f*f;
vfloat res2 = (116.f * f - 16.f) * kappaInv;
return vself(vmaskf_gt(f, epsilonExpInv3), res1, res2);
}
#endif
/**
* @brief Calculate the effective direction (up or down) to linearly interpolating 2 colors so that it follows the shortest or longest path

View File

@ -64,6 +64,7 @@ typedef __m128i vint2;
#endif
#define ZEROV _mm_setzero_ps()
#define F2V(a) _mm_set1_ps((a))
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
@ -126,6 +127,7 @@ static INLINE vmask vandm(vmask x, vmask y) { return _mm_and_si128(x, y); }
static INLINE vmask vandnotm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
static INLINE vmask vorm(vmask x, vmask y) { return _mm_or_si128(x, y); }
static INLINE vmask vxorm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
static INLINE vmask vnotm(vmask x) { return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); }
static INLINE vmask vmask_eq(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); }
static INLINE vmask vmask_neq(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); }

View File

@ -1418,6 +1418,7 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
const float pow1 = pow_F( 1.64f - pow_F( 0.29f, n ), 0.73f );
float nj,dj,nbbj,ncbj,czj,awj,flj;
Ciecam02::initcam2float(gamu,yb2, f2, la2, xw2, yw2, zw2, nj, dj, nbbj, ncbj,czj, awj, flj);
const float reccmcz = 1.f / (c2*czj);
const float pow1n = pow_F( 1.64f - pow_F( 0.29f, nj ), 0.73f );
const float epsil=0.0001f;
@ -1440,29 +1441,97 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
{wiprof[2][0],wiprof[2][1],wiprof[2][2]}
};
#ifdef __SSE2__
int bufferLength = ((width+3)/4) * 4; // bufferLength has to be a multiple of 4
#endif
#ifndef _DEBUG
#pragma omp parallel
#endif
{
float minQThr = 10000.f;
float maxQThr = -1000.f;
#ifndef _DEBUG
#pragma omp for schedule(dynamic, 10)
#ifdef __SSE2__
// one line buffer per channel and thread
float Jbuffer[bufferLength] ALIGNED16;
float Cbuffer[bufferLength] ALIGNED16;
float hbuffer[bufferLength] ALIGNED16;
float Qbuffer[bufferLength] ALIGNED16;
float Mbuffer[bufferLength] ALIGNED16;
float sbuffer[bufferLength] ALIGNED16;
#endif
for (int i=0; i<height; i++)
for (int j=0; j<width; j++) {
#ifndef _DEBUG
#pragma omp for schedule(dynamic, 16)
#endif
for (int i=0; i<height; i++) {
#ifdef __SSE2__
// vectorized conversion from Lab to jchqms
int k;
vfloat x,y,z;
vfloat J, C, h, Q, M, s;
vfloat c655d35 = F2V(655.35f);
for(k=0;k<width-3;k+=4) {
Color::Lab2XYZ(LVFU(lab->L[i][k]),LVFU(lab->a[i][k]),LVFU(lab->b[i][k]),x,y,z);
x = x/c655d35;
y = y/c655d35;
z = z/c655d35;
Ciecam02::xyz2jchqms_ciecam02float( J, C, h,
Q, M, s, F2V(aw), F2V(fl), F2V(wh),
x, y, z,
F2V(xw1), F2V(yw1), F2V(zw1),
F2V(yb), F2V(la),
F2V(f), F2V(c), F2V(nc), F2V(pow1), F2V(nbb), F2V(ncb), F2V(pfl), F2V(cz), F2V(d));
STVF(Jbuffer[k],J);
STVF(Cbuffer[k],C);
STVF(hbuffer[k],h);
STVF(Qbuffer[k],Q);
STVF(Mbuffer[k],M);
STVF(sbuffer[k],s);
}
for(;k<width;k++) {
float L=lab->L[i][k];
float a=lab->a[i][k];
float b=lab->b[i][k];
float x,y,z;
//convert Lab => XYZ
Color::Lab2XYZ(L, a, b, x, y, z);
x = x/655.35f;
y = y/655.35f;
z = z/655.35f;
float J, C, h, Q, M, s;
Ciecam02::xyz2jchqms_ciecam02float( J, C, h,
Q, M, s, aw, fl, wh,
x, y, z,
xw1, yw1, zw1,
yb, la,
f, c, nc, pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
Jbuffer[k] = J;
Cbuffer[k] = C;
hbuffer[k] = h;
Qbuffer[k] = Q;
Mbuffer[k] = M;
sbuffer[k] = s;
}
#endif // __SSE2__
for (int j=0; j<width; j++) {
float J, C, h, Q, M, s;
#ifdef __SSE2__
// use precomputed values from above
J = Jbuffer[j];
C = Cbuffer[j];
h = hbuffer[j];
Q = Qbuffer[j];
M = Mbuffer[j];
s = sbuffer[j];
#else
float x,y,z;
float L=lab->L[i][j];
float a=lab->a[i][j];
float b=lab->b[i][j];
float x1,y1,z1;
float x,y,z;
//convert Lab => XYZ
Color::Lab2XYZ(L, a, b, x1, y1, z1);
float J, C, h, Q, M, s;
float Jpro,Cpro, hpro, Qpro, Mpro, spro;
x=(float)x1/655.35f;
y=(float)y1/655.35f;
z=(float)z1/655.35f;
@ -1473,6 +1542,8 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
xw1, yw1, zw1,
yb, la,
f, c, nc, pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
#endif
float Jpro,Cpro, hpro, Qpro, Mpro, spro;
Jpro=J;
Cpro=C;
hpro=h;
@ -1797,6 +1868,12 @@ if(settings->viewinggreySc==1) yb=18.0f;//fixed
}
}
if(LabPassOne){
#ifdef __SSE2__
// write to line buffers
Jbuffer[j] = J;
Cbuffer[j] = C;
hbuffer[j] = h;
#else
float xx,yy,zz;
//process normal==> viewing
@ -1805,22 +1882,13 @@ if(LabPassOne){
xw2, yw2, zw2,
yb2, la2,
f2, c2, nc2, gamu, pow1n, nbbj, ncbj, flj, czj, dj, awj);
float x,y,z;
x=(float)xx*655.35f;
y=(float)yy*655.35f;
z=(float)zz*655.35f;
float Ll,aa,bb;
//convert xyz=>lab
Color::XYZ2Lab(x, y, z, Ll, aa, bb);
#ifdef _DEBUG
if(Ll > 70000.f && J < 1.f) {
#pragma omp critical
{
printf("Why is Ll so big when J is so small?\n");
printf("J : %f, Ll : %f, xx : %f, yy : %f, zz : %f\n",J,Ll,xx,yy,zz);
printf("J : %f, C : %f, h : %f\n",J,C,h);
}
}
#endif
// gamut control in Lab mode; I must study how to do with cIECAM only
if(gamu==1) {
@ -1857,9 +1925,66 @@ if(LabPassOne){
lab->a[i][j]=aa;
lab->b[i][j]=bb;
}
#endif
}
}
}
#ifdef __SSE2__
// process line buffers
float *xbuffer = Qbuffer;
float *ybuffer = Mbuffer;
float *zbuffer = sbuffer;
for(k=0;k<bufferLength;k+=4) {
Ciecam02::jch2xyz_ciecam02float( x, y, z,
LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
F2V(xw2), F2V(yw2), F2V(zw2),
F2V(yb2), F2V(la2),
F2V(f2), F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
STVF(xbuffer[k],x*c655d35);
STVF(ybuffer[k],y*c655d35);
STVF(zbuffer[k],z*c655d35);
}
// XYZ2Lab uses a lookup table. The function behind that lut is a cube root.
// SSE can't beat the speed of that lut, so it doesn't make sense to use SSE
for(int j=0;j<width;j++) {
float Ll,aa,bb;
//convert xyz=>lab
Color::XYZ2Lab(xbuffer[j], ybuffer[j], zbuffer[j], Ll, aa, bb);
// gamut control in Lab mode; I must study how to do with cIECAM only
if(gamu==1) {
float HH, Lprov1, Chprov1;
Lprov1=Ll/327.68f;
Chprov1=sqrtf(SQR(aa) + SQR(bb))/327.68f;
HH=xatan2f(bb,aa);
float2 sincosval;
if(Chprov1==0.0f) {
sincosval.y = 1.f;
sincosval.x = 0.0f;
} else {
sincosval.y = aa/(Chprov1*327.68f);
sincosval.x = bb/(Chprov1*327.68f);
}
#ifdef _DEBUG
bool neg=false;
bool more_rgb=false;
//gamut control : Lab values are in gamut
Color::gamutLchonly(sincosval,Lprov1,Chprov1, wip, highlight, 0.15f, 0.96f, neg, more_rgb);
#else
//gamut control : Lab values are in gamut
Color::gamutLchonly(sincosval,Lprov1,Chprov1, wip, highlight, 0.15f, 0.96f);
#endif
lab->L[i][j]=Lprov1*327.68f;
lab->a[i][j]=327.68f*Chprov1*sincosval.y;
lab->b[i][j]=327.68f*Chprov1*sincosval.x;
} else {
lab->L[i][j]=Ll;
lab->a[i][j]=aa;
lab->b[i][j]=bb;
}
}
#endif
}
#pragma omp critical
{
if(minQThr < minQ)
@ -2023,12 +2148,20 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
#pragma omp parallel
#endif
{
#ifdef __SSE2__
// one line buffer per channel
float Jbuffer[bufferLength] ALIGNED16;
float Cbuffer[bufferLength] ALIGNED16;
float hbuffer[bufferLength] ALIGNED16;
float *xbuffer = Jbuffer; // we can use one of the above buffers
float *ybuffer = Cbuffer; // "
float *zbuffer = hbuffer; // "
#endif
#ifndef _DEBUG
#pragma omp for schedule(dynamic, 10)
#endif
for (int i=0; i<height; i++) // update CIECAM with new values after tone-mapping
for (int i=0; i<height; i++) { // update CIECAM with new values after tone-mapping
for (int j=0; j<width; j++) {
float xx,yy,zz;
float x,y,z;
@ -2066,6 +2199,11 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
}
//end histograms
#ifdef __SSE2__
Jbuffer[j] = ncie->J_p[i][j];
Cbuffer[j] = ncie_C_p;
hbuffer[j] = ncie->h_p[i][j];
#else
Ciecam02::jch2xyz_ciecam02float( xx, yy, zz,
ncie->J_p[i][j], ncie_C_p, ncie->h_p[i][j],
xw2, yw2, zw2,
@ -2109,6 +2247,64 @@ if((params->colorappearance.tonecie && (epdEnabled)) || (params->sharpening.enab
lab->a[i][j]=aa;
lab->b[i][j]=bb;
}
#endif
}
#ifdef __SSE2__
// process line buffers
int k;
vfloat x,y,z;
vfloat c655d35 = F2V(655.35f);
for(k=0;k<bufferLength;k+=4) {
Ciecam02::jch2xyz_ciecam02float( x, y, z,
LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
F2V(xw2), F2V(yw2), F2V(zw2),
F2V(yb2), F2V(la2),
F2V(f2), F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
x *= c655d35;
y *= c655d35;
z *= c655d35;
STVF(xbuffer[k],x);
STVF(ybuffer[k],y);
STVF(zbuffer[k],z);
}
// XYZ2Lab uses a lookup table. The function behind that lut is a cube root.
// SSE can't beat the speed of that lut, so it doesn't make sense to use SSE
for(int j=0;j<width;j++) {
float Ll,aa,bb;
//convert xyz=>lab
Color::XYZ2Lab(xbuffer[j], ybuffer[j], zbuffer[j], Ll, aa, bb);
if(gamu==1) {
float Lprov1, Chprov1;
Lprov1=Ll/327.68f;
Chprov1=sqrtf(SQR(aa) + SQR(bb))/327.68f;
float2 sincosval;
if(Chprov1==0.0f) {
sincosval.y = 1.f;
sincosval.x = 0.0f;
} else {
sincosval.y = aa/(Chprov1*327.68f);
sincosval.x = bb/(Chprov1*327.68f);
}
#ifdef _DEBUG
bool neg=false;
bool more_rgb=false;
//gamut control : Lab values are in gamut
Color::gamutLchonly(sincosval,Lprov1,Chprov1, wipa, highlight, 0.15f, 0.96f, neg, more_rgb);
#else
//gamut control : Lab values are in gamut
Color::gamutLchonly(sincosval,Lprov1,Chprov1, wipa, highlight, 0.15f, 0.96f);
#endif
lab->L[i][j]=Lprov1*327.68f;
lab->a[i][j]=327.68f*Chprov1*sincosval.y;
lab->b[i][j]=327.68f*Chprov1*sincosval.x;
} else {
lab->L[i][j]=Ll;
lab->a[i][j]=aa;
lab->b[i][j]=bb;
}
}
#endif // __SSE2__
}
} //end parallelization

View File

@ -1315,5 +1315,12 @@ static INLINE vfloat SQRV(vfloat a){
return _mm_mul_ps( a,a );
}
static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
vfloat temp = vself(condition, a, b); // the larger of the two
condition = vnotm(condition); // invert the mask
a = vself(condition, a, b); // the smaller of the two
b = temp;
}
#endif // __SSE2__
#endif // SLEEFSSEAVX