CIECAM02 speedup

This commit is contained in:
Ingo
2015-07-07 16:34:05 +02:00
parent b6f8bc675b
commit 1008e0e98d
7 changed files with 555 additions and 86 deletions

View File

@@ -20,10 +20,12 @@
#include "rtengine.h"
#include "curves.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "sleef.c"
#ifdef _DEBUG
#include "settings.h"
#include <stdio.h>
#endif
#undef CLIPD
#define CLIPD(a) ((a)>0.0?((a)<1.0?(a):1.0):0.0)
@@ -33,7 +35,9 @@
namespace rtengine
{
#ifdef _DEBUG
extern const Settings* settings;
#endif
void Ciecam02::curvecolor(double satind, double satval, double &sres, double parsat)
{
@@ -407,6 +411,15 @@ void Ciecam02::xyz_to_cat02float( float &r, float &g, float &b, float x, float y
b = ( 0.0000f * x) + (0.0000f * y) + (1.0000f * z);
}
}
#ifdef __SSE2__
void Ciecam02::xyz_to_cat02float( vfloat &r, vfloat &g, vfloat &b, vfloat x, vfloat y, vfloat z )
{
//gamut correction M.H.Brill S.Susstrunk
r = ( F2V(1.007245f) * x) + (F2V(0.011136f) * y) - (F2V(0.018381f) * z);//Changjun Li
g = (F2V(-0.318061f) * x) + (F2V(1.314589f) * y) + (F2V(0.003471f) * z);
b = z;
}
#endif
void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g, double b, int gamu )
{
@@ -425,6 +438,7 @@ void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g
}
}
#ifndef __SSE2__
void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b, int gamu )
{
gamu=1;
@@ -441,7 +455,15 @@ void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g
z = ( 0.000000f * r) - (0.000000f * g) + (1.000000f * b);
}
}
#else
void Ciecam02::cat02_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
{
//gamut correction M.H.Brill S.Susstrunk
x = ( F2V(0.99015849f) * r) - (F2V(0.00838772f)* g) + (F2V(0.018229217f) * b);//Changjun Li
y = ( F2V(0.239565979f) * r) + (F2V(0.758664642f) * g) + (F2V(0.001770137f)* b);
z = b;
}
#endif
void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g, double b )
{
@@ -450,12 +472,21 @@ void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g,
z = b;
}
#ifndef __SSE2__
void Ciecam02::hpe_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b )
{
x = (1.910197f * r) - (1.112124f * g) + (0.201908f * b);
y = (0.370950f * r) + (0.629054f * g) - (0.000008f * b);
z = b;
}
#else
void Ciecam02::hpe_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
{
x = (F2V(1.910197f) * r) - (F2V(1.112124f) * g) + (F2V(0.201908f) * b);
y = (F2V(0.370950f) * r) + (F2V(0.629054f) * g) - (F2V(0.000008f) * b);
z = b;
}
#endif
void Ciecam02::cat02_to_hpe( double &rh, double &gh, double &bh, double r, double g, double b, int gamu )
{
@@ -485,6 +516,16 @@ void Ciecam02::cat02_to_hpefloat( float &rh, float &gh, float &bh, float r, floa
}
}
#ifdef __SSE2__
void Ciecam02::cat02_to_hpefloat( vfloat &rh, vfloat &gh, vfloat &bh, vfloat r, vfloat g, vfloat b)
{
//Changjun Li
rh = ( F2V(0.550930835f) * r) + (F2V(0.519435987f)* g) - ( F2V(0.070356303f)* b);
gh = ( F2V(0.055954056f) * r) + (F2V(0.89973132f) * g) + (F2V(0.044315524f) * b);
bh = b;
}
#endif
void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa, double bb, double nbb )
{
double x = (A / nbb) + 0.305;
@@ -496,6 +537,8 @@ void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa,
/* c1 c6 c7 */
b = (0.32787 * x) - (0.15681 * aa) - (4.49038 * bb);
}
#ifndef __SSE2__
void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb )
{
float x = (A / nbb) + 0.305f;
@@ -507,6 +550,19 @@ void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa,
/* c1 c6 c7 */
b = (0.32787f * x) - (0.15681f * aa) - (4.49038f * bb);
}
#else
void Ciecam02::Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb )
{
vfloat c1 = F2V(0.32787f) * ((A / nbb) + F2V(0.305f));
/* c1 c2 c3 */
r = c1 + (F2V(0.32145f) * aa) + (F2V(0.20527f) * bb);
/* c1 c4 c5 */
g = c1 - (F2V(0.63507f) * aa) - (F2V(0.18603f) * bb);
/* c1 c6 c7 */
b = c1 - (F2V(0.15681f) * aa) - (F2V(4.49038f) * bb);
}
#endif
void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double t, double nbb, double a )
{
@@ -535,7 +591,7 @@ void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double
bb = (aa * sinh) / cosh;
}
}
#ifndef __SSE2__
void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a )
{
float2 sincosval = xsincosf((h * M_PI) / 180.0f);
@@ -565,6 +621,34 @@ void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float
std::swap(aa,bb);
}
}
#else
void Ciecam02::calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a )
{
vfloat2 sincosval = xsincosf((h * F2V(M_PI)) / F2V(180.0f));
vfloat sinh = sincosval.x;
vfloat cosh = sincosval.y;
vfloat x = (a / nbb) + F2V(0.305f);
vfloat p3 = F2V(1.05f);
vmask swapMask = vmaskf_gt(vabsf(sinh), vabsf(cosh));
vswap(swapMask, sinh, cosh);
vfloat div = ((e / (t * cosh)) - (F2V(-0.31362f) - (p3 * F2V(0.15681f))) - ((F2V(0.01924f) - (p3 * F2V(4.49038f))) * (sinh / cosh)));
// for large values of t the above calculation can change its sign which results in a hue shift of 180 degree
// so we have to check the sign to avoid this shift.
// Additionally it seems useful to limit the minimum value of div
// I limited it, but I'm sure the actual limit is not the best one
vmask limitMask = vmaskf_neq(vsignf(div), vsignf(cosh));
limitMask = vorm(limitMask, vmaskf_le(vabsf(div), vabsf(cosh) * F2V(2.f)));
div = vself(limitMask, cosh * F2V(2.f), div);
aa = ((F2V(0.32787f) * x) * (F2V(2.0f) + p3)) / div;
bb = (aa * sinh) / cosh;
vswap(swapMask, aa, bb);
}
#endif
void Ciecam02::initcam1(double gamu, double yb, double pilotd, double f, double la, double xw, double yw, double zw, double &n, double &d, double &nbb, double &ncb,
double &cz, double &aw, double &wh, double &pfl, double &fl, double &c)
@@ -758,6 +842,63 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
s = 100.0f * sqrtf( M / Q );
h = (myh * 180.f) / (float)M_PI;
}
#ifdef __SSE2__
void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
vfloat x, vfloat y, vfloat z, vfloat xw, vfloat yw, vfloat zw,
vfloat yb, vfloat la, vfloat f, vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)
{
vfloat r, g, b;
vfloat rw, gw, bw;
vfloat rc, gc, bc;
vfloat rp, gp, bp;
vfloat rpa, gpa, bpa;
vfloat a, ca, cb;
vfloat e, t;
xyz_to_cat02float( r, g, b, x, y, z);
xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
vfloat onev = F2V(1.f);
rc = r * (((yw * d) / rw) + (onev - d));
gc = g * (((yw * d) / gw) + (onev - d));
bc = b * (((yw * d) / bw) + (onev - d));
cat02_to_hpefloat( rp, gp, bp, rc, gc, bc);
//gamut correction M.H.Brill S.Susstrunk
rp = _mm_max_ps(rp,ZEROV);
gp = _mm_max_ps(gp,ZEROV);
bp = _mm_max_ps(bp,ZEROV);
rpa = nonlinear_adaptationfloat( rp, fl );
gpa = nonlinear_adaptationfloat( gp, fl );
bpa = nonlinear_adaptationfloat( bp, fl );
ca = rpa - ((F2V(12.0f) * gpa) - bpa) / F2V(11.0f);
cb = F2V(0.11111111f) * (rpa + gpa - (bpa + bpa));
vfloat myh = xatan2f( cb, ca );
vfloat temp = F2V(M_PI);
temp += temp;
temp += myh;
myh = vself(vmaskf_lt(myh, ZEROV), temp, myh);
a = ((rpa + rpa) + gpa + (F2V(0.05f) * bpa) - F2V(0.305f)) * nbb;
a = _mm_max_ps(a,ZEROV); //gamut correction M.H.Brill S.Susstrunk
J = pow_F( a / aw, c * cz * F2V(0.5f));
e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( myh + F2V(2.0f) ) + F2V(3.8f));
t = (e * _mm_sqrt_ps( (ca * ca) + (cb * cb) )) / (rpa + gpa + (F2V(1.05f) * bpa));
C = pow_F( t, F2V(0.9f) ) * J * pow1;
Q = wh * J;
J *= J * F2V(100.0f);
M = C * pfl;
Q = _mm_max_ps(Q,F2V(0.0001f)); // avoid division by zero
s = F2V(100.0f) * _mm_sqrt_ps( M / Q );
h = (myh * F2V(180.f)) / F2V(M_PI);
}
#endif
void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, double C, double h,
double xw, double yw, double zw, double yb, double la,
@@ -792,7 +933,7 @@ void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, doub
cat02_to_xyz( x, y, z, r, g, b, gamu );
}
#ifndef __SSE2__
void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, float C, float h,
float xw, float yw, float zw, float yb, float la,
float f, float c, float nc , int gamu, float pow1, float nbb, float ncb, float fl, float cz, float d, float aw)
@@ -827,6 +968,41 @@ void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, flo
cat02_to_xyzfloat( x, y, z, r, g, b, gamu );
}
#else
void Ciecam02::jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z, vfloat J, vfloat C, vfloat h,
vfloat xw, vfloat yw, vfloat zw, vfloat yb, vfloat la,
vfloat f, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz)
{
vfloat r, g, b;
vfloat rc, gc, bc;
vfloat rp, gp, bp;
vfloat rpa, gpa, bpa;
vfloat rw, gw, bw;
vfloat a, ca, cb;
vfloat e, t;
xyz_to_cat02float( rw, gw, bw, xw, yw, zw);
e = ((F2V(961.53846f)) * nc * ncb) * (xcosf( ((h * F2V(M_PI)) / F2V(180.0f)) + F2V(2.0f) ) + F2V(3.8f));
a = pow_F( J / F2V(100.0f), reccmcz ) * aw;
t = pow_F( F2V(10.f) * C / (_mm_sqrt_ps( J ) * pow1), F2V(1.1111111f) );
calculate_abfloat( ca, cb, h, e, t, nbb, a );
Aab_to_rgbfloat( rpa, gpa, bpa, a, ca, cb, nbb );
rp = inverse_nonlinear_adaptationfloat( rpa, fl );
gp = inverse_nonlinear_adaptationfloat( gpa, fl );
bp = inverse_nonlinear_adaptationfloat( bpa, fl );
hpe_to_xyzfloat( x, y, z, rp, gp, bp );
xyz_to_cat02float( rc, gc, bc, x, y, z );
r = rc / (((yw * d) / rw) + (F2V(1.0f) - d));
g = gc / (((yw * d) / gw) + (F2V(1.0f) - d));
b = bc / (((yw * d) / bw) + (F2V(1.0f) - d));
cat02_to_xyzfloat( x, y, z, r, g, b );
}
#endif
double Ciecam02::nonlinear_adaptation( double c, double fl )
{
double p;
@@ -841,6 +1017,20 @@ float Ciecam02::nonlinear_adaptationfloat( float c, float fl )
else {p = pow_F( (fl * c) / 100.0f, 0.42f ); return ((400.0f * p) / (27.13f + p)) + 0.1f;}
}
#ifdef __SSE2__
vfloat Ciecam02::nonlinear_adaptationfloat( vfloat c, vfloat fl )
{
vfloat c100 = F2V(100.f);
vfloat czd42 = F2V(0.42f);
vfloat c400 = vmulsignf(F2V(400.f),c);
fl = vmulsignf(fl,c);
vfloat p = pow_F( (fl * c) / c100, czd42 );
vfloat c27d13 = F2V(27.13);
vfloat czd1 = F2V(0.1f);
return ((c400 * p) / (c27d13 + p)) + czd1;
}
#endif
double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
{
int c1;
@@ -849,6 +1039,7 @@ double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
return c1*(100.0 / fl) * pow( (27.13 * fabs( c - 0.1 )) / (400.0 - fabs( c - 0.1 )), 1.0 / 0.42 );
}
#ifndef __SSE2__
float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
{
c -= 0.1f;
@@ -863,6 +1054,16 @@ float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
return (100.0f / fl) * pow_F( (27.13f * fabsf( c )) / (400.0f - fabsf( c )), 2.38095238f );
}
#else
vfloat Ciecam02::inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl )
{
c -= F2V(0.1f);
fl = vmulsignf(fl,c);
c = vabsf(c);
c = _mm_min_ps( c, F2V(399.99f));
return (F2V(100.0f) / fl) * pow_F( (F2V(27.13f) * c) / (F2V(400.0f) - c), F2V(2.38095238f) );
}
#endif
//end CIECAM Billy Bigg
}