712 lines
21 KiB
C++

/*
* LUT.h
* This file is part of RawTherapee.
*
* Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
*
* RawTherapee is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RawTherapee is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RawTherapee. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Declaration of flexible Lookup Tables
*
* Usage:
*
* LUT<type> name (size);
* LUT<type> name (size, flags);
*
* creates an array which is valid within the normal C/C++ scope "{ ... }"
*
* access to elements is a simple as:
*
* LUT<float> my_lut (10);
* float value = my_lut[3];
* float value = my_lut[2.5]; // this will interpolate
*
* when using a float type index it will interpolate the lookup values
*
* extra setting in flags: (clipping is set by default)
* LUT_CLIP_ABOVE
* LUT_CLIP_BELOW
*
* example:
* LUT<float> my_lut (10,LUT_CLIP_BELOW);
* float value = my_lut[22.5]; // this will extrapolate
* float value = my_lut[-22.5]; // this will not extrapolate
*
* LUT<float> my_lut (10,0); // this will extrapolate on either side
*
* shotcuts:
*
* LUTf stands for LUT<float>
* LUTi stands for LUT<int>
* LUTu stands for LUT<unsigned int>
* LUTd stands for LUT<double>
* LUTuc stands for LUT<unsigned char>
*/
#ifndef LUT_H_
#define LUT_H_
#include <cstring>
#include <cstdint>
#include <cassert>
#ifndef NDEBUG
#include <glibmm.h>
#include <fstream>
#endif
#include "opthelper.h"
#include "rt_math.h"
#include "noncopyable.h"
// Bit representations of flags
enum {
LUT_CLIP_BELOW = 1 << 0,
LUT_CLIP_ABOVE = 1 << 1
};
template<typename T>
class LUT;
using LUTf = LUT<float>;
using LUTi = LUT<int32_t>;
using LUTu = LUT<uint32_t>;
using LUTd = LUT<double>;
using LUTuc = LUT<uint8_t>;
template<typename T>
class LUT :
public rtengine::NonCopyable
{
protected:
// list of variables ordered to improve cache speed
int maxs;
float maxsf;
// For the SSE routine operator[](vfloat), we just clip float lookup values
// to just below the max value.
float maxIndexFloat;
T * data;
unsigned int clip;
unsigned int size;
unsigned int upperBound; // always equals size-1, parameter created for performance reason
private:
unsigned int owner;
#ifdef __SSE2__
vfloat maxsv ALIGNED16;
vfloat sizev ALIGNED16;
vint sizeiv ALIGNED16;
#endif
public:
/// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
/// The user have to handle it itself, even if some method can (re)initialize it
bool dirty;
LUT(int s, int flags = 0xfffffff)
{
#ifndef NDEBUG
if (s <= 0) {
printf("s<=0!\n");
}
assert (s > 0);
#endif
dirty = true;
clip = flags;
// Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
// The routine would still produce the right answer, but might cause issues
// with address/heap checking programs.
data = new T[s + 3];
owner = 1;
size = s;
upperBound = size - 1;
maxs = size - 2;
maxsf = (float)maxs;
maxIndexFloat = ((float)upperBound) - 1e-5;
#ifdef __SSE2__
maxsv = F2V( maxs );
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
#endif
}
void operator ()(int s, int flags = 0xfffffff)
{
#ifndef NDEBUG
if (s <= 0) {
printf("s<=0!\n");
}
assert (s > 0);
#endif
if (owner && data) {
delete[] data;
}
dirty = true; // Assumption!
clip = flags;
// See comment in constructor.
data = new T[s + 3];
owner = 1;
size = s;
upperBound = size - 1;
maxs = size - 2;
maxsf = (float)maxs;
maxIndexFloat = ((float)upperBound) - 1e-5;
#ifdef __SSE2__
maxsv = F2V( maxs );
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
#endif
}
LUT()
{
data = nullptr;
reset();
#ifdef __SSE2__
maxsv = ZEROV;
sizev = ZEROV;
sizeiv = _mm_setzero_si128();
#endif
}
~LUT()
{
if (owner) {
delete[] data;
#ifndef NDEBUG
data = (T*)0xBAADF00D;
#endif
}
}
void setClip(int flags)
{
clip = flags;
}
int getClip() const {
return clip;
}
/** @brief Get the number of element in the LUT (i.e. dimension of the array)
* For a LUT(500), it will return 500
* @return number of element in the array
*/
unsigned int getSize() const
{
return size;
}
/** @brief Get the highest value possible (i.e. dimension of the array)
* For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
* @return number of element in the array
*/
unsigned int getUpperBound() const
{
return size > 0 ? upperBound : 0;
}
LUT<T> & operator=(LUT<T> &rhs)
{
if (this != &rhs) {
if (rhs.size > this->size) {
delete [] this->data;
this->data = nullptr;
}
if (this->data == nullptr) {
// See comment in constructor.
this->data = new T[rhs.size + 3];
}
this->clip = rhs.clip;
this->owner = 1;
memcpy(this->data, rhs.data, rhs.size * sizeof(T));
this->size = rhs.size;
this->upperBound = rhs.upperBound;
this->maxs = this->size - 2;
this->maxsf = (float)this->maxs;
this->maxIndexFloat = ((float)this->upperBound) - 1e-5;
#ifdef __SSE2__
this->maxsv = F2V( this->size - 2);
this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) );
this->sizev = F2V( this->size - 1 );
#endif
}
return *this;
}
// handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t).
template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
LUT<T> & operator+=(LUT<T> &rhs)
{
if (rhs.size == this->size) {
#ifdef _OPENMP
#pragma omp simd
#endif
for(unsigned int i = 0; i < this->size; i++) {
data[i] += rhs.data[i];
}
}
return *this;
}
// multiply all elements of LUT<float> with a constant float value
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
LUT<float> & operator*=(float factor)
{
#ifdef _OPENMP
#pragma omp simd
#endif
for(unsigned int i = 0; i < this->size; i++) {
data[i] *= factor;
}
return *this;
}
// divide all elements of LUT<float> by a constant float value
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
LUT<float> & operator/=(float divisor)
{
#ifdef _OPENMP
#pragma omp simd
#endif
for(unsigned int i = 0; i < this->size; i++) {
data[i] /= divisor;
}
return *this;
}
// use with integer indices
T& operator[](int index) const
{
return data[ rtengine::LIM<int>(index, 0, upperBound) ];
}
#ifdef __SSE2__
// NOTE: This function requires LUTs which clips only at lower bound
vfloat cb(vfloat indexv) const
{
static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv));
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
// Load data from the table. This reads more than necessary, but there don't seem
// to exist more granular operations (though we could try non-SSE).
// Cast to int for convenience in the next operation (partial transpose).
vint values[4];
for (int i = 0; i < 4; ++i) {
values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
}
// Partial 4x4 transpose operation. We want two new vectors, the first consisting
// of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
__m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
__m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
return vintpf(diff, upper, lower);
}
// NOTE: This version requires LUTs which clip at upper and lower bounds
// (which is the default).
vfloat operator[](vfloat indexv) const
{
static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxIndexFloat), indexv));
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
// Load data from the table. This reads more than necessary, but there don't seem
// to exist more granular operations (though we could try non-SSE).
// Cast to int for convenience in the next operation (partial transpose).
vint values[4];
for (int i = 0; i < 4; ++i) {
values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
}
// Partial 4x4 transpose operation. We want two new vectors, the first consisting
// of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
__m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
__m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
vfloat diff = clampedIndexes - _mm_cvtepi32_ps(indexes);
return vintpf(diff, upper, lower);
}
// NOTE: This version requires LUTs which do not clip at upper and lower bounds
vfloat operator()(vfloat indexv) const
{
static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
vfloat clampedIndexes = vmaxf(ZEROV, vminf(F2V(maxsf), indexv));
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
// Load data from the table. This reads more than necessary, but there don't seem
// to exist more granular operations (though we could try non-SSE).
// Cast to int for convenience in the next operation (partial transpose).
vint values[4];
for (int i = 0; i < 4; ++i) {
values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
}
// Partial 4x4 transpose operation. We want two new vectors, the first consisting
// of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
__m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
__m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
return vintpf(diff, upper, lower);
}
#ifdef __SSE4_1__
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
vfloat operator[](vint idxv ) const
{
vfloat tempv, p1v;
idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
// access the LUT 4 times and shuffle the values into p1v
int idx;
// get 4th value
idx = _mm_extract_epi32(idxv, 3);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3
// get 3rd value
idx = _mm_extract_epi32(idxv, 2);
tempv = _mm_load_ss(&data[idx]);
p1v = _mm_move_ss( p1v, tempv);
// now p1v is 3 3 3 2
// get 2nd value
idx = _mm_extract_epi32(idxv, 1);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1
// get 1st value
idx = _mm_cvtsi128_si32(idxv);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0
return p1v;
}
#else
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
vfloat operator[](vint idxv ) const
{
vfloat tempv, p1v;
tempv = _mm_cvtepi32_ps(idxv);
tempv = _mm_min_ps( tempv, sizev );
idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) ));
// access the LUT 4 times and shuffle the values into p1v
int idx;
// get 4th value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
// now p1v is 3 3 3 3
// get 3rd value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
tempv = _mm_load_ss(&data[idx]);
p1v = _mm_move_ss( p1v, tempv);
// now p1v is 3 3 3 2
// get 2nd value
idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
// now p1v is 3 2 3 2
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 3 1
// get 1st value
idx = _mm_cvtsi128_si32 (idxv);
tempv = _mm_load_ss(&data[idx]);
p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
// now p1v is 3 2 1 1
p1v = _mm_move_ss( p1v, tempv );
// now p1v is 3 2 1 0
return p1v;
}
#endif
#endif
// use with float indices
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
T operator[](float index) const
{
int idx = (int)index; // don't use floor! The difference in negative space is no problems here
if (index < 0.f) {
if (clip & LUT_CLIP_BELOW) {
return data[0];
}
idx = 0;
} else if (idx > maxs) {
if (clip & LUT_CLIP_ABOVE) {
return data[upperBound];
}
idx = maxs;
}
float diff = index - (float) idx;
T p1 = data[idx];
T p2 = data[idx + 1] - p1;
return (p1 + p2 * diff);
}
// Return the value for "index" that is in the [0-1] range.
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
T getVal01 (float index) const
{
index *= (float)upperBound;
int idx = (int)index; // don't use floor! The difference in negative space is no problems here
if (index < 0.f) {
if (clip & LUT_CLIP_BELOW) {
return data[0];
}
idx = 0;
} else if (index > maxsf) {
if (clip & LUT_CLIP_ABOVE) {
return data[upperBound];
}
idx = maxs;
}
float diff = index - (float) idx;
T p1 = data[idx];
T p2 = data[idx + 1] - p1;
return (p1 + p2 * diff);
}
#ifndef NDEBUG
// Debug facility ; dump the content of the LUT in a file. No control of the filename is done
void dump(Glib::ustring fname)
{
if (size) {
Glib::ustring fname_ = fname + ".xyz"; // TopSolid'Design "plot" file format
std::ofstream f (fname_.c_str());
f << "$" << std::endl;
for (unsigned int iter = 0; iter < size; iter++) {
f << iter << ", " << data[iter] << ", 0." << std::endl;
}
f << "$" << std::endl;
f.close ();
}
}
#endif
operator bool (void) const
{
return size > 0;
}
void clear(void)
{
if (data && size) {
memset(data, 0, size * sizeof(T));
}
}
void reset(void)
{
if (data) {
delete[] data;
}
dirty = true;
data = nullptr;
owner = 1;
size = 0;
upperBound = 0;
maxs = 0;
maxsf = 0.f;
clip = 0;
maxIndexFloat = ((float)upperBound) - 1e-5;
}
// create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
void makeIdentity(float divisor = 1.f)
{
if(divisor == 1.f) {
for(unsigned int i = 0; i < size; i++) {
data[i] = i;
}
} else {
for(unsigned int i = 0; i < size; i++) {
data[i] = i / divisor;
}
}
}
// compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x)
template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
void compressTo(LUT<T> &dest, unsigned int numVals = 0) const
{
numVals = numVals == 0 ? size : numVals;
numVals = std::min(numVals, size);
float divisor = numVals - 1;
float mult = (dest.size - 1) / divisor;
for (unsigned int i = 0; i < numVals; i++) {
int hi = (int)(mult * i);
dest.data[hi] += this->data[i] ;
}
}
// compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes
template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const
{
if(passThrough) {
numVals = std::min(numVals, size);
numVals = std::min(numVals, passThrough.getSize());
float mult = dest.size - 1;
for (unsigned int i = 0; i < numVals; i++) {
int hi = (int)(mult * passThrough[i]);
dest[hi] += this->data[i] ;
}
}
}
// compute sum and average of a LUT<uint32_t>
template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
void getSumAndAverage(float &sum, float &avg) const
{
sum = 0.f;
avg = 0.f;
int i = 0;
#ifdef __SSE2__
vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
vfloat fourv = F2V(4.f);
vint sumv = (vint)ZEROV;
vfloat avgv = ZEROV;
for(; i < static_cast<int>(size) - 3; i += 4) {
vint datav = _mm_loadu_si128((__m128i*)&data[i]);
sumv += datav;
avgv += iv * _mm_cvtepi32_ps(datav);
iv += fourv;
}
sum = vhadd(_mm_cvtepi32_ps(sumv));
avg = vhadd(avgv);
#endif
for (; i < static_cast<int>(size); i++) {
T val = data[i];
sum += val;
avg += i * val;
}
avg /= sum;
}
template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
void makeConstant(float value, unsigned int numVals = 0)
{
numVals = numVals == 0 ? size : numVals;
numVals = std::min(numVals, size);
for(unsigned int i = 0; i < numVals; i++) {
data[i] = value;
}
}
// share the buffer with another LUT, handy for same data but different clip flags
void share(const LUT<T> &source, int flags = 0xfffffff)
{
if (owner && data) {
delete[] data;
}
dirty = false; // Assumption
clip = flags;
data = source.data;
owner = 0;
size = source.getSize();
upperBound = size - 1;
maxs = size - 2;
maxsf = (float)maxs;
maxIndexFloat = ((float)upperBound) - 1e-5;
#ifdef __SSE2__
maxsv = F2V( size - 2);
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
#endif
}
};
#endif /* LUT_H_ */