rawTherapee/rtengine/LUT.h

/*
 * LUT.h
 *  This file is part of RawTherapee.
 *
 *  Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
 *
 *  RawTherapee is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  RawTherapee is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *  Declaration of flexible Lookup Tables
 *
 *  Usage:
 *
 *      LUT<type> name (size);
 *      LUT<type> name (size, flags);
 *
 *      creates an array which is valid within the normal C/C++ scope "{ ... }"
 *
 *      access to elements is as simple as:
 *
 *          LUT<float> my_lut (10);
 *          float value = my_lut[3];
 *          float value = my_lut[2.5]; // this will interpolate
 *
 *      when using a float type index it will interpolate the lookup values
 *
 *      extra setting in flags: (clipping is set by default)
 *      LUT_CLIP_ABOVE
 *      LUT_CLIP_BELOW
 *
 *      example:
 *          LUT<float> my_lut (10,LUT_CLIP_BELOW);
 *          float value = my_lut[22.5];  // this will extrapolate
 *          float value = my_lut[-22.5]; // this will not extrapolate
 *
 *          LUT<float> my_lut (10,0); // this will extrapolate on either side
 *
 *      shortcuts:
 *
 *          LUTf stands for LUT<float>
 *          LUTi stands for LUT<int>
 *          LUTu stands for LUT<unsigned int>
 *          LUTd stands for LUT<double>
 *          LUTuc stands for LUT<unsigned char>
 */

#ifndef LUT_H_
#define LUT_H_

// bit representations of flags
// These bits are tested against the LUT's `clip` member by the float-index
// accessors: when the corresponding bit is set, an index outside the table
// returns the first/last element instead of extrapolating.
#define LUT_CLIP_BELOW 1  // index < 0 returns data[0]
#define LUT_CLIP_ABOVE 2  // index > size-2 returns data[size-1]

// Short aliases for the most common instantiations of the LUT template.
#define LUTf LUT<float>
#define LUTi LUT<int>
#define LUTu LUT<unsigned int>
#define LUTd LUT<double>
#define LUTuc LUT<unsigned char>

#include <cstring>
#ifndef NDEBUG
#include <glibmm.h>
#include <fstream>
#endif
#include "opthelper.h"
#include <assert.h>
#include "rt_math.h"

template<typename T>
class LUT
{
protected:
    // list of variables ordered to improve cache speed
    unsigned int maxs;
    float maxsf;
    T * data;
    unsigned int clip;
    unsigned int size;
    unsigned int upperBound;  // always equals size-1, parameter created for performance reason
private:
    unsigned int owner;
#if defined( __SSE2__ ) && defined( __x86_64__ )
    vfloat maxsv ALIGNED16;
    vfloat sizev ALIGNED16;
    vint sizeiv ALIGNED16;
#endif
public:
    /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
    /// The user have to handle it itself, even if some method can (re)initialize it
    bool dirty;

    LUT(int s, int flags = 0xfffffff)
    {
#ifndef NDEBUG

        if (s <= 0) {
            printf("s<=0!\n");
        }

        assert (s > 0);
#endif
        dirty = true;
        clip = flags;
        data = new T[s];
        owner = 1;
        size = s;
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( maxs );
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
        sizev = F2V( size - 1 );
#endif
    }
    void operator ()(int s, int flags = 0xfffffff)
    {
#ifndef NDEBUG

        if (s <= 0) {
            printf("s<=0!\n");
        }

        assert (s > 0);
#endif

        if (owner && data) {
            delete[] data;
        }

        dirty = true; // Assumption!
        clip = flags;
        data = new T[s];
        owner = 1;
        size = s;
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( maxs );
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
        sizev = F2V( size - 1 );
#endif
    }

    LUT(int s, T * source, int flags = 0xfffffff)
    {
#ifndef NDEBUG

        if (s <= 0) {
            printf("s<=0!\n");
        }

        assert (s > 0);

        if (!source) {
            printf("source is NULL!\n");
        }

        assert (source != nullptr);
#endif
        dirty = false;  // Assumption
        clip = flags;
        data = new T[s];
        owner = 1;
        size = s;
        upperBound = size - 1;
        maxs = size - 2;
        maxsf = (float)maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ )
        maxsv =  F2V( size - 2);
        sizeiv =  _mm_set1_epi32( (int)(size - 1) );
        sizev = F2V( size - 1 );
#endif

        for (int i = 0; i < s; i++) {
            data[i] = source[i];
        }
    }

    LUT()
    {
        data = nullptr;
        reset();
    }

    ~LUT()
    {
        if (owner) {
            delete[] data;
#ifndef NDEBUG
            data = (T*)0xBAADF00D;
#endif
        }
    }

    void setClip(int flags)
    {
        clip = flags;
    }

    /** @brief Get the number of element in the LUT (i.e. dimension of the array)
     *  For a LUT(500), it will return 500
     *  @return number of element in the array
     */
    int getSize()
    {
        return size;
    }

    /** @brief Get the highest value possible (i.e. dimension of the array)
     *  For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
     *  @return number of element in the array
     */
    int getUpperBound()
    {
        return size > 0 ? upperBound : 0;
    }

    LUT<T> & operator=(LUT<T> &rhs)
    {
        if (this != &rhs) {
            if (rhs.size > this->size) {
                delete [] this->data;
                this->data = nullptr;
            }

            if (this->data == nullptr) {
                this->data = new T[rhs.size];
            }

            this->clip = rhs.clip;
            this->owner = 1;
            memcpy(this->data, rhs.data, rhs.size * sizeof(T));
            this->size = rhs.size;
            this->upperBound = rhs.upperBound;
            this->maxs = this->size - 2;
            this->maxsf = (float)this->maxs;
#if defined( __SSE2__ ) && defined( __x86_64__ )
            this->maxsv =  F2V( this->size - 2);
            this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
            this->sizev = F2V( this->size - 1 );
#endif
        }

        return *this;
    }

    // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (unsigned int).
    LUT<T> & operator+=(LUT<T> &rhs)
    {
        if (rhs.size == this->size) {
#ifdef _RT_NESTED_OPENMP // temporary solution to fix Issue #3324
            #pragma omp simd
#endif

            for(unsigned int i = 0; i < this->size; i++) {
                data[i] += rhs.data[i];
            }
        }

        return *this;
    }

    // use with integer indices
    T& operator[](int index) const
    {
        return data[ rtengine::LIM<int>(index, 0, upperBound) ];
    }

#if defined( __SSE2__ ) && defined( __x86_64__ )
/*
    vfloat operator[](vfloat indexv ) const
    {
//      printf("don't use this operator. It's not ready for production");
        return _mm_setzero_ps();

        // convert floats to ints
        vint idxv =  _mm_cvttps_epi32( indexv );
        vfloat tempv, resultv, p1v, p2v;
        vmask maxmask = vmaskf_gt(indexv, maxsv);
        idxv = _mm_castps_si128(vself(maxmask, maxsv, _mm_castsi128_ps(idxv)));
        vmask minmask = vmaskf_lt(indexv, _mm_setzero_ps());
        idxv = _mm_castps_si128(vself(minmask, _mm_setzero_ps(), _mm_castsi128_ps(idxv)));
        // access the LUT 4 times and shuffle the values into p1v and p2v

        int idx;

        // get 4th value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
        tempv = LVFU(data[idx]);
        p1v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(0, 0, 0, 0));
        p2v = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
        // now p1v is 3 3 3 3
        //     p2v is 3 3 3 3

        // get 3rd value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
        tempv = LVFU(data[idx]);
        p1v = _mm_move_ss( p1v, tempv);
        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
        p2v = _mm_move_ss( p2v, tempv);
        // now p1v is 3 3 3 2
        //     p2v is 3 3 3 2

        // get 2nd value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
        tempv = LVFU(data[idx]);
        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(1, 0, 1, 0));
        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(1, 0, 1, 0));
        // now p1v is 3 2 3 2
        // now p2v is 3 2 3 2
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 3 1
        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
        p2v = _mm_move_ss( p2v, tempv);
        // now p1v is 3 2 3 1

        // get 1st value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(0, 0, 0, 0)));
        tempv = LVFU(data[idx]);
        p1v = _mm_shuffle_ps( p1v, p1v, _MM_SHUFFLE(3, 2, 0, 0));
        // now p1v is 3 2 1 1
        p2v = _mm_shuffle_ps( p2v, p2v, _MM_SHUFFLE(3, 2, 0, 0));
        // now p2v is 3 2 1 1
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 1 0
        tempv = _mm_shuffle_ps(tempv, tempv, _MM_SHUFFLE(1, 1, 1, 1));
        p2v = _mm_move_ss( p2v, tempv);
        // now p2v is 3 2 1 0

        vfloat diffv = indexv - _mm_cvtepi32_ps ( idxv );
        diffv = vself(vorm(maxmask, minmask), _mm_setzero_ps(), diffv);
        resultv = p1v + p2v * diffv;
        return resultv  ;
    }
*/
#ifdef __SSE4_1__
    vfloat operator[](vint idxv ) const
    {
        vfloat tempv, p1v;
        idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
        // access the LUT 4 times and shuffle the values into p1v

        int idx;

        // get 4th value
        idx = _mm_extract_epi32(idxv, 3);
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
        // now p1v is 3 3 3 3

        // get 3rd value
        idx = _mm_extract_epi32(idxv, 2);
        tempv = _mm_load_ss(&data[idx]);
        p1v = _mm_move_ss( p1v, tempv);
        // now p1v is 3 3 3 2

        // get 2nd value
        idx = _mm_extract_epi32(idxv, 1);
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
        // now p1v is 3 2 3 2
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 3 1

        // get 1st value
        idx = _mm_cvtsi128_si32(idxv);
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
        // now p1v is 3 2 1 1
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 1 0

        return p1v;
    }
#else
    vfloat operator[](vint idxv ) const
    {
        vfloat tempv, p1v;
        tempv = _mm_cvtepi32_ps(idxv);
        tempv = _mm_min_ps( tempv, sizev );
        idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( )  ));
        // access the LUT 4 times and shuffle the values into p1v

        int idx;

        // get 4th value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)));
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS(tempv, _MM_SHUFFLE(0, 0, 0, 0));
        // now p1v is 3 3 3 3

        // get 3rd value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)));
        tempv = _mm_load_ss(&data[idx]);
        p1v = _mm_move_ss( p1v, tempv);
        // now p1v is 3 3 3 2

        // get 2nd value
        idx = _mm_cvtsi128_si32 (_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)));
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(1, 0, 1, 0));
        // now p1v is 3 2 3 2
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 3 1

        // get 1st value
        idx = _mm_cvtsi128_si32 (idxv);
        tempv = _mm_load_ss(&data[idx]);
        p1v = PERMUTEPS( p1v, _MM_SHUFFLE(3, 2, 0, 0));
        // now p1v is 3 2 1 1
        p1v = _mm_move_ss( p1v, tempv );
        // now p1v is 3 2 1 0

        return p1v;
    }
#endif
#endif

    // use with float indices
    T operator[](float index) const
    {
        int idx = (int)index;  // don't use floor! The difference in negative space is no problems here

        if (index < 0.f) {
            if (clip & LUT_CLIP_BELOW) {
                return data[0];
            }

            idx = 0;
        } else if (index > maxsf) {
            if (clip & LUT_CLIP_ABOVE) {
                return data[upperBound];
            }

            idx = maxs;
        }

        float diff = index - (float) idx;
        T p1 = data[idx];
        T p2 = data[idx + 1] - p1;
        return (p1 + p2 * diff);
    }

    // Return the value for "index" that is in the [0-1] range.
    T getVal01 (float index) const
    {
        index *= float(upperBound);
        int idx = (int)index;  // don't use floor! The difference in negative space is no problems here

        if (index < 0.f) {
            if (clip & LUT_CLIP_BELOW) {
                return data[0];
            }

            idx = 0;
        } else if (index > maxsf) {
            if (clip & LUT_CLIP_ABOVE) {
                return data[upperBound];
            }

            idx = maxs;
        }

        float diff = index - (float) idx;
        T p1 = data[idx];
        T p2 = data[idx + 1] - p1;
        return (p1 + p2 * diff);
    }

#ifndef NDEBUG
    // Debug facility ; dump the content of the LUT in a file. No control of the filename is done
    void dump(Glib::ustring fname)
    {
        if (size) {
            Glib::ustring fname_ = fname + ".xyz"; // TopSolid'Design "plot" file format
            std::ofstream f (fname_.c_str());
            f << "$" << std::endl;

            for (unsigned int iter = 0; iter < size; iter++) {
                f << iter << ", " << data[iter] << ", 0." << std::endl;
            }

            f << "$" << std::endl;
            f.close ();
        }
    }
#endif


    operator bool (void) const
    {
        return size > 0;
    }

    void clear(void)
    {
        if (data && size) {
            memset(data, 0, size * sizeof(T));
        }
    }

    void reset(void)
    {
        if (data) {
            delete[] data;
        }

        dirty = true;
        data = nullptr;
        owner = 1;
        size = 0;
        upperBound = 0;
        maxs = 0;
    }
};


// TODO: HOMBRE: HueLUT is actually unused, could we delete this class now that LUT::getVal01 has been created?


/** @brief LUT subclass handling hue values specifically.
    The array has a fixed size of 501 float values and have to be in the [0.; 1.] range in both axis.
    Float lookups interpolate along the shortest way around the hue circle, i.e. when two
    neighbouring values differ by more than 0.5 the interpolation wraps through 1 -> 0.
    The array must have been created (constructor argument or create()) before any lookup. */
class HueLUT : public LUTf
{
public:
    /// Create an empty HueLUT; call create() before using it.
    HueLUT() : LUTf() {}

    /// Create a HueLUT and, when `createArray` is true, allocate its 501 elements right away.
    explicit HueLUT(bool createArray) : LUTf()
    {
        if (createArray) {
            create();
        }
    }

    /// (Re)allocate the 501-element array with clipping enabled on both sides.
    void create()
    {
        this->operator () (501, LUT_CLIP_BELOW | LUT_CLIP_ABOVE);
    }

    // use with integer indices; the index is clamped to [0, upperBound]
    float& operator[](int index) const
    {
        return data[ rtengine::LIM<int>(index, 0, upperBound) ];
    }

    // use with float indices in the [0.;1.] range
    // Indices outside the range (and NaN) return the first / last element.
    float operator[](float index) const
    {
        if (!(index >= 0.f)) {  // negative or NaN
            return data[0];
        }

        // ">=" and not ">": index == 1 maps to the last node, which has no
        // right-hand neighbour to interpolate with (data[501] does not exist).
        if (index >= 1.f) {
            return data[upperBound];
        }

        const float pos = index * 500.f;
        const int idx = int(pos); // don't use floor! pos is >= 0 here, truncation is equivalent

        // Fractional position between node idx and node idx + 1, in [0, 1).
        const float balance = pos - float(idx);
        float h1 = data[idx];
        float h2 = data[idx + 1];

        if (h1 == h2) {
            return h1;
        }

        if ((h1 > h2) && (h1 - h2 > 0.5f)) {
            // shortest path crosses the 1 -> 0 seam going upwards: shift h1 one turn down
            h1 -= 1.f;
        } else if (h2 - h1 > 0.5f) {
            // shortest path crosses the seam going downwards: shift h2 one turn down
            h2 -= 1.f;
        } else {
            return h1 + balance * (h2 - h1);
        }

        float value = h1 + balance * (h2 - h1);

        // bring the wrapped result back into [0, 1)
        if (value < 0.f) {
            value += 1.f;
        }

        return value;
    }
};


#endif /* LUT_H_ */