From e495093b187b647b6585604d2827fb0eb1776871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sat, 16 Apr 2016 22:20:18 +0200 Subject: [PATCH 01/15] Clean up clutstore.* and add LRU cache This commit adds a true LRU cache to `rtengine` which is used in the new `CLUTStore` class. The code in `clutstore.*` was cleaned up with C++11 features and small optimizations taken from my `clutbench` project. The `CLUTStore` class was converted to a true singleton. --- rtengine/cache.h | 236 ++++++++++++++++ rtengine/clutstore.cc | 571 ++++++++++++-------------------------- rtengine/clutstore.h | 119 +++----- rtengine/improcfun.cc | 14 +- rtengine/simpleprocess.cc | 2 +- rtgui/filmsimulation.cc | 6 +- 6 files changed, 476 insertions(+), 472 deletions(-) create mode 100644 rtengine/cache.h diff --git a/rtengine/cache.h b/rtengine/cache.h new file mode 100644 index 000000000..2e53aab2a --- /dev/null +++ b/rtengine/cache.h @@ -0,0 +1,236 @@ +/* + * This file is part of RawTherapee. + * + * Copyright (c) 2016 Flössie + * + * RawTherapee is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RawTherapee is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RawTherapee. If not, see . + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "../rtgui/threadutils.h" + +namespace rtengine +{ + +namespace cache_helper +{ + + // See http://stackoverflow.com/a/20790050 + template + struct has_hash + : std::false_type + { + }; + + template + struct has_hash()(std::declval()), void())> + : std::true_type + { + }; + +} + +template +class Cache +{ +public: + class Hook + { + public: + virtual ~Hook() + { + } + virtual void onDiscard(const K& key, const V& value) = 0; + virtual void onDisplace(const K& key, const V& value) = 0; + virtual void onRemove(const K& key, const V& value) = 0; + virtual void onDestroy() = 0; + }; + + Cache(unsigned long _size, Hook* _hook = 0) : + store_size(_size), + hook(_hook) + { + } + + ~Cache() + { + if (hook) { + resize(0); + hook->onDestroy(); + } + } + + bool get(const K& key, V& value) const + { + mutex.lock(); + const StoreConstIterator store_it = store.find(key); + const bool present = store_it != store.end(); + if (present) { + lru_list.splice( + lru_list.begin(), + lru_list, + store_it->second.lru_list_it + ); + value = store_it->second.value; + } + mutex.unlock(); + + return present; + } + + bool set(const K& key, const V& value) + { + return set(key, value, Mode::UNCOND); + } + + bool replace(const K& key, const V& value) + { + return set(key, value, Mode::KNOWN); + } + + bool insert(const K& key, const V& value) + { + return set(key, value, Mode::UNKNOWN); + } + + bool remove(const K& key) + { + mutex.lock(); + const StoreIterator store_it = store.find(key); + const bool present = store_it != store.end(); + if (present) { + remove(store_it); + } + mutex.unlock(); + + return present; + } + + void resize(unsigned long size) + { + mutex.lock(); + while (lru_list.size() > size) { + discard(); + } + store_size = size; + mutex.unlock(); + } + + void clear() + { + mutex.lock(); + if (hook) { + for (const auto& entry : store) { + hook->onRemove(entry.first, entry.second.value); + } + } + lru_list.clear(); + store.clear(); + mutex.unlock(); + } + +private: + struct Value; + + using Store = typename std::conditional< + cache_helper::has_hash::value, + std::unordered_map, + std::map + >::type; + using StoreIterator = typename Store::iterator; + using StoreConstIterator = typename Store::const_iterator; + + typedef std::list LruList; + using LruListIterator = typename LruList::iterator; + + struct Value { + V value; + LruListIterator lru_list_it; + }; + + enum class Mode { + UNCOND, + KNOWN, + UNKNOWN + }; + + void discard() + { + const StoreIterator store_it = lru_list.back(); + if (hook) { + hook->onDiscard(store_it->first, store_it->second.value); + } + store.erase(store_it); + lru_list.pop_back(); + } + + bool set(const K& key, const V& value, Mode mode) + { + mutex.lock(); + const StoreIterator store_it = store.find(key); + const bool is_new_key = store_it == store.end(); + if (is_new_key) { + if (mode == Mode::UNCOND || mode == Mode::UNKNOWN) { + if (lru_list.size() >= store_size) { + discard(); + } + lru_list.push_front(store.end()); + const Value v = { + value, + lru_list.begin() + }; + lru_list.front() = store.emplace(key, v).first; + } + } else { + if (mode == Mode::UNCOND || mode == Mode::KNOWN) { + if (hook) { + hook->onDisplace(key, store_it->second.value); + } + lru_list.splice( + lru_list.begin(), + lru_list, + store_it->second.lru_list_it + ); + store_it->second.value = value; + } + } + mutex.unlock(); + + return is_new_key; + } + + void remove(const StoreIterator& store_it) + { + if (hook) { + hook->onRemove(store_it->first, store_it->second.value); + } + lru_list.erase(store_it->second.lru_list_it); + store.erase(store_it); + } + + unsigned long store_size; + Hook* const hook; + mutable MyMutex mutex; + Store store; + mutable LruList lru_list; +}; + +} diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index a0ea5afb4..533d1e89d 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -3,440 +3,233 @@ #include "stdimagesource.h" #include "../rtgui/options.h" -rtengine::CLUTStore clutStore; - -using namespace rtengine; - -const float MAXVAL8 = 255.; - -CLUTStore::CLUTStore() +namespace { -} -CLUT* CLUTStore::getClut( const Glib::ustring& filename ) +std::unique_ptr loadFile( + const Glib::ustring& filename, + const Glib::ustring& working_color_space, + unsigned int& clut_level +) { - CLUT *result = 0; - m_mutex.lock(); - Cluts::iterator cluts_it = m_cluts.find(filename); + std::unique_ptr result; - if (cluts_it == m_cluts.end()) { - if (m_cluts.size() >= options.clutCacheSize) { - // Evict a "random" entry from cache - Cluts::iterator victim_it = m_cluts.begin(); + rtengine::StdImageSource img_src; - if (--victim_it->second.first == -1) { - delete victim_it->second.second; - m_cluts.erase(victim_it); - } - } - - cluts_it = m_cluts.insert(std::make_pair(filename, std::make_pair(0, new HaldCLUT))).first; - cluts_it->second.second->load( filename ); - } - - if (cluts_it->second.second->isValid()) { - result = cluts_it->second.second; - ++cluts_it->second.first; - } else { - delete cluts_it->second.second; - m_cluts.erase(cluts_it); - } - - m_mutex.unlock(); - - return result; -} - -void CLUTStore::releaseClut( const CLUT* clut ) -{ - m_mutex.lock(); - - for (Cluts::iterator cluts_it = m_cluts.begin(); cluts_it != m_cluts.end(); ++cluts_it) { - if (cluts_it->second.second == clut) { - if (--cluts_it->second.first == -1) { - delete cluts_it->second.second; - m_cluts.erase(cluts_it); - } - - break; - } - } - - m_mutex.unlock(); -} - -void CLUTStore::clearCache() -{ - m_mutex.lock(); - - for (Cluts::iterator cluts_it = m_cluts.begin(); cluts_it != m_cluts.end();) { - if (--cluts_it->second.first == -1) { - delete cluts_it->second.second; - Cluts::iterator tmp = cluts_it; - ++cluts_it; - m_cluts.erase(tmp); - } else { - ++cluts_it; - } - } - - m_mutex.unlock(); -} - -void rtengine::splitClutFilename( Glib::ustring filename, Glib::ustring &name, Glib::ustring &extension, Glib::ustring &profileName ) -{ - filename = Glib::path_get_basename( filename ); - name = filename; - //remove dirs - size_t lastSlashPos = filename.find_last_of( "/" ); - - if ( lastSlashPos == Glib::ustring::npos ) { - lastSlashPos = filename.find_last_of( "\\" ); - } - - size_t lastDotPos = filename.find_last_of( '.' ); - - if ( lastDotPos != Glib::ustring::npos ) { - name = filename.substr( 0, lastDotPos ); - extension = filename.substr( lastDotPos + 1, Glib::ustring::npos ); - } - - profileName = "sRGB"; // sRGB by default - static std::vector workingProfiles = rtengine::getWorkingProfiles(); - - for ( std::vector::iterator it = workingProfiles.begin(); it != workingProfiles.end(); ++it ) { - Glib::ustring ¤tProfile = *it; - - if ( std::search( name.rbegin(), name.rend(), currentProfile.rbegin(), currentProfile.rend() ) == name.rbegin() ) { - profileName = currentProfile; - name = name.substr( 0, name.size() - currentProfile.size() ); - break; - } - } -} - -//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: - -HaldCLUT::HaldCLUT() - : m_clutImage( 0 ), - m_level (0), - m_profile( "sRGB" ) -{ -} - -HaldCLUT::~HaldCLUT() -{ - if ( m_clutImage ) { - m_clutImage->free(); - m_clutImage = 0; - } -} - -void HaldCLUT::load( Glib::ustring filename ) -{ - m_clutImage = loadFile( filename, "", m_level ); - Glib::ustring name, ext; - splitClutFilename( filename, name, ext, m_profile ); - - if ( m_clutImage ) { - m_filename = filename; - } -} - -Glib::ustring HaldCLUT::profile() const -{ - return m_profile; -} - -Imagefloat* HaldCLUT::loadFile( Glib::ustring filename, Glib::ustring workingColorSpace, int &outLevel ) -{ - Imagefloat *result = 0; - StdImageSource imgSrc; - - if ( !Glib::file_test( filename, Glib::FILE_TEST_EXISTS ) || imgSrc.load(filename) ) { + if (!Glib::file_test(filename, Glib::FILE_TEST_EXISTS) || img_src.load(filename)) { return result; } int fw, fh; - imgSrc.getFullSize (fw, fh, TR_NONE); + img_src.getFullSize(fw, fh, TR_NONE); bool valid = false; - //test on Hald format, copypasted from http://www.quelsolaar.com/technology/clut.html - if ( fw == fh ) { - outLevel = 1; - - for(; outLevel * outLevel * outLevel < fw; outLevel++); - - if( !( outLevel * outLevel * outLevel > fw ) ) { + if (fw == fh) { + unsigned int level = 1; + while (level * level * level < fw) { + ++level; + } + if (level * level * level == fw && level > 1) { + clut_level = level; valid = true; } } - if ( valid ) { - ColorTemp currWB = imgSrc.getWB(); - Imagefloat* baseImg = new Imagefloat (fw, fh); - PreviewProps pp (0, 0, fw, fh, 1); + if (valid) { + rtengine::ColorTemp curr_wb = img_src.getWB(); + result = std::unique_ptr(new rtengine::Imagefloat(fw, fh)); + const PreviewProps pp(0, 0, fw, fh, 1); - procparams::ColorManagementParams icm; - icm.working = workingColorSpace; + rtengine::procparams::ColorManagementParams icm; + icm.working = working_color_space; - imgSrc.getImage (currWB, TR_NONE, baseImg, pp, procparams::ToneCurveParams(), icm, procparams::RAWParams()); + img_src.getImage(curr_wb, TR_NONE, result.get(), pp, rtengine::procparams::ToneCurveParams(), icm, rtengine::procparams::RAWParams()); - if ( !workingColorSpace.empty() ) { - imgSrc.convertColorSpace(baseImg, icm, currWB); + if (!working_color_space.empty()) { + img_src.convertColorSpace(result.get(), icm, curr_wb); } - - result = baseImg; } return result; } -void HaldCLUT::loadClut( Imagefloat *img, RawClut &outClut ) +inline void posToXy(unsigned int pos, unsigned int width, unsigned int (&x)[2], unsigned int (&y)[2]) { - img->normalizeFloatTo1(); - int y_size = img->getH(); - int x_size = img->getW(); - outClut.resize( x_size * y_size * 3 ); - int clutIdx = 0; + x[0] = pos % width; + y[0] = pos / width; + x[1] = (pos + 1) % width; + y[1] = (pos + 1) / width; +} - //int level = m_level * m_level; (unused) - for(int y = 0; y < y_size; y++) { - for(int x = 0; x < x_size; x++) { - outClut[ clutIdx * 3 ] = img->r( y, x ) * MAXVAL8; - outClut[ clutIdx * 3 + 1 ] = img->g( y, x ) * MAXVAL8; - outClut[ clutIdx * 3 + 2 ] = img->b( y, x ) * MAXVAL8; +} - ++clutIdx; +void rtengine::CLUT::splitClutFilename( + const Glib::ustring& filename, + Glib::ustring& name, + Glib::ustring& extension, + Glib::ustring& profile_name +) +{ + Glib::ustring basename = Glib::path_get_basename(filename); + + Glib::ustring::size_type last_slash_pos = basename.rfind('/'); + if (last_slash_pos == Glib::ustring::npos) { + last_slash_pos = basename.rfind('\\'); + } + + const Glib::ustring::size_type last_dot_pos = basename.rfind('.'); + + if (last_dot_pos != Glib::ustring::npos) { + name.assign(basename, 0, last_dot_pos); + extension.assign(basename, last_dot_pos + 1, Glib::ustring::npos); + } else { + name = basename; + } + + profile_name = "sRGB"; + + for (const auto& working_profile : rtengine::getWorkingProfiles()) { + if ( std::search( name.rbegin(), name.rend(), working_profile.rbegin(), working_profile.rend() ) == name.rbegin() ) { + profile_name = working_profile; + name.erase(name.size() - working_profile.size()); + break; } } } -Imagefloat* HaldCLUT::generateIdentImage( int level ) +rtengine::HaldCLUT::HaldCLUT() : + clut_level(0), + clut_profile("sRGB") { - int imageWidth = level * level * level; - Imagefloat *resultImg = new Imagefloat( imageWidth, imageWidth ); +} - int cubeSideSize = level * level; - float step = MAXVALF / (cubeSideSize - 1); - int pos = 0; +rtengine::HaldCLUT::~HaldCLUT() +{ +} - for( int b = 0; b < cubeSideSize; ++b ) { - for ( int g = 0; g < cubeSideSize; ++g ) { - for ( int r = 0; r < cubeSideSize; ++r ) { - int x = pos / imageWidth; - int y = pos % imageWidth; - resultImg->r( x, y ) = step * r; - resultImg->g( x, y ) = step * g; - resultImg->b( x, y ) = step * b; - ++pos; - } +bool rtengine::HaldCLUT::load(const Glib::ustring& filename) +{ + clut_image = loadFile(filename, "", clut_level); + Glib::ustring name, ext; + splitClutFilename(filename, name, ext, clut_profile); + + if (clut_image) { + clut_filename = filename; + return true; + } + + return false; +} + +rtengine::HaldCLUT::operator bool() const +{ + return static_cast(clut_image); +} + +Glib::ustring rtengine::HaldCLUT::getFilename() const +{ + return clut_filename; +} + +Glib::ustring rtengine::HaldCLUT::getProfile() const +{ + return clut_profile; +} + +void rtengine::HaldCLUT::getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const +{ + const unsigned int level = clut_level * clut_level; + + const float flevel_minus_one = static_cast(level - 1) / 65535.0f; + const float flevel_minus_two = static_cast(level - 2); + + const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); + const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); + const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one); + + r = r * flevel_minus_one - red; + g = g * flevel_minus_one - green; + b = b * flevel_minus_one - blue; + + const unsigned int level_square = level * level; + + const unsigned int color = red + green * level + blue * level_square; + + unsigned int x[2]; + unsigned int y[2]; + posToXy(color, clut_image->getWidth(), x, y); + + float tmp1[4] __attribute__((aligned(16))); + tmp1[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; + tmp1[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; + tmp1[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + + posToXy(color + level, clut_image->getWidth(), x, y); + + float tmp2[4] __attribute__((aligned(16))); + tmp2[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; + tmp2[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; + tmp2[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + + float out[4] __attribute__((aligned(16))); + out[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + out[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + out[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + + posToXy(color + level_square, clut_image->getWidth(), x, y); + + tmp1[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; + tmp1[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; + tmp1[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + + posToXy(color + level + level_square, clut_image->getWidth(), x, y); + + tmp2[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; + tmp2[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; + tmp2[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + + tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + + out_r = out[0] * (1 - b) + tmp1[0] * b; + out_g = out[1] * (1 - b) + tmp1[1] * b; + out_b = out[2] * (1 - b) + tmp1[2] * b; +} + +rtengine::CLUTStore& rtengine::CLUTStore::getInstance() +{ + static CLUTStore instance; + return instance; +} + +std::shared_ptr rtengine::CLUTStore::getClut(const Glib::ustring& filename) +{ + std::shared_ptr result; + + if (!cache.get(filename, result)) { + std::unique_ptr clut(new rtengine::HaldCLUT); + if (clut->load(filename)) { + result = std::move(clut); + cache.insert(filename, result); } } - return resultImg; + return result; } - -bool HaldCLUT::isValid() const +void rtengine::CLUTStore::releaseClut(const std::shared_ptr& clut) { - return m_clutImage != 0; + cache.remove(clut->getFilename()); } -void HaldCLUT::getRGB( float rr, float gg, float bb, float &outR, float &outG, float &outB ) const +void rtengine::CLUTStore::clearCache() { - rr /= MAXVALF; - gg /= MAXVALF; - bb /= MAXVALF; - correct( *m_clutImage, m_level, rr, gg, bb, outR, outG, outB ); + cache.clear(); } -inline float valF( unsigned char val ) +rtengine::CLUTStore::CLUTStore() : + cache(options.clutCacheSize) { - return float( val ) / MAXVAL8; -} - -// copypasted from http://www.quelsolaar.com/technology/clut.html -void HaldCLUT::correct( const HaldCLUT::RawClut& clut, int level, float rr, float gg, float bb, float &outR, float &outG, float &outB ) -{ - int color, red, green, blue, i, j; - float tmp[6], r, g, b; - level = level * level; - - red = rr * (float)(level - 1); - - if(red > level - 2) { - red = (float)level - 2; - } - - if(red < 0) { - red = 0; - } - - green = gg * (float)(level - 1); - - if(green > level - 2) { - green = (float)level - 2; - } - - if(green < 0) { - green = 0; - } - - blue = bb * (float)(level - 1); - - if(blue > level - 2) { - blue = (float)level - 2; - } - - if(blue < 0) { - blue = 0; - } - - r = rr * (float)(level - 1) - red; - g = gg * (float)(level - 1) - green; - b = bb * (float)(level - 1) - blue; - - color = red + green * level + blue * level * level; - - i = color * 3; - j = (color + 1) * 3; - - tmp[0] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[1] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[2] = valF( clut[i] ) * (1 - r) + valF( clut[j] ) * r; - - i = (color + level) * 3; - j = (color + level + 1) * 3; - - tmp[3] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[4] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[5] = valF( clut[i] ) * (1 - r) + valF( clut[j] ) * r; - - outR = tmp[0] * (1 - g) + tmp[3] * g; - outG = tmp[1] * (1 - g) + tmp[4] * g; - outB = tmp[2] * (1 - g) + tmp[5] * g; - - i = (color + level * level) * 3; - j = (color + level * level + 1) * 3; - - tmp[0] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[1] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[2] = valF( clut[i] ) * (1 - r) + valF( clut[j] ) * r; - - i = (color + level + level * level) * 3; - j = (color + level + level * level + 1) * 3; - - tmp[3] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[4] = valF( clut[i++] ) * (1 - r) + valF( clut[j++] ) * r; - tmp[5] = valF( clut[i] ) * (1 - r) + valF( clut[j] ) * r; - - tmp[0] = tmp[0] * (1 - g) + tmp[3] * g; - tmp[1] = tmp[1] * (1 - g) + tmp[4] * g; - tmp[2] = tmp[2] * (1 - g) + tmp[5] * g; - - outR = outR * (1 - b) + tmp[0] * b; - outG = outG * (1 - b) + tmp[1] * b; - outB = outB * (1 - b) + tmp[2] * b; -} - -inline void pos2xy( int pos, int imageSideSize, int &outX, int &outY ) -{ - outX = pos / imageSideSize; - outY = pos % imageSideSize; -} - -void HaldCLUT::correct( Imagefloat &clutImage, int level, float rr, float gg, float bb, float &outR, float &outG, float &outB ) -{ - int color, red, green, blue, i, j; - float tmp[6], r, g, b; - level = level * level; - int imageSideSize = clutImage.getW(); - - red = rr * (float)(level - 1); - - if(red > level - 2) { - red = (float)level - 2; - } - - if(red < 0) { - red = 0; - } - - green = gg * (float)(level - 1); - - if(green > level - 2) { - green = (float)level - 2; - } - - if(green < 0) { - green = 0; - } - - blue = bb * (float)(level - 1); - - if(blue > level - 2) { - blue = (float)level - 2; - } - - if(blue < 0) { - blue = 0; - } - - r = rr * (float)(level - 1) - red; - g = gg * (float)(level - 1) - green; - b = bb * (float)(level - 1) - blue; - - color = red + green * level + blue * level * level; - - - i = color; - j = color + 1; - int xi, yi, xj, yj; - pos2xy( i, imageSideSize, xi, yi ); - pos2xy( j, imageSideSize, xj, yj ); - - tmp[0] = clutImage.r( xi, yi ) * (1 - r) + clutImage.r( xj, yj ) * r; - tmp[1] = clutImage.g( xi, yi ) * (1 - r) + clutImage.g( xj, yj ) * r; - tmp[2] = clutImage.b( xi, yi ) * (1 - r) + clutImage.b( xj, yj ) * r; - - i = color + level; - j = color + level + 1; - pos2xy( i, imageSideSize, xi, yi ); - pos2xy( j, imageSideSize, xj, yj ); - - tmp[3] = clutImage.r( xi, yi ) * (1 - r) + clutImage.r( xj, yj ) * r; - tmp[4] = clutImage.g( xi, yi ) * (1 - r) + clutImage.g( xj, yj ) * r; - tmp[5] = clutImage.b( xi, yi ) * (1 - r) + clutImage.b( xj, yj ) * r; - - outR = tmp[0] * (1 - g) + tmp[3] * g; - outG = tmp[1] * (1 - g) + tmp[4] * g; - outB = tmp[2] * (1 - g) + tmp[5] * g; - - i = color + level * level; - j = color + level * level + 1; - pos2xy( i, imageSideSize, xi, yi ); - pos2xy( j, imageSideSize, xj, yj ); - - tmp[0] = clutImage.r( xi, yi ) * (1 - r) + clutImage.r( xj, yj ) * r; - tmp[1] = clutImage.g( xi, yi ) * (1 - r) + clutImage.g( xj, yj ) * r; - tmp[2] = clutImage.b( xi, yi ) * (1 - r) + clutImage.b( xj, yj ) * r; - - i = color + level + level * level; - j = color + level + level * level + 1; - pos2xy( i, imageSideSize, xi, yi ); - pos2xy( j, imageSideSize, xj, yj ); - - tmp[3] = clutImage.r( xi, yi ) * (1 - r) + clutImage.r( xj, yj ) * r; - tmp[4] = clutImage.g( xi, yi ) * (1 - r) + clutImage.g( xj, yj ) * r; - tmp[5] = clutImage.b( xi, yi ) * (1 - r) + clutImage.b( xj, yj ) * r; - - tmp[0] = tmp[0] * (1 - g) + tmp[3] * g; - tmp[1] = tmp[1] * (1 - g) + tmp[4] * g; - tmp[2] = tmp[2] * (1 - g) + tmp[5] * g; - - outR = outR * (1 - b) + tmp[0] * b; - outG = outG * (1 - b) + tmp[1] * b; - outB = outB * (1 - b) + tmp[2] * b; } diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index de080b737..ce8fd9627 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -1,107 +1,78 @@ -#ifndef CLUT_STORE_INCLUDED -#define CLUT_STORE_INCLUDED +#pragma once + +#include #include -#include "../rtgui/threadutils.h" + #include "imagefloat.h" -#include -#include +#include "cache.h" namespace rtengine { -// simple CLUT interface class CLUT { public: - virtual void getRGB( float r, float g, float b, float &outR, float &outG, float &outB ) const = 0; - virtual Glib::ustring profile() const = 0; -protected: - virtual ~CLUT() {}; + CLUT() = default; + CLUT(const CLUT& other) = delete; + CLUT& operator =(const CLUT& other) = delete; + virtual ~CLUT() = default; + + virtual explicit operator bool() const = 0; + + virtual Glib::ustring getFilename() const = 0; + virtual Glib::ustring getProfile() const = 0; + + virtual void getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const = 0; + + static void splitClutFilename( + const Glib::ustring& filename, + Glib::ustring& name, + Glib::ustring& extension, + Glib::ustring& profile_name + ); }; -class HaldCLUT : public CLUT +class HaldCLUT + : public CLUT { public: HaldCLUT(); ~HaldCLUT(); - void load( Glib::ustring filename ); - bool isValid() const; - void getRGB( float r, float g, float b, float &outR, float &outG, float &outB ) const; - Glib::ustring profile() const; + bool load(const Glib::ustring& filename); - typedef std::vector RawClut; // using 8 bit for reduce memory usage - static void correct( const RawClut&, int level, float r, float g, float b, float &outR, float &outG, float &outB ); - static void correct( Imagefloat &clutImage, int level, float rr, float gg, float bb, float &outR, float &outG, float &outB ); - static Imagefloat* generateIdentImage( int level ); - static Imagefloat* loadFile( Glib::ustring filename, Glib::ustring workingColorSpace, int &outLevel ); + explicit operator bool() const; + + Glib::ustring getFilename() const; + Glib::ustring getProfile() const; + + void getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const; private: - - void loadClut( Imagefloat *img, RawClut &outClut ); - - Imagefloat *m_clutImage; - int m_level; - Glib::ustring m_filename; - Glib::ustring m_profile; + std::unique_ptr clut_image; + unsigned int clut_level; + Glib::ustring clut_filename; + Glib::ustring clut_profile; }; -// CLUT cache class CLUTStore { public: - CLUTStore(); + static CLUTStore& getInstance(); - CLUT* getClut( const Glib::ustring& filename ); - void releaseClut( const CLUT* clut ); + CLUTStore(const CLUTStore& other) = delete; + CLUTStore& operator =(const CLUTStore& other) = delete; + + std::shared_ptr getClut(const Glib::ustring& filename); + void releaseClut(const std::shared_ptr& clut); void clearCache(); private: - typedef std::map > Cluts; + CLUTStore(); - Cluts m_cluts; - MyMutex m_mutex; + Cache> cache; }; -void splitClutFilename( Glib::ustring filename, Glib::ustring &name, Glib::ustring &extension, Glib::ustring &profileName ); - -}; //namespace rtengine - -extern rtengine::CLUTStore clutStore; - -namespace rtengine -{ - -//support class for automate call of clutStore.releaseClut() -class ClutPtr -{ -public: - ClutPtr() : m_point( 0 ) {} - explicit ClutPtr(CLUT *p) : m_point( p ) {} - ~ClutPtr() - { - clutStore.releaseClut( m_point ); - } - const CLUT* operator-> () const - { - return m_point; - } - operator bool() const - { - return m_point != 0; - } - void set( CLUT *p ) - { - m_point = p; - } - -private: - ClutPtr& operator=(ClutPtr const& cp ); - CLUT *m_point; -}; - -}; //namespace rtengine - -#endif +} diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 989d9c6a7..2732c567d 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -3205,21 +3205,21 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - ClutPtr colorLUT; + std::shared_ptr colorLUT; bool clutAndWorkingProfilesAreSame = false; TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work; if ( params->filmSimulation.enabled && !params->filmSimulation.clutFilename.empty() ) { - colorLUT.set( clutStore.getClut( params->filmSimulation.clutFilename ) ); + colorLUT = CLUTStore::getInstance().getClut( params->filmSimulation.clutFilename ); if ( colorLUT ) { - clutAndWorkingProfilesAreSame = colorLUT->profile() == params->icm.working; + clutAndWorkingProfilesAreSame = colorLUT->getProfile() == params->icm.working; if ( !clutAndWorkingProfilesAreSame ) { work2xyz = iccStore->workingSpaceMatrix( params->icm.working ); - xyz2clut = iccStore->workingSpaceInverseMatrix( colorLUT->profile() ); + xyz2clut = iccStore->workingSpaceInverseMatrix( colorLUT->getProfile() ); xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working ); - clut2xyz = iccStore->workingSpaceMatrix( colorLUT->profile() ); + clut2xyz = iccStore->workingSpaceMatrix( colorLUT->getProfile() ); } } } @@ -4337,6 +4337,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer //Film Simulations if ( colorLUT ) { + MyTime start, stop; + start.set(); for (int i = istart, ti = 0; i < tH; i++, ti++) { for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; @@ -4375,6 +4377,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } + stop.set(); + printf("Film simulation took %dus.\n", stop.etime(start)); } diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc index c03addb42..bda0f84b0 100644 --- a/rtengine/simpleprocess.cc +++ b/rtengine/simpleprocess.cc @@ -857,7 +857,7 @@ IImage16* processImage (ProcessingJob* pjob, int& errorCode, ProgressListener* p // if clut was used and size of clut cache == 1 we free the memory used by the clutstore (default clut cache size = 1 for 32 bit OS) if ( params.filmSimulation.enabled && !params.filmSimulation.clutFilename.empty() && options.clutCacheSize == 1) { - clutStore.clearCache(); + CLUTStore::getInstance().clearCache(); } // freeing up some memory diff --git a/rtgui/filmsimulation.cc b/rtgui/filmsimulation.cc index 07f85df94..3ee1f4742 100644 --- a/rtgui/filmsimulation.cc +++ b/rtgui/filmsimulation.cc @@ -72,7 +72,7 @@ void FilmSimulation::onClutSelected() if ( getEnabled() && !currentClutFilename.empty() && listener && currentClutFilename != m_oldClutFilename ) { Glib::ustring clutName, dummy; - splitClutFilename( currentClutFilename, clutName, dummy, dummy ); + CLUT::splitClutFilename( currentClutFilename, clutName, dummy, dummy ); listener->panelChanged( EvFilmSimulationFilename, clutName ); m_oldClutFilename = currentClutFilename; @@ -132,7 +132,7 @@ void FilmSimulation::read( const rtengine::procparams::ProcParams* pp, const Par if ( !get_inconsistent() && !pp->filmSimulation.enabled ) { if (options.clutCacheSize == 1) { - clutStore.clearCache(); + CLUTStore::getInstance().clearCache(); } } @@ -279,7 +279,7 @@ int ClutComboBox::parseDir (const Glib::ustring& path) for (const auto& entry : entries) { Glib::ustring name, extension, profileName; - splitClutFilename (entry, name, extension, profileName); + CLUT::splitClutFilename (entry, name, extension, profileName); extension = extension.casefold (); if (extension.compare ("tif") != 0 && extension.compare ("png") != 0) { From f639cd6b82849a7e3bd3ade4d5e0aaf6507b75c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Mon, 18 Apr 2016 20:38:23 +0200 Subject: [PATCH 02/15] Use Image16 instead of Imagefloat for CLUT Gain speed and reduce memory by using Image16 instead of Imagefloat for the CLUT. --- rtengine/clutstore.cc | 28 ++++++++++++++++++---------- rtengine/clutstore.h | 7 +++++-- rtengine/improcfun.cc | 4 ---- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 533d1e89d..53adcc013 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -1,18 +1,22 @@ +#include + #include "clutstore.h" -#include "rt_math.h" + +#include "image16.h" +#include "imagefloat.h" #include "stdimagesource.h" #include "../rtgui/options.h" namespace { -std::unique_ptr loadFile( +std::unique_ptr loadFile( const Glib::ustring& filename, const Glib::ustring& working_color_space, unsigned int& clut_level ) { - std::unique_ptr result; + std::unique_ptr result; rtengine::StdImageSource img_src; @@ -38,17 +42,19 @@ std::unique_ptr loadFile( if (valid) { rtengine::ColorTemp curr_wb = img_src.getWB(); - result = std::unique_ptr(new rtengine::Imagefloat(fw, fh)); + std::unique_ptr img_float = std::unique_ptr(new rtengine::Imagefloat(fw, fh)); const PreviewProps pp(0, 0, fw, fh, 1); rtengine::procparams::ColorManagementParams icm; icm.working = working_color_space; - img_src.getImage(curr_wb, TR_NONE, result.get(), pp, rtengine::procparams::ToneCurveParams(), icm, rtengine::procparams::RAWParams()); + img_src.getImage(curr_wb, TR_NONE, img_float.get(), pp, rtengine::procparams::ToneCurveParams(), icm, rtengine::procparams::RAWParams()); if (!working_color_space.empty()) { - img_src.convertColorSpace(result.get(), icm, curr_wb); + img_src.convertColorSpace(img_float.get(), icm, curr_wb); } + + result = std::unique_ptr(img_float->to16()); } return result; @@ -100,6 +106,8 @@ void rtengine::CLUT::splitClutFilename( rtengine::HaldCLUT::HaldCLUT() : clut_level(0), + flevel_minus_one(0.0f), + flevel_minus_two(0.0f), clut_profile("sRGB") { } @@ -116,6 +124,9 @@ bool rtengine::HaldCLUT::load(const Glib::ustring& filename) if (clut_image) { clut_filename = filename; + clut_level *= clut_level; + flevel_minus_one = static_cast(clut_level - 1) / 65535.0f; + flevel_minus_two = static_cast(clut_level - 2); return true; } @@ -139,10 +150,7 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const void rtengine::HaldCLUT::getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const { - const unsigned int level = clut_level * clut_level; - - const float flevel_minus_one = static_cast(level - 1) / 65535.0f; - const float flevel_minus_two = static_cast(level - 2); + const unsigned int level = clut_level; // This is important const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index ce8fd9627..b084c67fe 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -4,12 +4,13 @@ #include -#include "imagefloat.h" #include "cache.h" namespace rtengine { +class Image16; + class CLUT { public: @@ -50,8 +51,10 @@ public: void getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const; private: - std::unique_ptr clut_image; + std::unique_ptr clut_image; unsigned int clut_level; + float flevel_minus_one; + float flevel_minus_two; Glib::ustring clut_filename; Glib::ustring clut_profile; }; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 2732c567d..d61c79513 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -4337,8 +4337,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer //Film Simulations if ( colorLUT ) { - MyTime start, stop; - start.set(); for (int i = istart, ti = 0; i < tH; i++, ti++) { for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; @@ -4377,8 +4375,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - stop.set(); - printf("Film simulation took %dus.\n", stop.etime(start)); } From b1a9e968365c5cafcd499dd706268d6f26ddf98d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sat, 23 Apr 2016 22:46:21 +0200 Subject: [PATCH 03/15] Store HaldCLUT as flat RGBx array Instead of using an `Image16`, which is organized in planes, store the HaldCLUT in an `AlignedBuffer` with sequential RGBx values. This gives a speedup of roughly 23% here. --- rtengine/alignedbuffer.h | 31 +++----- rtengine/clutstore.cc | 163 ++++++++++++++++++++++++++------------- rtengine/clutstore.h | 16 ++-- rtengine/improcfun.cc | 10 +-- 4 files changed, 131 insertions(+), 89 deletions(-) diff --git a/rtengine/alignedbuffer.h b/rtengine/alignedbuffer.h index a33d4dfe8..ac8471b7e 100644 --- a/rtengine/alignedbuffer.h +++ b/rtengine/alignedbuffer.h @@ -18,9 +18,10 @@ */ #ifndef _ALIGNEDBUFFER_ #define _ALIGNEDBUFFER_ -#include +#include #include #include +#include #include #include "../rtgui/threadutils.h" @@ -58,7 +59,7 @@ public: /** @brief Return true if there's no memory allocated */ - bool isEmpty() + bool isEmpty() const { return allocatedSize == 0; } @@ -120,28 +121,14 @@ public: void swap(AlignedBuffer &other) { - void *tmpReal = other.real; - other.real = real; - real = tmpReal; - - char tmpAlignt = other.alignment; - other.alignment = alignment; - alignment = tmpAlignt; - - size_t tmpAllocSize = other.allocatedSize; - other.allocatedSize = allocatedSize; - allocatedSize = tmpAllocSize; - - T* tmpData = other.data; - other.data = data; - data = tmpData; - - bool tmpInUse = other.inUse; - other.inUse = inUse; - inUse = tmpInUse; + std::swap(real, other.real); + std::swap(alignment, other.alignment); + std::swap(allocatedSize, other.allocatedSize); + std::swap(data, other.data); + std::swap(inUse, other.inUse); } - unsigned int getSize() + unsigned int getSize() const { return unitSize ? allocatedSize / unitSize : 0; } diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 53adcc013..add051254 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -1,8 +1,11 @@ #include +#ifdef __SSE2__ +#include +#endif + #include "clutstore.h" -#include "image16.h" #include "imagefloat.h" #include "stdimagesource.h" #include "../rtgui/options.h" @@ -10,24 +13,23 @@ namespace { -std::unique_ptr loadFile( +bool loadFile( const Glib::ustring& filename, const Glib::ustring& working_color_space, + AlignedBuffer& clut_image, unsigned int& clut_level ) { - std::unique_ptr result; - rtengine::StdImageSource img_src; if (!Glib::file_test(filename, Glib::FILE_TEST_EXISTS) || img_src.load(filename)) { - return result; + return false; } int fw, fh; img_src.getFullSize(fw, fh, TR_NONE); - bool valid = false; + bool res = false; if (fw == fh) { unsigned int level = 1; @@ -36,11 +38,11 @@ std::unique_ptr loadFile( } if (level * level * level == fw && level > 1) { clut_level = level; - valid = true; + res = true; } } - if (valid) { + if (res) { rtengine::ColorTemp curr_wb = img_src.getWB(); std::unique_ptr img_float = std::unique_ptr(new rtengine::Imagefloat(fw, fh)); const PreviewProps pp(0, 0, fw, fh, 1); @@ -54,20 +56,39 @@ std::unique_ptr loadFile( img_src.convertColorSpace(img_float.get(), icm, curr_wb); } - result = std::unique_ptr(img_float->to16()); + AlignedBuffer image(fw * fh * 4 + 1); + + std::size_t index = 0; + for (int y = 0; y < fh; ++y) { + for (int x = 0; x < fw; ++x) { + image.data[index] = img_float->r(y, x); + ++index; + image.data[index] = img_float->g(y, x); + ++index; + image.data[index] = img_float->b(y, x); + index += 2; + } + } + + clut_image.swap(image); } - return result; + return res; } -inline void posToXy(unsigned int pos, unsigned int width, unsigned int (&x)[2], unsigned int (&y)[2]) +inline void posToIndex(unsigned int pos, size_t (&index)[2]) { - x[0] = pos % width; - y[0] = pos / width; - x[1] = (pos + 1) % width; - y[1] = (pos + 1) / width; + index[0] = static_cast(pos) * 4; + index[1] = static_cast(pos + 1) * 4; } +#ifdef __SSE2__ +inline __m128 getClutValue(const AlignedBuffer& clut_image, size_t index) +{ + return _mm_cvtpu16_ps(*reinterpret_cast(clut_image.data + index)); +} +#endif + } void rtengine::CLUT::splitClutFilename( @@ -118,11 +139,10 @@ rtengine::HaldCLUT::~HaldCLUT() bool rtengine::HaldCLUT::load(const Glib::ustring& filename) { - clut_image = loadFile(filename, "", clut_level); - Glib::ustring name, ext; - splitClutFilename(filename, name, ext, clut_profile); + if (loadFile(filename, "", clut_image, clut_level)) { + Glib::ustring name, ext; + splitClutFilename(filename, name, ext, clut_profile); - if (clut_image) { clut_filename = filename; clut_level *= clut_level; flevel_minus_one = static_cast(clut_level - 1) / 65535.0f; @@ -135,7 +155,7 @@ bool rtengine::HaldCLUT::load(const Glib::ustring& filename) rtengine::HaldCLUT::operator bool() const { - return static_cast(clut_image); + return !clut_image.isEmpty(); } Glib::ustring rtengine::HaldCLUT::getFilename() const @@ -148,62 +168,97 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const return clut_profile; } -void rtengine::HaldCLUT::getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const +void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) const { - const unsigned int level = clut_level; // This is important + const unsigned int level = clut_level; // This is important const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one); + const unsigned int level_square = level * level; + + const unsigned int color = red + green * level + blue * level_square; + +#ifndef __SSE2__ r = r * flevel_minus_one - red; g = g * flevel_minus_one - green; b = b * flevel_minus_one - blue; - const unsigned int level_square = level * level; + size_t index[2]; + posToIndex(color, index); - const unsigned int color = red + green * level + blue * level_square; + float tmp1[4] ALIGNED16; + tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - unsigned int x[2]; - unsigned int y[2]; - posToXy(color, clut_image->getWidth(), x, y); + posToIndex(color + level, index); - float tmp1[4] __attribute__((aligned(16))); - tmp1[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; - tmp1[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; - tmp1[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + float tmp2[4] ALIGNED16; + tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - posToXy(color + level, clut_image->getWidth(), x, y); + out_rgbx[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + out_rgbx[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + out_rgbx[2] = tmp1[2] * (1 - g) + tmp2[2] * g; - float tmp2[4] __attribute__((aligned(16))); - tmp2[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; - tmp2[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; - tmp2[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + posToIndex(color + level_square, index); - float out[4] __attribute__((aligned(16))); - out[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - out[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - out[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - posToXy(color + level_square, clut_image->getWidth(), x, y); + posToIndex(color + level + level_square, index); - tmp1[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; - tmp1[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; - tmp1[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - posToXy(color + level + level_square, clut_image->getWidth(), x, y); + tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; - tmp2[0] = clut_image->r(y[0], x[0]) * (1 - r) + clut_image->r(y[1], x[1]) * r; - tmp2[1] = clut_image->g(y[0], x[0]) * (1 - r) + clut_image->g(y[1], x[1]) * r; - tmp2[2] = clut_image->b(y[0], x[0]) * (1 - r) + clut_image->b(y[1], x[1]) * r; + out_rgbx[0] = out_rgbx[0] * (1 - b) + tmp1[0] * b; + out_rgbx[1] = out_rgbx[1] * (1 - b) + tmp1[1] * b; + out_rgbx[2] = out_rgbx[2] * (1 - b) + tmp1[2] * b; +#else + const __m128 v_rgb = _mm_set_ps(0.0f, b, g, r) *_mm_load_ps1(&flevel_minus_one) - _mm_set_ps(0.0f, blue, green, red); - tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + size_t index[2]; + posToIndex(color, index); - out_r = out[0] * (1 - b) + tmp1[0] * b; - out_g = out[1] * (1 - b) + tmp1[1] * b; - out_b = out[2] * (1 - b) + tmp1[2] * b; + const __m128 v_r = _mm_shuffle_ps(v_rgb, v_rgb, 0x00); + + __m128 v_cv0 = getClutValue(clut_image, index[0]); + __m128 v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + + posToIndex(color + level, index); + + v_cv0 = getClutValue(clut_image, index[0]); + __m128 v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + + const __m128 v_g = _mm_shuffle_ps(v_rgb, v_rgb, 0x55); + + __m128 v_out = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + + posToIndex(color + level_square, index); + + v_cv0 = getClutValue(clut_image, index[0]); + v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + + posToIndex(color + level + level_square, index); + + v_cv0 = getClutValue(clut_image, index[0]); + v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + + v_tmp1 = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + + const __m128 v_b = _mm_shuffle_ps(v_rgb, v_rgb, 0xAA); + + _mm_store_ps(out_rgbx, v_b * (v_tmp1 - v_out) + v_out); +#endif } rtengine::CLUTStore& rtengine::CLUTStore::getInstance() diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index b084c67fe..ed3491fbe 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -1,16 +1,16 @@ #pragma once #include +#include #include #include "cache.h" +#include "alignedbuffer.h" namespace rtengine { -class Image16; - class CLUT { public: @@ -24,7 +24,7 @@ public: virtual Glib::ustring getFilename() const = 0; virtual Glib::ustring getProfile() const = 0; - virtual void getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const = 0; + virtual void getRGB(float r, float g, float b, float out_rgbx[4]) const = 0; static void splitClutFilename( const Glib::ustring& filename, @@ -43,15 +43,15 @@ public: bool load(const Glib::ustring& filename); - explicit operator bool() const; + explicit operator bool() const override; - Glib::ustring getFilename() const; - Glib::ustring getProfile() const; + Glib::ustring getFilename() const override; + Glib::ustring getProfile() const override; - void getRGB(float r, float g, float b, float& out_r, float& out_g, float& out_b) const; + void getRGB(float r, float g, float b, float out_rgbx[4]) const override; private: - std::unique_ptr clut_image; + AlignedBuffer clut_image; unsigned int clut_level; float flevel_minus_one; float flevel_minus_two; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index d61c79513..2e15da916 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -4355,12 +4355,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceG = CLIP( Color::gamma_srgb( sourceG ) ); sourceB = CLIP( Color::gamma_srgb( sourceB ) ); - float r, g, b; - colorLUT->getRGB( sourceR, sourceG, sourceB, r, g, b ); + float out_rgbx[4] ALIGNED16; + colorLUT->getRGB( sourceR, sourceG, sourceB, out_rgbx ); // apply strength - sourceR = r * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; - sourceG = g * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; - sourceB = b * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; + sourceR = out_rgbx[0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; + sourceG = out_rgbx[1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; + sourceB = out_rgbx[2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; // apply inverse gamma sRGB sourceR = Color::igamma_srgb( sourceR ); sourceG = Color::igamma_srgb( sourceG ); From 56f8ea086cdc95a0d177e6f83391b1ebba165344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sat, 23 Apr 2016 23:02:02 +0200 Subject: [PATCH 04/15] Correct whitespace Last commit messed up some whitespace, this one fixes it. --- rtengine/clutstore.cc | 86 +++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index add051254..9619eff0f 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -170,94 +170,94 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) const { - const unsigned int level = clut_level; // This is important + const unsigned int level = clut_level; // This is important const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one); - const unsigned int level_square = level * level; + const unsigned int level_square = level * level; - const unsigned int color = red + green * level + blue * level_square; + const unsigned int color = red + green * level + blue * level_square; #ifndef __SSE2__ r = r * flevel_minus_one - red; g = g * flevel_minus_one - green; b = b * flevel_minus_one - blue; - size_t index[2]; - posToIndex(color, index); + size_t index[2]; + posToIndex(color, index); - float tmp1[4] ALIGNED16; - tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + float tmp1[4] ALIGNED16; + tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - posToIndex(color + level, index); + posToIndex(color + level, index); - float tmp2[4] ALIGNED16; - tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + float tmp2[4] ALIGNED16; + tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - out_rgbx[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - out_rgbx[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - out_rgbx[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + out_rgbx[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + out_rgbx[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + out_rgbx[2] = tmp1[2] * (1 - g) + tmp2[2] * g; - posToIndex(color + level_square, index); + posToIndex(color + level_square, index); - tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - posToIndex(color + level + level_square, index); + posToIndex(color + level + level_square, index); - tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; + tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; + tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; - tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; + tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; + tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; out_rgbx[0] = out_rgbx[0] * (1 - b) + tmp1[0] * b; out_rgbx[1] = out_rgbx[1] * (1 - b) + tmp1[1] * b; out_rgbx[2] = out_rgbx[2] * (1 - b) + tmp1[2] * b; #else - const __m128 v_rgb = _mm_set_ps(0.0f, b, g, r) *_mm_load_ps1(&flevel_minus_one) - _mm_set_ps(0.0f, blue, green, red); + const __m128 v_rgb = _mm_set_ps(0.0f, b, g, r) *_mm_load_ps1(&flevel_minus_one) - _mm_set_ps(0.0f, blue, green, red); - size_t index[2]; - posToIndex(color, index); + size_t index[2]; + posToIndex(color, index); - const __m128 v_r = _mm_shuffle_ps(v_rgb, v_rgb, 0x00); + const __m128 v_r = _mm_shuffle_ps(v_rgb, v_rgb, 0x00); __m128 v_cv0 = getClutValue(clut_image, index[0]); __m128 v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; - posToIndex(color + level, index); + posToIndex(color + level, index); v_cv0 = getClutValue(clut_image, index[0]); - __m128 v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + __m128 v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; - const __m128 v_g = _mm_shuffle_ps(v_rgb, v_rgb, 0x55); + const __m128 v_g = _mm_shuffle_ps(v_rgb, v_rgb, 0x55); - __m128 v_out = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + __m128 v_out = v_g * (v_tmp2 - v_tmp1) + v_tmp1; - posToIndex(color + level_square, index); + posToIndex(color + level_square, index); v_cv0 = getClutValue(clut_image, index[0]); - v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; - posToIndex(color + level + level_square, index); + posToIndex(color + level + level_square, index); v_cv0 = getClutValue(clut_image, index[0]); - v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; - v_tmp1 = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + v_tmp1 = v_g * (v_tmp2 - v_tmp1) + v_tmp1; - const __m128 v_b = _mm_shuffle_ps(v_rgb, v_rgb, 0xAA); + const __m128 v_b = _mm_shuffle_ps(v_rgb, v_rgb, 0xAA); - _mm_store_ps(out_rgbx, v_b * (v_tmp1 - v_out) + v_out); + _mm_store_ps(out_rgbx, v_b * (v_tmp1 - v_out) + v_out); #endif } From 78c08e9e5c4287e7b00351732a395cbe9d2c6120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Tue, 26 Apr 2016 20:48:11 +0200 Subject: [PATCH 05/15] Add Ingo's optimizations Add Ingo's SSE optimizations and clean up the non-SSE part of `getRGB()` with `intp()`. --- rtengine/clutstore.cc | 105 +++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 9619eff0f..692dcbcb9 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -1,11 +1,9 @@ #include -#ifdef __SSE2__ -#include -#endif - #include "clutstore.h" +#include "opthelper.h" +#include "rt_math.h" #include "imagefloat.h" #include "stdimagesource.h" #include "../rtgui/options.h" @@ -76,16 +74,14 @@ bool loadFile( return res; } -inline void posToIndex(unsigned int pos, size_t (&index)[2]) -{ - index[0] = static_cast(pos) * 4; - index[1] = static_cast(pos + 1) * 4; -} - #ifdef __SSE2__ -inline __m128 getClutValue(const AlignedBuffer& clut_image, size_t index) +inline vfloat getClutValue(const AlignedBuffer& clut_image, size_t index) { +#ifdef __SSE4_1__ + return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(*reinterpret_cast(clut_image.data + index))); +#else return _mm_cvtpu16_ps(*reinterpret_cast(clut_image.data + index)); +#endif } #endif @@ -185,79 +181,74 @@ void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) co g = g * flevel_minus_one - green; b = b * flevel_minus_one - blue; - size_t index[2]; - posToIndex(color, index); + size_t index = color * 4; float tmp1[4] ALIGNED16; - tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); - posToIndex(color + level, index); + index = (color + level) * 4; float tmp2[4] ALIGNED16; - tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); - out_rgbx[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - out_rgbx[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - out_rgbx[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + out_rgbx[0] = intp(g, tmp2[0], tmp1[0]); + out_rgbx[1] = intp(g, tmp2[1], tmp1[1]); + out_rgbx[2] = intp(g, tmp2[2], tmp1[2]); - posToIndex(color + level_square, index); + index = (color + level_square) * 4; - tmp1[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp1[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp1[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); - posToIndex(color + level + level_square, index); + index = (color + level + level_square) * 4; - tmp2[0] = clut_image.data[index[0]] * (1 - r) + clut_image.data[index[1]] * r; - tmp2[1] = clut_image.data[index[0] + 1] * (1 - r) + clut_image.data[index[1] + 1] * r; - tmp2[2] = clut_image.data[index[0] + 2] * (1 - r) + clut_image.data[index[1] + 2] * r; + tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); - tmp1[0] = tmp1[0] * (1 - g) + tmp2[0] * g; - tmp1[1] = tmp1[1] * (1 - g) + tmp2[1] * g; - tmp1[2] = tmp1[2] * (1 - g) + tmp2[2] * g; + tmp1[0] = intp(g, tmp2[0], tmp1[0]); + tmp1[1] = intp(g, tmp2[1], tmp1[1]); + tmp1[2] = intp(g, tmp2[2], tmp1[2]); - out_rgbx[0] = out_rgbx[0] * (1 - b) + tmp1[0] * b; - out_rgbx[1] = out_rgbx[1] * (1 - b) + tmp1[1] * b; - out_rgbx[2] = out_rgbx[2] * (1 - b) + tmp1[2] * b; + out_rgbx[0] = intp(b, tmp1[0], out_rgbx[0]); + out_rgbx[1] = intp(b, tmp1[1], out_rgbx[1]); + out_rgbx[2] = intp(b, tmp1[2], out_rgbx[2]); #else - const __m128 v_rgb = _mm_set_ps(0.0f, b, g, r) *_mm_load_ps1(&flevel_minus_one) - _mm_set_ps(0.0f, blue, green, red); + const vfloat v_tmp = _mm_set_ps(0.0f, b, g, r) * _mm_load_ps1(&flevel_minus_one); + const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); - size_t index[2]; - posToIndex(color, index); + size_t index = color * 4; - const __m128 v_r = _mm_shuffle_ps(v_rgb, v_rgb, 0x00); + const vfloat v_r = PERMUTEPS(v_rgb, 0x00); - __m128 v_cv0 = getClutValue(clut_image, index[0]); - __m128 v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - posToIndex(color + level, index); + index = (color + level) * 4; - v_cv0 = getClutValue(clut_image, index[0]); - __m128 v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - const __m128 v_g = _mm_shuffle_ps(v_rgb, v_rgb, 0x55); + const vfloat v_g = PERMUTEPS(v_rgb, 0x55); - __m128 v_out = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); - posToIndex(color + level_square, index); + index = (color + level_square) * 4; - v_cv0 = getClutValue(clut_image, index[0]); - v_tmp1 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - posToIndex(color + level + level_square, index); + index = (color + level + level_square) * 4; - v_cv0 = getClutValue(clut_image, index[0]); - v_tmp2 = v_r * (getClutValue(clut_image, index[1]) - v_cv0) + v_cv0; + v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - v_tmp1 = v_g * (v_tmp2 - v_tmp1) + v_tmp1; + v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); - const __m128 v_b = _mm_shuffle_ps(v_rgb, v_rgb, 0xAA); + const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); - _mm_store_ps(out_rgbx, v_b * (v_tmp1 - v_out) + v_out); + _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); #endif } From bf499055e182563f36e26ac3b309a8ee260e1b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Tue, 26 Apr 2016 21:57:58 +0200 Subject: [PATCH 06/15] Apply `HaldCLUT::getRGB()` per tile line `getRGB()` now takes a whole tile line instead of a single pixel. --- rtengine/clutstore.cc | 106 +++++++++++++++++++++--------------------- rtengine/clutstore.h | 4 +- rtengine/improcfun.cc | 26 ++++++++--- 3 files changed, 75 insertions(+), 61 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 692dcbcb9..c9d4645a2 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -164,92 +164,94 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const return clut_profile; } -void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) const +void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const { const unsigned int level = clut_level; // This is important - const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); - const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); - const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one); - const unsigned int level_square = level * level; - const unsigned int color = red + green * level + blue * level_square; + for (std::size_t column = 0; column < line_size; ++column, ++r, ++g, ++b, out_rgbx += 4) { + const unsigned int red = std::min(flevel_minus_two, *r * flevel_minus_one); + const unsigned int green = std::min(flevel_minus_two, *g * flevel_minus_one); + const unsigned int blue = std::min(flevel_minus_two, *b * flevel_minus_one); + + const unsigned int color = red + green * level + blue * level_square; #ifndef __SSE2__ - r = r * flevel_minus_one - red; - g = g * flevel_minus_one - green; - b = b * flevel_minus_one - blue; + const float re = *r * flevel_minus_one - red; + const float gr = *g * flevel_minus_one - green; + const float bl = *b * flevel_minus_one - blue; - size_t index = color * 4; + size_t index = color * 4; - float tmp1[4] ALIGNED16; - tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + float tmp1[4] ALIGNED16; + tmp1[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - index = (color + level) * 4; + index = (color + level) * 4; - float tmp2[4] ALIGNED16; - tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + float tmp2[4] ALIGNED16; + tmp2[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - out_rgbx[0] = intp(g, tmp2[0], tmp1[0]); - out_rgbx[1] = intp(g, tmp2[1], tmp1[1]); - out_rgbx[2] = intp(g, tmp2[2], tmp1[2]); + out_rgbx[0] = intp(gr, tmp2[0], tmp1[0]); + out_rgbx[1] = intp(gr, tmp2[1], tmp1[1]); + out_rgbx[2] = intp(gr, tmp2[2], tmp1[2]); - index = (color + level_square) * 4; + index = (color + level_square) * 4; - tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + tmp1[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - index = (color + level + level_square) * 4; + index = (color + level + level_square) * 4; - tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + tmp2[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - tmp1[0] = intp(g, tmp2[0], tmp1[0]); - tmp1[1] = intp(g, tmp2[1], tmp1[1]); - tmp1[2] = intp(g, tmp2[2], tmp1[2]); + tmp1[0] = intp(gr, tmp2[0], tmp1[0]); + tmp1[1] = intp(gr, tmp2[1], tmp1[1]); + tmp1[2] = intp(gr, tmp2[2], tmp1[2]); - out_rgbx[0] = intp(b, tmp1[0], out_rgbx[0]); - out_rgbx[1] = intp(b, tmp1[1], out_rgbx[1]); - out_rgbx[2] = intp(b, tmp1[2], out_rgbx[2]); + out_rgbx[0] = intp(bl, tmp1[0], out_rgbx[0]); + out_rgbx[1] = intp(bl, tmp1[1], out_rgbx[1]); + out_rgbx[2] = intp(bl, tmp1[2], out_rgbx[2]); #else - const vfloat v_tmp = _mm_set_ps(0.0f, b, g, r) * _mm_load_ps1(&flevel_minus_one); - const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); + const vfloat v_tmp = _mm_set_ps(0.0f, *b, *g, *r) * _mm_load_ps1(&flevel_minus_one); + const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); - size_t index = color * 4; + size_t index = color * 4; - const vfloat v_r = PERMUTEPS(v_rgb, 0x00); + const vfloat v_r = PERMUTEPS(v_rgb, 0x00); - vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - index = (color + level) * 4; + index = (color + level) * 4; - vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - const vfloat v_g = PERMUTEPS(v_rgb, 0x55); + const vfloat v_g = PERMUTEPS(v_rgb, 0x55); - vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); + vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); - index = (color + level_square) * 4; + index = (color + level_square) * 4; - v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - index = (color + level + level_square) * 4; + index = (color + level + level_square) * 4; - v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); + v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); - const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); + const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); - _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); + _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); #endif + } } rtengine::CLUTStore& rtengine::CLUTStore::getInstance() diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index ed3491fbe..6203e4e61 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -24,7 +24,7 @@ public: virtual Glib::ustring getFilename() const = 0; virtual Glib::ustring getProfile() const = 0; - virtual void getRGB(float r, float g, float b, float out_rgbx[4]) const = 0; + virtual void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const = 0; static void splitClutFilename( const Glib::ustring& filename, @@ -48,7 +48,7 @@ public: Glib::ustring getFilename() const override; Glib::ustring getProfile() const override; - void getRGB(float r, float g, float b, float out_rgbx[4]) const override; + void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const override; private: AlignedBuffer clut_image; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 2e15da916..d274806bb 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -16,6 +16,7 @@ * You should have received a copy of the GNU General Public License * along with RawTherapee. If not, see . */ +#include #include #include #include @@ -3224,8 +3225,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - double filmSimCorrectedStrength = double(params->filmSimulation.strength) / 100.; - double filmSimSourceStrength = double(100 - params->filmSimulation.strength) / 100.; + float filmSimCorrectedStrength = static_cast(params->filmSimulation.strength) / 100.0f; + float filmSimSourceStrength = 1.0f - filmSimCorrectedStrength; const float exp_scale = pow (2.0, expcomp); const float comp = (max(0.0, expcomp) + 1.0) * hlcompr / 100.0; @@ -4354,13 +4355,24 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceR = CLIP( Color::gamma_srgb( sourceR ) ); sourceG = CLIP( Color::gamma_srgb( sourceG ) ); sourceB = CLIP( Color::gamma_srgb( sourceB ) ); + } + + const std::size_t line_size = std::min(TS, tW - jstart); + std::size_t out_rgbx_size = 4 * (line_size + 16); + std::unique_ptr out_rgbx_buf(new float[out_rgbx_size]); + void* out_rgbx_ptr = out_rgbx_buf.get(); + float* const out_rgbx = reinterpret_cast(std::align(16, 4 * line_size, out_rgbx_ptr, out_rgbx_size)); + colorLUT->getRGB(line_size, rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx); + + for (int j = jstart, tj = 0; j < tW; j++, tj++) { + float &sourceR = rtemp[ti * TS + tj]; + float &sourceG = gtemp[ti * TS + tj]; + float &sourceB = btemp[ti * TS + tj]; - float out_rgbx[4] ALIGNED16; - colorLUT->getRGB( sourceR, sourceG, sourceB, out_rgbx ); // apply strength - sourceR = out_rgbx[0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; - sourceG = out_rgbx[1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; - sourceB = out_rgbx[2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; + sourceR = out_rgbx[tj * 4 + 0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; + sourceG = out_rgbx[tj * 4 + 1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; + sourceB = out_rgbx[tj * 4 + 2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; // apply inverse gamma sRGB sourceR = Color::igamma_srgb( sourceR ); sourceG = Color::igamma_srgb( sourceG ); From 9dee6dddf1a114f32fccead94e851205ba110077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Tue, 26 Apr 2016 22:16:23 +0200 Subject: [PATCH 07/15] Hoist `out_rgbx` allocation out of the loops --- rtengine/improcfun.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index d274806bb..2fd2bfb17 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -4338,6 +4338,11 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer //Film Simulations if ( colorLUT ) { + std::size_t out_rgbx_size = 4 * (TS + 16); + std::unique_ptr out_rgbx_buf(new float[out_rgbx_size]); + void* out_rgbx_ptr = out_rgbx_buf.get(); + float* const out_rgbx = reinterpret_cast(std::align(16, 4 * TS, out_rgbx_ptr, out_rgbx_size)); + for (int i = istart, ti = 0; i < tH; i++, ti++) { for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; @@ -4357,12 +4362,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceB = CLIP( Color::gamma_srgb( sourceB ) ); } - const std::size_t line_size = std::min(TS, tW - jstart); - std::size_t out_rgbx_size = 4 * (line_size + 16); - std::unique_ptr out_rgbx_buf(new float[out_rgbx_size]); - void* out_rgbx_ptr = out_rgbx_buf.get(); - float* const out_rgbx = reinterpret_cast(std::align(16, 4 * line_size, out_rgbx_ptr, out_rgbx_size)); - colorLUT->getRGB(line_size, rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx); + colorLUT->getRGB(std::min(TS, tW - jstart), rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx); for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; From 29fe23e517fa8dde0a31f33de9a00f0d5e286085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Fri, 29 Apr 2016 17:26:56 +0200 Subject: [PATCH 08/15] Move `film_simulation_strength` calculation into `HaldCLUT::getRGB()` - Moved `film_simulation_strength` calculation into `HaldCLUT::getRGB()` - Removed unneeded base class `CLUT` - Used `_MM_SHUFFLE` --- rtengine/clutstore.cc | 103 ++++++++++++++++++++++------------------ rtengine/clutstore.h | 49 ++++++++----------- rtengine/improcfun.cc | 43 ++++++++--------- rtgui/filmsimulation.cc | 4 +- 4 files changed, 99 insertions(+), 100 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index c9d4645a2..b5f07c121 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -87,40 +87,6 @@ inline vfloat getClutValue(const AlignedBuffer& clut_image, size_ } -void rtengine::CLUT::splitClutFilename( - const Glib::ustring& filename, - Glib::ustring& name, - Glib::ustring& extension, - Glib::ustring& profile_name -) -{ - Glib::ustring basename = Glib::path_get_basename(filename); - - Glib::ustring::size_type last_slash_pos = basename.rfind('/'); - if (last_slash_pos == Glib::ustring::npos) { - last_slash_pos = basename.rfind('\\'); - } - - const Glib::ustring::size_type last_dot_pos = basename.rfind('.'); - - if (last_dot_pos != Glib::ustring::npos) { - name.assign(basename, 0, last_dot_pos); - extension.assign(basename, last_dot_pos + 1, Glib::ustring::npos); - } else { - name = basename; - } - - profile_name = "sRGB"; - - for (const auto& working_profile : rtengine::getWorkingProfiles()) { - if ( std::search( name.rbegin(), name.rend(), working_profile.rbegin(), working_profile.rend() ) == name.rbegin() ) { - profile_name = working_profile; - name.erase(name.size() - working_profile.size()); - break; - } - } -} - rtengine::HaldCLUT::HaldCLUT() : clut_level(0), flevel_minus_one(0.0f), @@ -164,7 +130,14 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const return clut_profile; } -void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const +void rtengine::HaldCLUT::getRGB( + float strength, + std::size_t line_size, + const float* r, + const float* g, + const float* b, + float* out_rgbx +) const { const unsigned int level = clut_level; // This is important @@ -219,13 +192,18 @@ void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const flo out_rgbx[0] = intp(bl, tmp1[0], out_rgbx[0]); out_rgbx[1] = intp(bl, tmp1[1], out_rgbx[1]); out_rgbx[2] = intp(bl, tmp1[2], out_rgbx[2]); + + out_rgbx[0] = intp(strength, out_rgbx[0], *r); + out_rgbx[1] = intp(strength, out_rgbx[1], *g); + out_rgbx[2] = intp(strength, out_rgbx[2], *b); #else - const vfloat v_tmp = _mm_set_ps(0.0f, *b, *g, *r) * _mm_load_ps1(&flevel_minus_one); + const vfloat v_in = _mm_set_ps(0.0f, *b, *g, *r); + const vfloat v_tmp = v_in * _mm_load_ps1(&flevel_minus_one); const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); size_t index = color * 4; - const vfloat v_r = PERMUTEPS(v_rgb, 0x00); + const vfloat v_r = PERMUTEPS(v_rgb, _MM_SHUFFLE(0, 0, 0, 0)); vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); @@ -233,7 +211,7 @@ void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const flo vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - const vfloat v_g = PERMUTEPS(v_rgb, 0x55); + const vfloat v_g = PERMUTEPS(v_rgb, _MM_SHUFFLE(1, 1, 1, 1)); vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); @@ -247,22 +225,58 @@ void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const flo v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); - const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); + const vfloat v_b = PERMUTEPS(v_rgb, _MM_SHUFFLE(2, 2, 2, 2)); - _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); + v_out = vintpf(v_b, v_tmp1, v_out); + + _mm_store_ps(out_rgbx, vintpf(_mm_load_ps1(&strength), v_out, v_in)); #endif } } +void rtengine::HaldCLUT::splitClutFilename( + const Glib::ustring& filename, + Glib::ustring& name, + Glib::ustring& extension, + Glib::ustring& profile_name +) +{ + Glib::ustring basename = Glib::path_get_basename(filename); + + Glib::ustring::size_type last_slash_pos = basename.rfind('/'); + if (last_slash_pos == Glib::ustring::npos) { + last_slash_pos = basename.rfind('\\'); + } + + const Glib::ustring::size_type last_dot_pos = basename.rfind('.'); + + if (last_dot_pos != Glib::ustring::npos) { + name.assign(basename, 0, last_dot_pos); + extension.assign(basename, last_dot_pos + 1, Glib::ustring::npos); + } else { + name = basename; + } + + profile_name = "sRGB"; + + for (const auto& working_profile : rtengine::getWorkingProfiles()) { + if ( std::search( name.rbegin(), name.rend(), working_profile.rbegin(), working_profile.rend() ) == name.rbegin() ) { + profile_name = working_profile; + name.erase(name.size() - working_profile.size()); + break; + } + } +} + rtengine::CLUTStore& rtengine::CLUTStore::getInstance() { static CLUTStore instance; return instance; } -std::shared_ptr rtengine::CLUTStore::getClut(const Glib::ustring& filename) +std::shared_ptr rtengine::CLUTStore::getClut(const Glib::ustring& filename) { - std::shared_ptr result; + std::shared_ptr result; if (!cache.get(filename, result)) { std::unique_ptr clut(new rtengine::HaldCLUT); @@ -275,11 +289,6 @@ std::shared_ptr rtengine::CLUTStore::getClut(const Glib::ustring return result; } -void rtengine::CLUTStore::releaseClut(const std::shared_ptr& clut) -{ - cache.remove(clut->getFilename()); -} - void rtengine::CLUTStore::clearCache() { cache.clear(); diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index 6203e4e61..7383b597f 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -11,20 +11,29 @@ namespace rtengine { -class CLUT +class HaldCLUT { public: - CLUT() = default; - CLUT(const CLUT& other) = delete; - CLUT& operator =(const CLUT& other) = delete; - virtual ~CLUT() = default; + HaldCLUT(); + HaldCLUT(const HaldCLUT& other) = delete; + HaldCLUT& operator =(const HaldCLUT& other) = delete; + ~HaldCLUT(); - virtual explicit operator bool() const = 0; + bool load(const Glib::ustring& filename); - virtual Glib::ustring getFilename() const = 0; - virtual Glib::ustring getProfile() const = 0; + explicit operator bool() const; - virtual void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const = 0; + Glib::ustring getFilename() const; + Glib::ustring getProfile() const; + + void getRGB( + float strength, + std::size_t line_size, + const float* r, + const float* g, + const float* b, + float* out_rgbx + ) const; static void splitClutFilename( const Glib::ustring& filename, @@ -32,23 +41,6 @@ public: Glib::ustring& extension, Glib::ustring& profile_name ); -}; - -class HaldCLUT - : public CLUT -{ -public: - HaldCLUT(); - ~HaldCLUT(); - - bool load(const Glib::ustring& filename); - - explicit operator bool() const override; - - Glib::ustring getFilename() const override; - Glib::ustring getProfile() const override; - - void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const override; private: AlignedBuffer clut_image; @@ -67,15 +59,14 @@ public: CLUTStore(const CLUTStore& other) = delete; CLUTStore& operator =(const CLUTStore& other) = delete; - std::shared_ptr getClut(const Glib::ustring& filename); - void releaseClut(const std::shared_ptr& clut); + std::shared_ptr getClut(const Glib::ustring& filename); void clearCache(); private: CLUTStore(); - Cache> cache; + Cache> cache; }; } diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 2fd2bfb17..4bc95e5fa 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -16,7 +16,6 @@ * You should have received a copy of the GNU General Public License * along with RawTherapee. If not, see . */ -#include #include #include #include @@ -3206,27 +3205,26 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - std::shared_ptr colorLUT; + std::shared_ptr hald_clut; bool clutAndWorkingProfilesAreSame = false; TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work; if ( params->filmSimulation.enabled && !params->filmSimulation.clutFilename.empty() ) { - colorLUT = CLUTStore::getInstance().getClut( params->filmSimulation.clutFilename ); + hald_clut = CLUTStore::getInstance().getClut( params->filmSimulation.clutFilename ); - if ( colorLUT ) { - clutAndWorkingProfilesAreSame = colorLUT->getProfile() == params->icm.working; + if ( hald_clut ) { + clutAndWorkingProfilesAreSame = hald_clut->getProfile() == params->icm.working; if ( !clutAndWorkingProfilesAreSame ) { work2xyz = iccStore->workingSpaceMatrix( params->icm.working ); - xyz2clut = iccStore->workingSpaceInverseMatrix( colorLUT->getProfile() ); + xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() ); xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working ); - clut2xyz = iccStore->workingSpaceMatrix( colorLUT->getProfile() ); + clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() ); } } } - float filmSimCorrectedStrength = static_cast(params->filmSimulation.strength) / 100.0f; - float filmSimSourceStrength = 1.0f - filmSimCorrectedStrength; + const float film_simulation_strength = static_cast(params->filmSimulation.strength) / 100.0f; const float exp_scale = pow (2.0, expcomp); const float comp = (max(0.0, expcomp) + 1.0) * hlcompr / 100.0; @@ -4337,11 +4335,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer //Film Simulations - if ( colorLUT ) { - std::size_t out_rgbx_size = 4 * (TS + 16); - std::unique_ptr out_rgbx_buf(new float[out_rgbx_size]); - void* out_rgbx_ptr = out_rgbx_buf.get(); - float* const out_rgbx = reinterpret_cast(std::align(16, 4 * TS, out_rgbx_ptr, out_rgbx_size)); + if ( hald_clut ) { + float out_rgbx[4 * TS] ALIGNED16; for (int i = istart, ti = 0; i < tH; i++, ti++) { for (int j = jstart, tj = 0; j < tW; j++, tj++) { @@ -4362,21 +4357,25 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceB = CLIP( Color::gamma_srgb( sourceB ) ); } - colorLUT->getRGB(std::min(TS, tW - jstart), rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx); + const std::size_t line_offset = ti * TS; + hald_clut->getRGB( + film_simulation_strength, + std::min(TS, tW - jstart), + rtemp + line_offset, + gtemp + line_offset, + btemp + line_offset, + out_rgbx + ); for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - // apply strength - sourceR = out_rgbx[tj * 4 + 0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; - sourceG = out_rgbx[tj * 4 + 1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; - sourceB = out_rgbx[tj * 4 + 2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; // apply inverse gamma sRGB - sourceR = Color::igamma_srgb( sourceR ); - sourceG = Color::igamma_srgb( sourceG ); - sourceB = Color::igamma_srgb( sourceB ); + sourceR = Color::igamma_srgb(out_rgbx[tj * 4 + 0]); + sourceG = Color::igamma_srgb(out_rgbx[tj * 4 + 1]); + sourceB = Color::igamma_srgb(out_rgbx[tj * 4 + 2]); if (!clutAndWorkingProfilesAreSame) { //convert from clut to working profile diff --git a/rtgui/filmsimulation.cc b/rtgui/filmsimulation.cc index 3ee1f4742..f916a5397 100644 --- a/rtgui/filmsimulation.cc +++ b/rtgui/filmsimulation.cc @@ -72,7 +72,7 @@ void FilmSimulation::onClutSelected() if ( getEnabled() && !currentClutFilename.empty() && listener && currentClutFilename != m_oldClutFilename ) { Glib::ustring clutName, dummy; - CLUT::splitClutFilename( currentClutFilename, clutName, dummy, dummy ); + HaldCLUT::splitClutFilename( currentClutFilename, clutName, dummy, dummy ); listener->panelChanged( EvFilmSimulationFilename, clutName ); m_oldClutFilename = currentClutFilename; @@ -279,7 +279,7 @@ int ClutComboBox::parseDir (const Glib::ustring& path) for (const auto& entry : entries) { Glib::ustring name, extension, profileName; - CLUT::splitClutFilename (entry, name, extension, profileName); + HaldCLUT::splitClutFilename (entry, name, extension, profileName); extension = extension.casefold (); if (extension.compare ("tif") != 0 && extension.compare ("png") != 0) { From beaea22779abe3732a7cee4dfa2a4ef4ad1785e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Fri, 29 Apr 2016 20:35:37 +0200 Subject: [PATCH 09/15] Vectorize color space conversion for HaldCLUT Vectorize color space conversion for HaldCLUT depending on the definition of `VECTLENSP`. It's not fully AVX compatible because `F2V`, `LVF`, and `STVF` are SSE only. --- rtengine/color.cc | 9 ++++ rtengine/color.h | 4 +- rtengine/improcfun.cc | 99 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 98 insertions(+), 14 deletions(-) diff --git a/rtengine/color.cc b/rtengine/color.cc index f5a8c86a3..dc0710a2f 100644 --- a/rtengine/color.cc +++ b/rtengine/color.cc @@ -819,6 +819,15 @@ void Color::rgbxyz (float r, float g, float b, float &x, float &y, float &z, con z = ((xyz_rgb[2][0] * r + xyz_rgb[2][1] * g + xyz_rgb[2][2] * b)) ; } +#ifdef __SSE2__ +void Color::rgbxyz (vfloat r, vfloat g, vfloat b, vfloat &x, vfloat &y, vfloat &z, const vfloat xyz_rgb[3][3]) +{ + x = ((xyz_rgb[0][0] * r + xyz_rgb[0][1] * g + xyz_rgb[0][2] * b)) ; + y = ((xyz_rgb[1][0] * r + xyz_rgb[1][1] * g + xyz_rgb[1][2] * b)) ; + z = ((xyz_rgb[2][0] * r + xyz_rgb[2][1] * g + xyz_rgb[2][2] * b)) ; +} +#endif + void Color::xyz2rgb (float x, float y, float z, float &r, float &g, float &b, const double rgb_xyz[3][3]) { //Transform to output color. Standard sRGB is D65, but internal representation is D50 diff --git a/rtengine/color.h b/rtengine/color.h index 3f78692d8..9ff00034c 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -325,7 +325,9 @@ public: */ static void rgbxyz (float r, float g, float b, float &x, float &y, float &z, const double xyz_rgb[3][3]); static void rgbxyz (float r, float g, float b, float &x, float &y, float &z, const float xyz_rgb[3][3]); - +#ifdef __SSE2__ + static void rgbxyz (vfloat r, vfloat g, vfloat b, vfloat &x, vfloat &y, vfloat &z, const vfloat xyz_rgb[3][3]); +#endif /** * @brief Convert Lab in xyz diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 4bc95e5fa..c1de37474 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -3208,6 +3208,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer std::shared_ptr hald_clut; bool clutAndWorkingProfilesAreSame = false; TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work; +#ifdef VECTLENSP + vfloat v_work2xyz[3][3]; + vfloat v_xyz2clut[3][3]; + vfloat v_clut2xyz[3][3]; + vfloat v_xyz2work[3][3]; +#endif if ( params->filmSimulation.enabled && !params->filmSimulation.clutFilename.empty() ) { hald_clut = CLUTStore::getInstance().getClut( params->filmSimulation.clutFilename ); @@ -3220,6 +3226,16 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() ); xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working ); clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() ); +#ifdef VECTLENSP + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + v_work2xyz[i][j] = F2V(work2xyz[i][j]); + v_xyz2clut[i][j] = F2V(xyz2clut[i][j]); + v_xyz2work[i][j] = F2V(xyz2work[i][j]); + v_clut2xyz[i][j] = F2V(clut2xyz[i][j]); + } + } +#endif } } } @@ -4335,22 +4351,51 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer //Film Simulations - if ( hald_clut ) { + if (hald_clut) { float out_rgbx[4 * TS] ALIGNED16; for (int i = istart, ti = 0; i < tH; i++, ti++) { + if (!clutAndWorkingProfilesAreSame) { +#ifdef VECTLENSP + if (!(std::min(TS, tW - jstart) & ~(VECTLENSP - 1))) { + for (int j = jstart, tj = 0; j < tW; j += VECTLENSP, tj += VECTLENSP) { + vfloat sourceR = LVF(rtemp[ti * TS + tj]); + vfloat sourceG = LVF(gtemp[ti * TS + tj]); + vfloat sourceB = LVF(btemp[ti * TS + tj]); + + //convert from working to clut profile + vfloat x; + vfloat y; + vfloat z; + Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, v_work2xyz ); + Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, v_xyz2clut ); + + STVF(rtemp[ti * TS + tj], sourceR); + STVF(gtemp[ti * TS + tj], sourceG); + STVF(btemp[ti * TS + tj], sourceB); + } + } + else +#endif + { + for (int j = jstart, tj = 0; j < tW; j++, tj++) { + float &sourceR = rtemp[ti * TS + tj]; + float &sourceG = gtemp[ti * TS + tj]; + float &sourceB = btemp[ti * TS + tj]; + + //convert from working to clut profile + float x, y, z; + Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, work2xyz ); + Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2clut ); + } + } + } + for (int j = jstart, tj = 0; j < tW; j++, tj++) { float &sourceR = rtemp[ti * TS + tj]; float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - if (!clutAndWorkingProfilesAreSame) { - //convert from working to clut profile - float x, y, z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, work2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2clut ); - } - //appply gamma sRGB (default RT) sourceR = CLIP( Color::gamma_srgb( sourceR ) ); sourceG = CLIP( Color::gamma_srgb( sourceG ) ); @@ -4376,14 +4421,42 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceR = Color::igamma_srgb(out_rgbx[tj * 4 + 0]); sourceG = Color::igamma_srgb(out_rgbx[tj * 4 + 1]); sourceB = Color::igamma_srgb(out_rgbx[tj * 4 + 2]); + } - if (!clutAndWorkingProfilesAreSame) { - //convert from clut to working profile - float x, y, z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, clut2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2work ); + if (!clutAndWorkingProfilesAreSame) { +#ifdef VECTLENSP + if (!(std::min(TS, tW - jstart) & ~(VECTLENSP - 1))) { + for (int j = jstart, tj = 0; j < tW; j += VECTLENSP, tj += VECTLENSP) { + vfloat sourceR = LVF(rtemp[ti * TS + tj]); + vfloat sourceG = LVF(gtemp[ti * TS + tj]); + vfloat sourceB = LVF(btemp[ti * TS + tj]); + + //convert from clut to working profile + vfloat x; + vfloat y; + vfloat z; + Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, v_clut2xyz ); + Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, v_xyz2work ); + + STVF(rtemp[ti * TS + tj], sourceR); + STVF(gtemp[ti * TS + tj], sourceG); + STVF(btemp[ti * TS + tj], sourceB); + } } + else +#endif + { + for (int j = jstart, tj = 0; j < tW; j++, tj++) { + float &sourceR = rtemp[ti * TS + tj]; + float &sourceG = gtemp[ti * TS + tj]; + float &sourceB = btemp[ti * TS + tj]; + //convert from clut to working profile + float x, y, z; + Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, clut2xyz ); + Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2work ); + } + } } } } From eceb024ba88fa5303071b799ff05ca2669b5d238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sun, 1 May 2016 10:41:21 +0200 Subject: [PATCH 10/15] Add Ingo's `gamma_srgbclipped` patch --- rtengine/color.h | 4 ++++ rtengine/improcfun.cc | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/rtengine/color.h b/rtengine/color.h index 9ff00034c..be7740e2a 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -1079,6 +1079,10 @@ public: { return gammatab_srgb[x]; } + static inline float gamma_srgbclipped (float x) + { + return gamma2curve[x]; + } static inline float gamma (float x) { return gammatab[x]; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index c1de37474..4660ed6cb 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -19,6 +19,9 @@ #include #include #include +#ifdef _OPENMP +#include +#endif #include "rtengine.h" #include "improcfun.h" @@ -38,9 +41,6 @@ #include "clutstore.h" #include "ciecam02.h" -#ifdef _OPENMP -#include -#endif #undef CLIPD #define CLIPD(a) ((a)>0.0f?((a)<1.0f?(a):1.0f):0.0f) @@ -4396,10 +4396,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - //appply gamma sRGB (default RT) - sourceR = CLIP( Color::gamma_srgb( sourceR ) ); - sourceG = CLIP( Color::gamma_srgb( sourceG ) ); - sourceB = CLIP( Color::gamma_srgb( sourceB ) ); + //apply gamma sRGB (default RT) + sourceR = Color::gamma_srgbclipped( sourceR ); + sourceG = Color::gamma_srgbclipped( sourceG ); + sourceB = Color::gamma_srgbclipped( sourceB ); } const std::size_t line_offset = ti * TS; From 39f4db609a7c589043d84ef7e9a3ce13eb8aa972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sun, 1 May 2016 11:10:11 +0200 Subject: [PATCH 11/15] Final code cleanup - Corrected whitespace and comments - Replaced `VECTLENSP` with `__SSE2__` and `4` - Removed redundant `inline` (see: http://programmers.stackexchange.com/a/35436 and http://stackoverflow.com/a/5971755) --- rtengine/clutstore.cc | 4 ++-- rtengine/improcfun.cc | 52 +++++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index b5f07c121..5e96046ae 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -75,7 +75,7 @@ bool loadFile( } #ifdef __SSE2__ -inline vfloat getClutValue(const AlignedBuffer& clut_image, size_t index) +vfloat getClutValue(const AlignedBuffer& clut_image, size_t index) { #ifdef __SSE4_1__ return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(*reinterpret_cast(clut_image.data + index))); @@ -260,7 +260,7 @@ void rtengine::HaldCLUT::splitClutFilename( profile_name = "sRGB"; for (const auto& working_profile : rtengine::getWorkingProfiles()) { - if ( std::search( name.rbegin(), name.rend(), working_profile.rbegin(), working_profile.rend() ) == name.rbegin() ) { + if (std::search(name.rbegin(), name.rend(), working_profile.rbegin(), working_profile.rend()) == name.rbegin()) { profile_name = working_profile; name.erase(name.size() - working_profile.size()); break; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 4660ed6cb..c1079c8b1 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -3208,7 +3208,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer std::shared_ptr hald_clut; bool clutAndWorkingProfilesAreSame = false; TMatrix work2xyz, xyz2clut, clut2xyz, xyz2work; -#ifdef VECTLENSP +#ifdef __SSE2__ vfloat v_work2xyz[3][3]; vfloat v_xyz2clut[3][3]; vfloat v_clut2xyz[3][3]; @@ -3226,7 +3226,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() ); xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working ); clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() ); -#ifdef VECTLENSP +#ifdef __SSE2__ for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { v_work2xyz[i][j] = F2V(work2xyz[i][j]); @@ -4350,25 +4350,25 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } - //Film Simulations + // Film Simulations if (hald_clut) { float out_rgbx[4 * TS] ALIGNED16; for (int i = istart, ti = 0; i < tH; i++, ti++) { if (!clutAndWorkingProfilesAreSame) { -#ifdef VECTLENSP - if (!(std::min(TS, tW - jstart) & ~(VECTLENSP - 1))) { - for (int j = jstart, tj = 0; j < tW; j += VECTLENSP, tj += VECTLENSP) { + // Convert from working to clut profile +#ifdef __SSE2__ + if (!(std::min(TS, tW - jstart) & ~3)) { + for (int j = jstart, tj = 0; j < tW; j += 4, tj += 4) { vfloat sourceR = LVF(rtemp[ti * TS + tj]); vfloat sourceG = LVF(gtemp[ti * TS + tj]); vfloat sourceB = LVF(btemp[ti * TS + tj]); - //convert from working to clut profile vfloat x; vfloat y; vfloat z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, v_work2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, v_xyz2clut ); + Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz); + Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut); STVF(rtemp[ti * TS + tj], sourceR); STVF(gtemp[ti * TS + tj], sourceG); @@ -4383,10 +4383,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - //convert from working to clut profile float x, y, z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, work2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2clut ); + Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, work2xyz); + Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, xyz2clut); } } } @@ -4396,10 +4395,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - //apply gamma sRGB (default RT) - sourceR = Color::gamma_srgbclipped( sourceR ); - sourceG = Color::gamma_srgbclipped( sourceG ); - sourceB = Color::gamma_srgbclipped( sourceB ); + // Apply gamma sRGB (default RT) + sourceR = Color::gamma_srgbclipped(sourceR); + sourceG = Color::gamma_srgbclipped(sourceG); + sourceB = Color::gamma_srgbclipped(sourceB); } const std::size_t line_offset = ti * TS; @@ -4417,26 +4416,26 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - // apply inverse gamma sRGB + // Apply inverse gamma sRGB sourceR = Color::igamma_srgb(out_rgbx[tj * 4 + 0]); sourceG = Color::igamma_srgb(out_rgbx[tj * 4 + 1]); sourceB = Color::igamma_srgb(out_rgbx[tj * 4 + 2]); } if (!clutAndWorkingProfilesAreSame) { -#ifdef VECTLENSP - if (!(std::min(TS, tW - jstart) & ~(VECTLENSP - 1))) { - for (int j = jstart, tj = 0; j < tW; j += VECTLENSP, tj += VECTLENSP) { + // Convert from clut to working profile +#ifdef __SSE2__ + if (!(std::min(TS, tW - jstart) & ~3)) { + for (int j = jstart, tj = 0; j < tW; j += 4, tj += 4) { vfloat sourceR = LVF(rtemp[ti * TS + tj]); vfloat sourceG = LVF(gtemp[ti * TS + tj]); vfloat sourceB = LVF(btemp[ti * TS + tj]); - //convert from clut to working profile vfloat x; vfloat y; vfloat z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, v_clut2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, v_xyz2work ); + Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz); + Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work); STVF(rtemp[ti * TS + tj], sourceR); STVF(gtemp[ti * TS + tj], sourceG); @@ -4451,10 +4450,9 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer float &sourceG = gtemp[ti * TS + tj]; float &sourceB = btemp[ti * TS + tj]; - //convert from clut to working profile float x, y, z; - Color::rgbxyz( sourceR, sourceG, sourceB, x, y, z, clut2xyz ); - Color::xyz2rgb( x, y, z, sourceR, sourceG, sourceB, xyz2work ); + Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, clut2xyz); + Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, xyz2work); } } } @@ -4462,7 +4460,7 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } - if(!blackwhite) { + if (!blackwhite) { // ready, fill lab for (int i = istart, ti = 0; i < tH; i++, ti++) { for (int j = jstart, tj = 0; j < tW; j++, tj++) { From 17635cf53565b4fed6fc4cced16cd5dd2ec5e901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sun, 1 May 2016 11:17:40 +0200 Subject: [PATCH 12/15] Add myself to AUTHORS.txt With permission by [Ingo][1] I added myself to AUTHORS.txt. [1]: https://github.com/Beep6581/RawTherapee/pull/3260#issuecomment-212565582 --- AUTHORS.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.txt b/AUTHORS.txt index 8c87a42c3..37ac98993 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -12,6 +12,7 @@ Developement contributors, in last name alphabetical order: Oliver Duis Maciek Dworak Michael Ezra + Flössie Jean-Christophe Frisch Ilias Giarimis Steve Herrell From d530617ae19b72de01b883ae6385c1fc3753ec68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sun, 1 May 2016 20:49:17 +0200 Subject: [PATCH 13/15] Include last remarks from Ingo - Changed `_mm_store_ps` to `STVF` - Increased number of cached CLUTs by factor 1.5 --- rtengine/clutstore.cc | 2 +- rtgui/preferences.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 5e96046ae..e96614e4c 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -229,7 +229,7 @@ void rtengine::HaldCLUT::getRGB( v_out = vintpf(v_b, v_tmp1, v_out); - _mm_store_ps(out_rgbx, vintpf(_mm_load_ps1(&strength), v_out, v_in)); + STVF(*out_rgbx, vintpf(_mm_load_ps1(&strength), v_out, v_in)); #endif } } diff --git a/rtgui/preferences.cc b/rtgui/preferences.cc index 10800d527..14c9cb0eb 100644 --- a/rtgui/preferences.cc +++ b/rtgui/preferences.cc @@ -572,9 +572,9 @@ Gtk::Widget* Preferences::getPerformancePanel () clutCacheSizeSB->set_increments (1, 5); clutCacheSizeSB->set_max_length(2); // Will this be sufficient? :) #ifdef _OPENMP - clutCacheSizeSB->set_range (1, 2 * omp_get_num_procs()); + clutCacheSizeSB->set_range (1, 3 * omp_get_num_procs()); #else - clutCacheSizeSB->set_range (1, 8); + clutCacheSizeSB->set_range (1, 12); #endif clutCacheSizeHB->pack_start (*CLUTLl, Gtk::PACK_SHRINK, 0); clutCacheSizeHB->pack_end (*clutCacheSizeSB, Gtk::PACK_SHRINK, 0); From 2b9f6e3355390ced1a81975f60713f5b0f6cd48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=B6ssie?= Date: Sun, 1 May 2016 21:36:13 +0200 Subject: [PATCH 14/15] Add Ingo's clutspeed.patch --- rtengine/clutstore.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index e96614e4c..a4c177b3d 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -143,6 +143,10 @@ void rtengine::HaldCLUT::getRGB( const unsigned int level_square = level * level; +#ifdef __SSE2__ + const vfloat v_strength = F2V(strength); +#endif + for (std::size_t column = 0; column < line_size; ++column, ++r, ++g, ++b, out_rgbx += 4) { const unsigned int red = std::min(flevel_minus_two, *r * flevel_minus_one); const unsigned int green = std::min(flevel_minus_two, *g * flevel_minus_one); @@ -229,7 +233,7 @@ void rtengine::HaldCLUT::getRGB( v_out = vintpf(v_b, v_tmp1, v_out); - STVF(*out_rgbx, vintpf(_mm_load_ps1(&strength), v_out, v_in)); + STVF(*out_rgbx, vintpf(v_strength, v_out, v_in)); #endif } } From add88cc62c8ba06d801e1488e1777398f952a54b Mon Sep 17 00:00:00 2001 From: Beep6581 Date: Sun, 1 May 2016 22:49:10 +0200 Subject: [PATCH 15/15] astyle'd rtengine/clutstore.cc --- rtengine/clutstore.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index a4c177b3d..ea3a2a7fc 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -31,9 +31,11 @@ bool loadFile( if (fw == fh) { unsigned int level = 1; + while (level * level * level < fw) { ++level; } + if (level * level * level == fw && level > 1) { clut_level = level; res = true; @@ -57,6 +59,7 @@ bool loadFile( AlignedBuffer image(fw * fh * 4 + 1); std::size_t index = 0; + for (int y = 0; y < fh; ++y) { for (int x = 0; x < fw; ++x) { image.data[index] = img_float->r(y, x); @@ -248,6 +251,7 @@ void rtengine::HaldCLUT::splitClutFilename( Glib::ustring basename = Glib::path_get_basename(filename); Glib::ustring::size_type last_slash_pos = basename.rfind('/'); + if (last_slash_pos == Glib::ustring::npos) { last_slash_pos = basename.rfind('\\'); } @@ -284,6 +288,7 @@ std::shared_ptr rtengine::CLUTStore::getClut(const Glib::ust if (!cache.get(filename, result)) { std::unique_ptr clut(new rtengine::HaldCLUT); + if (clut->load(filename)) { result = std::move(clut); cache.insert(filename, result);