diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc index 692dcbcb9..c9d4645a2 100644 --- a/rtengine/clutstore.cc +++ b/rtengine/clutstore.cc @@ -164,92 +164,94 @@ Glib::ustring rtengine::HaldCLUT::getProfile() const return clut_profile; } -void rtengine::HaldCLUT::getRGB(float r, float g, float b, float out_rgbx[4]) const +void rtengine::HaldCLUT::getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const { const unsigned int level = clut_level; // This is important - const unsigned int red = std::min(flevel_minus_two, r * flevel_minus_one); - const unsigned int green = std::min(flevel_minus_two, g * flevel_minus_one); - const unsigned int blue = std::min(flevel_minus_two, b * flevel_minus_one); - const unsigned int level_square = level * level; - const unsigned int color = red + green * level + blue * level_square; + for (std::size_t column = 0; column < line_size; ++column, ++r, ++g, ++b, out_rgbx += 4) { + const unsigned int red = std::min(flevel_minus_two, *r * flevel_minus_one); + const unsigned int green = std::min(flevel_minus_two, *g * flevel_minus_one); + const unsigned int blue = std::min(flevel_minus_two, *b * flevel_minus_one); + + const unsigned int color = red + green * level + blue * level_square; #ifndef __SSE2__ - r = r * flevel_minus_one - red; - g = g * flevel_minus_one - green; - b = b * flevel_minus_one - blue; + const float re = *r * flevel_minus_one - red; + const float gr = *g * flevel_minus_one - green; + const float bl = *b * flevel_minus_one - blue; - size_t index = color * 4; + size_t index = color * 4; - float tmp1[4] ALIGNED16; - tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + float tmp1[4] ALIGNED16; + tmp1[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - index = (color + level) * 4; + index = (color + level) * 4; - float tmp2[4] ALIGNED16; - tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + float tmp2[4] ALIGNED16; + tmp2[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - out_rgbx[0] = intp(g, tmp2[0], tmp1[0]); - out_rgbx[1] = intp(g, tmp2[1], tmp1[1]); - out_rgbx[2] = intp(g, tmp2[2], tmp1[2]); + out_rgbx[0] = intp(gr, tmp2[0], tmp1[0]); + out_rgbx[1] = intp(gr, tmp2[1], tmp1[1]); + out_rgbx[2] = intp(gr, tmp2[2], tmp1[2]); - index = (color + level_square) * 4; + index = (color + level_square) * 4; - tmp1[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp1[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp1[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + tmp1[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp1[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp1[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - index = (color + level + level_square) * 4; + index = (color + level + level_square) * 4; - tmp2[0] = intp(r, clut_image.data[index + 4], clut_image.data[index]); - tmp2[1] = intp(r, clut_image.data[index + 5], clut_image.data[index + 1]); - tmp2[2] = intp(r, clut_image.data[index + 6], clut_image.data[index + 2]); + tmp2[0] = intp(re, clut_image.data[index + 4], clut_image.data[index]); + tmp2[1] = intp(re, clut_image.data[index + 5], clut_image.data[index + 1]); + tmp2[2] = intp(re, clut_image.data[index + 6], clut_image.data[index + 2]); - tmp1[0] = intp(g, tmp2[0], tmp1[0]); - tmp1[1] = intp(g, tmp2[1], tmp1[1]); - tmp1[2] = intp(g, tmp2[2], tmp1[2]); + tmp1[0] = intp(gr, tmp2[0], tmp1[0]); + tmp1[1] = intp(gr, tmp2[1], tmp1[1]); + tmp1[2] = intp(gr, tmp2[2], tmp1[2]); - out_rgbx[0] = intp(b, tmp1[0], out_rgbx[0]); - out_rgbx[1] = intp(b, tmp1[1], out_rgbx[1]); - out_rgbx[2] = intp(b, tmp1[2], out_rgbx[2]); + out_rgbx[0] = intp(bl, tmp1[0], out_rgbx[0]); + out_rgbx[1] = intp(bl, tmp1[1], out_rgbx[1]); + out_rgbx[2] = intp(bl, tmp1[2], out_rgbx[2]); #else - const vfloat v_tmp = _mm_set_ps(0.0f, b, g, r) * _mm_load_ps1(&flevel_minus_one); - const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); + const vfloat v_tmp = _mm_set_ps(0.0f, *b, *g, *r) * _mm_load_ps1(&flevel_minus_one); + const vfloat v_rgb = v_tmp - _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_min_ps(_mm_load_ps1(&flevel_minus_two), v_tmp))); - size_t index = color * 4; + size_t index = color * 4; - const vfloat v_r = PERMUTEPS(v_rgb, 0x00); + const vfloat v_r = PERMUTEPS(v_rgb, 0x00); - vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - index = (color + level) * 4; + index = (color + level) * 4; - vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - const vfloat v_g = PERMUTEPS(v_rgb, 0x55); + const vfloat v_g = PERMUTEPS(v_rgb, 0x55); - vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); + vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1); - index = (color + level_square) * 4; + index = (color + level_square) * 4; - v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - index = (color + level + level_square) * 4; + index = (color + level + level_square) * 4; - v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); + v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index)); - v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); + v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1); - const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); + const vfloat v_b = PERMUTEPS(v_rgb, 0xAA); - _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); + _mm_store_ps(out_rgbx, vintpf(v_b, v_tmp1, v_out)); #endif + } } rtengine::CLUTStore& rtengine::CLUTStore::getInstance() diff --git a/rtengine/clutstore.h b/rtengine/clutstore.h index ed3491fbe..6203e4e61 100644 --- a/rtengine/clutstore.h +++ b/rtengine/clutstore.h @@ -24,7 +24,7 @@ public: virtual Glib::ustring getFilename() const = 0; virtual Glib::ustring getProfile() const = 0; - virtual void getRGB(float r, float g, float b, float out_rgbx[4]) const = 0; + virtual void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const = 0; static void splitClutFilename( const Glib::ustring& filename, @@ -48,7 +48,7 @@ public: Glib::ustring getFilename() const override; Glib::ustring getProfile() const override; - void getRGB(float r, float g, float b, float out_rgbx[4]) const override; + void getRGB(std::size_t line_size, const float* r, const float* g, const float* b, float* out_rgbx) const override; private: AlignedBuffer clut_image; diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc index 2e15da916..d274806bb 100644 --- a/rtengine/improcfun.cc +++ b/rtengine/improcfun.cc @@ -16,6 +16,7 @@ * You should have received a copy of the GNU General Public License * along with RawTherapee. If not, see . */ +#include #include #include #include @@ -3224,8 +3225,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer } } - double filmSimCorrectedStrength = double(params->filmSimulation.strength) / 100.; - double filmSimSourceStrength = double(100 - params->filmSimulation.strength) / 100.; + float filmSimCorrectedStrength = static_cast(params->filmSimulation.strength) / 100.0f; + float filmSimSourceStrength = 1.0f - filmSimCorrectedStrength; const float exp_scale = pow (2.0, expcomp); const float comp = (max(0.0, expcomp) + 1.0) * hlcompr / 100.0; @@ -4354,13 +4355,24 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer sourceR = CLIP( Color::gamma_srgb( sourceR ) ); sourceG = CLIP( Color::gamma_srgb( sourceG ) ); sourceB = CLIP( Color::gamma_srgb( sourceB ) ); + } + + const std::size_t line_size = std::min(TS, tW - jstart); + std::size_t out_rgbx_size = 4 * (line_size + 16); + std::unique_ptr out_rgbx_buf(new float[out_rgbx_size]); + void* out_rgbx_ptr = out_rgbx_buf.get(); + float* const out_rgbx = reinterpret_cast(std::align(16, 4 * line_size, out_rgbx_ptr, out_rgbx_size)); + colorLUT->getRGB(line_size, rtemp + ti * TS, gtemp + ti * TS, btemp + ti * TS, out_rgbx); + + for (int j = jstart, tj = 0; j < tW; j++, tj++) { + float &sourceR = rtemp[ti * TS + tj]; + float &sourceG = gtemp[ti * TS + tj]; + float &sourceB = btemp[ti * TS + tj]; - float out_rgbx[4] ALIGNED16; - colorLUT->getRGB( sourceR, sourceG, sourceB, out_rgbx ); // apply strength - sourceR = out_rgbx[0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; - sourceG = out_rgbx[1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; - sourceB = out_rgbx[2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; + sourceR = out_rgbx[tj * 4 + 0] * filmSimCorrectedStrength + sourceR * filmSimSourceStrength; + sourceG = out_rgbx[tj * 4 + 1] * filmSimCorrectedStrength + sourceG * filmSimSourceStrength; + sourceB = out_rgbx[tj * 4 + 2] * filmSimCorrectedStrength + sourceB * filmSimSourceStrength; // apply inverse gamma sRGB sourceR = Color::igamma_srgb( sourceR ); sourceG = Color::igamma_srgb( sourceG );