From 88685c66f6b2c82810b8ed40b7078424c69d2fd9 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Fri, 17 Jun 2016 17:56:34 +0200 Subject: [PATCH 1/3] SSSE3 code to speed up save to 8bit formats (mainly for 8bit tiff and jpeg) --- rtengine/image16.cc | 121 ++++++++++++++++++++++++++++++++++++-------- rtengine/imageio.cc | 9 ++-- 2 files changed, 104 insertions(+), 26 deletions(-) diff --git a/rtengine/image16.cc b/rtengine/image16.cc index 8ba88bd4c..56f3a4290 100644 --- a/rtengine/image16.cc +++ b/rtengine/image16.cc @@ -56,8 +56,85 @@ void Image16::getScanline (int row, unsigned char* buffer, int bps) } } else if (bps == 8) { int ix = 0; + int i = 0; +#ifdef __SSSE3__ + // process 48 values using SSSE3. Looks like a lot of code, but it only needs about one instruction per value, whereas scalar version needs about five instructions per value + vmask reduceWord2Bytev = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15, 13, 11, 9, 7, 5, 3, 1); + // we need fivev and sixv to reduce the number of registers used for permutation masks from 9 to 6 + vint fivev = _mm_set1_epi8(5); + vint sixv = _mm_set1_epi8(6); - for (int i = 0; i < width; i++) { + for (; i < width - 15; i += 16, ix += 48) { + // generate initial shuffle masks. 
Gaps are set to 0xf0 to allow calculating subsequent masks from previous ones + vint redmaskv = _mm_set_epi8(5, 0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0); + vint greenmaskv = _mm_set_epi8(0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0); + vint bluemaskv = _mm_set_epi8(0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0, 0xf0); + + // load first 8 values for each colour + vint red1v = _mm_loadu_si128((__m128i*)&r(row, i)); + vint green1v = _mm_loadu_si128((__m128i*)&g(row, i)); + vint blue1v = _mm_loadu_si128((__m128i*)&b(row, i)); + + // load second 8 values for each colour + vint red2v = _mm_loadu_si128((__m128i*)&r(row, i + 8)); + vint green2v = _mm_loadu_si128((__m128i*)&g(row, i + 8)); + vint blue2v = _mm_loadu_si128((__m128i*)&b(row, i + 8)); + + // shuffle the high bytes of the values to the lower 64 bit of the register + red1v = _mm_shuffle_epi8(red1v, reduceWord2Bytev); + green1v = _mm_shuffle_epi8(green1v, reduceWord2Bytev); + blue1v = _mm_shuffle_epi8(blue1v, reduceWord2Bytev); + + // shuffle the high bytes of the values to the lower 64 bit of the register + red2v = _mm_shuffle_epi8(red2v, reduceWord2Bytev); + green2v = _mm_shuffle_epi8(green2v, reduceWord2Bytev); + blue2v = _mm_shuffle_epi8(blue2v, reduceWord2Bytev); + + // mix first and second 8 values of each colour together + red1v = (vint)_mm_shuffle_pd((__m128d)red1v, (__m128d)red2v, 0); + green1v = (vint)_mm_shuffle_pd((__m128d)green1v, (__m128d)green2v, 0); + blue1v = (vint)_mm_shuffle_pd((__m128d)blue1v, (__m128d)blue2v, 0); + + // now we have the input in registers => let's generate the output + + // first we need r0g0b0r1g1b1r2g2b2r3g3b3r4g4b4r5 + vint destv = _mm_shuffle_epi8(red1v, redmaskv); + vint greenv = _mm_shuffle_epi8(green1v, greenmaskv); + destv = _mm_or_si128(destv, greenv); + vint bluev = _mm_shuffle_epi8(blue1v, bluemaskv); + destv = _mm_or_si128(destv, bluev); + _mm_storeu_si128((__m128i*) & 
(buffer[ix]), destv); + + // then we need g5b5r6g6b6r7g7b7r8g8b8r9g9b9raga + // we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks + vint tempmaskv = _mm_add_epi8(redmaskv, fivev); + redmaskv = _mm_add_epi8(bluemaskv, sixv); + bluemaskv = _mm_add_epi8(greenmaskv, fivev); + greenmaskv = tempmaskv; + destv = _mm_shuffle_epi8(red1v, redmaskv); + greenv = _mm_shuffle_epi8(green1v, greenmaskv); + destv = _mm_or_si128(destv, greenv); + bluev = _mm_shuffle_epi8(blue1v, bluemaskv); + destv = _mm_or_si128(destv, bluev); + _mm_storeu_si128((__m128i*) & (buffer[ix + 16]), destv); + + // and last one is barbgbbbrcgcbcrdgdbdregeberfgfbf + // we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks + tempmaskv = _mm_add_epi8(greenmaskv, fivev); + greenmaskv = _mm_add_epi8(redmaskv, fivev); + redmaskv = _mm_add_epi8(bluemaskv, sixv); + bluemaskv = tempmaskv; + destv = _mm_shuffle_epi8(red1v, redmaskv); + greenv = _mm_shuffle_epi8(green1v, greenmaskv); + destv = _mm_or_si128(destv, greenv); + bluev = _mm_shuffle_epi8(blue1v, bluemaskv); + destv = _mm_or_si128(destv, bluev); + _mm_storeu_si128((__m128i*) & (buffer[ix + 32]), destv); + } + +#endif + + for (; i < width; i++) { buffer[ix++] = r(row, i) >> 8; buffer[ix++] = g(row, i) >> 8; buffer[ix++] = b(row, i) >> 8; @@ -80,34 +157,34 @@ void Image16::setScanline (int row, unsigned char* buffer, int bps, float *minVa assert(!minValue); switch (sampleFormat) { - case (IIOSF_UNSIGNED_CHAR): { - int ix = 0; + case (IIOSF_UNSIGNED_CHAR): { + int ix = 0; - for (int i = 0; i < width; i++) { - r(row, i) = (unsigned short)(buffer[ix++]) << 8; - g(row, i) = (unsigned short)(buffer[ix++]) << 8; - b(row, i) = (unsigned short)(buffer[ix++]) << 8; + for (int i = 0; i < width; i++) { + r(row, i) = (unsigned short)(buffer[ix++]) << 8; + g(row, i) = (unsigned short)(buffer[ix++]) << 8; + 
b(row, i) = (unsigned short)(buffer[ix++]) << 8; + } + + break; } - break; - } + case (IIOSF_UNSIGNED_SHORT): { + unsigned short* sbuffer = (unsigned short*) buffer; + int ix = 0; - case (IIOSF_UNSIGNED_SHORT): { - unsigned short* sbuffer = (unsigned short*) buffer; - int ix = 0; + for (int i = 0; i < width; i++) { + r(row, i) = sbuffer[ix++]; + g(row, i) = sbuffer[ix++]; + b(row, i) = sbuffer[ix++]; + } - for (int i = 0; i < width; i++) { - r(row, i) = sbuffer[ix++]; - g(row, i) = sbuffer[ix++]; - b(row, i) = sbuffer[ix++]; + break; } - break; - } - - default: - // Other type are ignored, but could be implemented if necessary - break; + default: + // Other type are ignored, but could be implemented if necessary + break; } /* diff --git a/rtengine/imageio.cc b/rtengine/imageio.cc index 4fa5a5a93..2c779a99b 100644 --- a/rtengine/imageio.cc +++ b/rtengine/imageio.cc @@ -41,7 +41,8 @@ #include "color.h" #include "jpeg.h" - +#define BENCHMARK +#include "StopWatch.h" using namespace std; using namespace rtengine; using namespace rtengine::procparams; @@ -917,7 +918,7 @@ int ImageIO::loadPPMFromMemory(const char* buffer, int width, int height, bool s int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps) { - +BENCHFUN FILE *file = g_fopen_withBinaryAndLock (fname); if (!file) { @@ -1011,7 +1012,7 @@ int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps) // Quality 0..100, subsampling: 1=low quality, 2=medium, 3=high int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp) { - +BENCHFUN FILE *file = g_fopen_withBinaryAndLock (fname); if (!file) { @@ -1198,7 +1199,7 @@ int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp) int ImageIO::saveTIFF (Glib::ustring fname, int bps, bool uncompressed) { - +BENCHFUN //TODO: Handling 32 bits floating point output images! 
bool writeOk = true; int width = getW (); From aab8bad3916cb6fba9e0da78e21693712e004d42 Mon Sep 17 00:00:00 2001 From: heckflosse Date: Sat, 18 Jun 2016 14:04:00 +0200 Subject: [PATCH 2/3] Let the compiler optimize instead of using handwritten SSSE3 code, same speed --- rtengine/image16.cc | 117 ++++++++++---------------------------------- 1 file changed, 25 insertions(+), 92 deletions(-) diff --git a/rtengine/image16.cc b/rtengine/image16.cc index 56f3a4290..5f03ede9b 100644 --- a/rtengine/image16.cc +++ b/rtengine/image16.cc @@ -23,6 +23,29 @@ #include #include "rtengine.h" +namespace +{ + +void getScanline8 (uint16_t *red, uint16_t *green, uint16_t *blue, int width, unsigned char* buffer) +{ + for (int i = 0, ix = 0; i < width; i++) { + buffer[ix++] = red[i] >> 8; + buffer[ix++] = green[i] >> 8; + buffer[ix++] = blue[i] >> 8; + } +} + +void getScanline16 (uint16_t *red, uint16_t *green, uint16_t *blue, int width, unsigned short* buffer) +{ + for (int i = 0, ix = 0; i < width; i++) { + buffer[ix++] = red[i]; + buffer[ix++] = green[i]; + buffer[ix++] = blue[i]; + } +} + +} + using namespace rtengine; Image16::Image16 () @@ -46,99 +69,9 @@ void Image16::getScanline (int row, unsigned char* buffer, int bps) } if (bps == 16) { - int ix = 0; - unsigned short* sbuffer = (unsigned short*) buffer; - - for (int i = 0; i < width; i++) { - sbuffer[ix++] = r(row, i); - sbuffer[ix++] = g(row, i); - sbuffer[ix++] = b(row, i); - } + getScanline16 (&r(row, 0), &g(row, 0), &b(row, 0), width, (unsigned short*)buffer); } else if (bps == 8) { - int ix = 0; - int i = 0; -#ifdef __SSSE3__ - // process 48 values using SSSE3. 
Looks like a lot of code, but it only needs about one instruction per value, whereas scalar version needs about five instructions per value - vmask reduceWord2Bytev = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15, 13, 11, 9, 7, 5, 3, 1); - // we need fivev and sixv to reduce the number of registers used for permutation masks from 9 to 6 - vint fivev = _mm_set1_epi8(5); - vint sixv = _mm_set1_epi8(6); - - for (; i < width - 15; i += 16, ix += 48) { - // generate initial shuffle masks. Gaps are set to 0xf0 to allow calculating subsequent masks from previous ones - vint redmaskv = _mm_set_epi8(5, 0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0); - vint greenmaskv = _mm_set_epi8(0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0); - vint bluemaskv = _mm_set_epi8(0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0, 0xf0); - - // load first 8 values for each colour - vint red1v = _mm_loadu_si128((__m128i*)&r(row, i)); - vint green1v = _mm_loadu_si128((__m128i*)&g(row, i)); - vint blue1v = _mm_loadu_si128((__m128i*)&b(row, i)); - - // load second 8 values for each colour - vint red2v = _mm_loadu_si128((__m128i*)&r(row, i + 8)); - vint green2v = _mm_loadu_si128((__m128i*)&g(row, i + 8)); - vint blue2v = _mm_loadu_si128((__m128i*)&b(row, i + 8)); - - // shuffle the high bytes of the values to the lower 64 bit of the register - red1v = _mm_shuffle_epi8(red1v, reduceWord2Bytev); - green1v = _mm_shuffle_epi8(green1v, reduceWord2Bytev); - blue1v = _mm_shuffle_epi8(blue1v, reduceWord2Bytev); - - // shuffle the high bytes of the values to the lower 64 bit of the register - red2v = _mm_shuffle_epi8(red2v, reduceWord2Bytev); - green2v = _mm_shuffle_epi8(green2v, reduceWord2Bytev); - blue2v = _mm_shuffle_epi8(blue2v, reduceWord2Bytev); - - // mix first and second 8 values of each colour together - red1v = (vint)_mm_shuffle_pd((__m128d)red1v, (__m128d)red2v, 0); - green1v = 
(vint)_mm_shuffle_pd((__m128d)green1v, (__m128d)green2v, 0); - blue1v = (vint)_mm_shuffle_pd((__m128d)blue1v, (__m128d)blue2v, 0); - - // now we have the input in registers => let's generate the output - - // first we need r0g0b0r1g1b1r2g2b2r3g3b3r4g4b4r5 - vint destv = _mm_shuffle_epi8(red1v, redmaskv); - vint greenv = _mm_shuffle_epi8(green1v, greenmaskv); - destv = _mm_or_si128(destv, greenv); - vint bluev = _mm_shuffle_epi8(blue1v, bluemaskv); - destv = _mm_or_si128(destv, bluev); - _mm_storeu_si128((__m128i*) & (buffer[ix]), destv); - - // then we need g5b5r6g6b6r7g7b7r8g8b8r9g9b9raga - // we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks - vint tempmaskv = _mm_add_epi8(redmaskv, fivev); - redmaskv = _mm_add_epi8(bluemaskv, sixv); - bluemaskv = _mm_add_epi8(greenmaskv, fivev); - greenmaskv = tempmaskv; - destv = _mm_shuffle_epi8(red1v, redmaskv); - greenv = _mm_shuffle_epi8(green1v, greenmaskv); - destv = _mm_or_si128(destv, greenv); - bluev = _mm_shuffle_epi8(blue1v, bluemaskv); - destv = _mm_or_si128(destv, bluev); - _mm_storeu_si128((__m128i*) & (buffer[ix + 16]), destv); - - // and last one is barbgbbbrcgcbcrdgdbdregeberfgfbf - // we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks - tempmaskv = _mm_add_epi8(greenmaskv, fivev); - greenmaskv = _mm_add_epi8(redmaskv, fivev); - redmaskv = _mm_add_epi8(bluemaskv, sixv); - bluemaskv = tempmaskv; - destv = _mm_shuffle_epi8(red1v, redmaskv); - greenv = _mm_shuffle_epi8(green1v, greenmaskv); - destv = _mm_or_si128(destv, greenv); - bluev = _mm_shuffle_epi8(blue1v, bluemaskv); - destv = _mm_or_si128(destv, bluev); - _mm_storeu_si128((__m128i*) & (buffer[ix + 32]), destv); - } - -#endif - - for (; i < width; i++) { - buffer[ix++] = r(row, i) >> 8; - buffer[ix++] = g(row, i) >> 8; - buffer[ix++] = b(row, i) >> 8; - } + getScanline8 (&r(row, 0), &g(row, 0), 
&b(row, 0), width, buffer); } } From e4a7287650de4010905e285c27a90e48660345fd Mon Sep 17 00:00:00 2001 From: heckflosse Date: Sat, 18 Jun 2016 16:04:54 +0200 Subject: [PATCH 3/3] Removed StopWatches and cleaned code --- rtengine/image16.cc | 17 ++++++++--------- rtengine/imageio.cc | 16 +++++++++------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/rtengine/image16.cc b/rtengine/image16.cc index 5f03ede9b..937e2d19d 100644 --- a/rtengine/image16.cc +++ b/rtengine/image16.cc @@ -19,14 +19,13 @@ #include "image16.h" #include "imagefloat.h" #include "image8.h" -#include #include #include "rtengine.h" namespace { -void getScanline8 (uint16_t *red, uint16_t *green, uint16_t *blue, int width, unsigned char* buffer) +void getScanline8 (const uint16_t *red, const uint16_t *green, const uint16_t *blue, int width, unsigned char* buffer) { for (int i = 0, ix = 0; i < width; i++) { buffer[ix++] = red[i] >> 8; @@ -35,7 +34,7 @@ void getScanline8 (uint16_t *red, uint16_t *green, uint16_t *blue, int width, un } } -void getScanline16 (uint16_t *red, uint16_t *green, uint16_t *blue, int width, unsigned short* buffer) +void getScanline16 (const uint16_t *red, const uint16_t *green, const uint16_t *blue, int width, unsigned short* buffer) { for (int i = 0, ix = 0; i < width; i++) { buffer[ix++] = red[i]; @@ -64,14 +63,14 @@ Image16::~Image16 () void Image16::getScanline (int row, unsigned char* buffer, int bps) { - if (data == NULL) { + if (data == nullptr) { return; } if (bps == 16) { - getScanline16 (&r(row, 0), &g(row, 0), &b(row, 0), width, (unsigned short*)buffer); + getScanline16 (r(row), g(row), b(row), width, (unsigned short*)buffer); } else if (bps == 8) { - getScanline8 (&r(row, 0), &g(row, 0), &b(row, 0), width, buffer); + getScanline8 (r(row), g(row), b(row), width, buffer); } } @@ -82,11 +81,11 @@ void Image16::getScanline (int row, unsigned char* buffer, int bps) void Image16::setScanline (int row, unsigned char* buffer, int bps, float *minValue, float 
*maxValue) { - if (data == NULL) { + if (data == nullptr) { return; } - // For optimization purpose, we're assuming that this class never have to provide min/max bound + // For optimization purpose, we're assuming that this class never has to provide min/max bounds assert(!minValue); switch (sampleFormat) { @@ -116,7 +115,7 @@ void Image16::setScanline (int row, unsigned char* buffer, int bps, float *minVa } default: - // Other type are ignored, but could be implemented if necessary + // Other types are ignored, but could be implemented if necessary break; } diff --git a/rtengine/imageio.cc b/rtengine/imageio.cc index 2c779a99b..3ad8faee0 100644 --- a/rtengine/imageio.cc +++ b/rtengine/imageio.cc @@ -41,8 +41,7 @@ #include "color.h" #include "jpeg.h" -#define BENCHMARK -#include "StopWatch.h" + using namespace std; using namespace rtengine; using namespace rtengine::procparams; @@ -62,6 +61,7 @@ FILE* g_fopen_withBinaryAndLock(const Glib::ustring& fname) std::unique_ptr wfname (reinterpret_cast(g_utf8_to_utf16 (fname.c_str (), -1, NULL, NULL, NULL)), g_free); HANDLE hFile = CreateFileW ( wfname.get (), GENERIC_READ | GENERIC_WRITE, 0 /* no sharing allowed */, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile != INVALID_HANDLE_VALUE) { f = _fdopen (_open_osfhandle ((intptr_t)hFile, 0), "wb"); } @@ -918,7 +918,7 @@ int ImageIO::loadPPMFromMemory(const char* buffer, int width, int height, bool s int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps) { -BENCHFUN + FILE *file = g_fopen_withBinaryAndLock (fname); if (!file) { @@ -1012,7 +1012,7 @@ BENCHFUN // Quality 0..100, subsampling: 1=low quality, 2=medium, 3=high int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp) { -BENCHFUN + FILE *file = g_fopen_withBinaryAndLock (fname); if (!file) { @@ -1199,7 +1199,7 @@ BENCHFUN int ImageIO::saveTIFF (Glib::ustring fname, int bps, bool uncompressed) { -BENCHFUN + //TODO: Handling 32 bits floating point output images! 
bool writeOk = true; int width = getW (); @@ -1228,9 +1228,11 @@ BENCHFUN // buffer for the exif and iptc int bufferSize = 165535; //TODO: Is it really 165535... or 65535 ? - if(profileData) + + if(profileData) { bufferSize += profileLength; - + } + unsigned char* buffer = new unsigned char[bufferSize]; unsigned char* iptcdata = NULL; unsigned int iptclen = 0;