SSSE3 code to speed up saving to 8-bit formats (mainly for 8-bit TIFF and JPEG)
This commit is contained in:
parent
da9e760403
commit
88685c66f6
@ -56,8 +56,85 @@ void Image16::getScanline (int row, unsigned char* buffer, int bps)
|
|||||||
}
|
}
|
||||||
} else if (bps == 8) {
|
} else if (bps == 8) {
|
||||||
int ix = 0;
|
int ix = 0;
|
||||||
|
int i = 0;
|
||||||
|
#ifdef __SSSE3__
|
||||||
|
// process 48 values using SSSE3. Looks like a lot of code, but it only needs about one instruction per value, whereas scalar version needs about five instructions per value
|
||||||
|
vmask reduceWord2Bytev = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15, 13, 11, 9, 7, 5, 3, 1);
|
||||||
|
// we need fivev and sixv to reduce the number of registers used for permutation masks from 9 to 6
|
||||||
|
vint fivev = _mm_set1_epi8(5);
|
||||||
|
vint sixv = _mm_set1_epi8(6);
|
||||||
|
|
||||||
for (int i = 0; i < width; i++) {
|
for (; i < width - 15; i += 16, ix += 48) {
|
||||||
|
// generate initial shuffle masks. Gaps are set to 0xf0 to allow calculating subsequent masks from previous ones
|
||||||
|
vint redmaskv = _mm_set_epi8(5, 0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0);
|
||||||
|
vint greenmaskv = _mm_set_epi8(0xf0, 0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0);
|
||||||
|
vint bluemaskv = _mm_set_epi8(0xf0, 4, 0xf0, 0xf0, 3, 0xf0, 0xf0, 2, 0xf0, 0xf0, 1, 0xf0, 0xf0, 0, 0xf0, 0xf0);
|
||||||
|
|
||||||
|
// load first 8 values for each colour
|
||||||
|
vint red1v = _mm_loadu_si128((__m128i*)&r(row, i));
|
||||||
|
vint green1v = _mm_loadu_si128((__m128i*)&g(row, i));
|
||||||
|
vint blue1v = _mm_loadu_si128((__m128i*)&b(row, i));
|
||||||
|
|
||||||
|
// load second 8 values for each colour
|
||||||
|
vint red2v = _mm_loadu_si128((__m128i*)&r(row, i + 8));
|
||||||
|
vint green2v = _mm_loadu_si128((__m128i*)&g(row, i + 8));
|
||||||
|
vint blue2v = _mm_loadu_si128((__m128i*)&b(row, i + 8));
|
||||||
|
|
||||||
|
// shuffle the high bytes of the values to the lower 64 bit of the register
|
||||||
|
red1v = _mm_shuffle_epi8(red1v, reduceWord2Bytev);
|
||||||
|
green1v = _mm_shuffle_epi8(green1v, reduceWord2Bytev);
|
||||||
|
blue1v = _mm_shuffle_epi8(blue1v, reduceWord2Bytev);
|
||||||
|
|
||||||
|
// shuffle the high bytes of the values to the lower 64 bit of the register
|
||||||
|
red2v = _mm_shuffle_epi8(red2v, reduceWord2Bytev);
|
||||||
|
green2v = _mm_shuffle_epi8(green2v, reduceWord2Bytev);
|
||||||
|
blue2v = _mm_shuffle_epi8(blue2v, reduceWord2Bytev);
|
||||||
|
|
||||||
|
// mix first and second 8 values of each colour together
|
||||||
|
red1v = (vint)_mm_shuffle_pd((__m128d)red1v, (__m128d)red2v, 0);
|
||||||
|
green1v = (vint)_mm_shuffle_pd((__m128d)green1v, (__m128d)green2v, 0);
|
||||||
|
blue1v = (vint)_mm_shuffle_pd((__m128d)blue1v, (__m128d)blue2v, 0);
|
||||||
|
|
||||||
|
// now we have the input in registers => let's generate the output
|
||||||
|
|
||||||
|
// first we need r0g0b0r1g1b1r2g2b2r3g3b3r4g4b4r5
|
||||||
|
vint destv = _mm_shuffle_epi8(red1v, redmaskv);
|
||||||
|
vint greenv = _mm_shuffle_epi8(green1v, greenmaskv);
|
||||||
|
destv = _mm_or_si128(destv, greenv);
|
||||||
|
vint bluev = _mm_shuffle_epi8(blue1v, bluemaskv);
|
||||||
|
destv = _mm_or_si128(destv, bluev);
|
||||||
|
_mm_storeu_si128((__m128i*) & (buffer[ix]), destv);
|
||||||
|
|
||||||
|
// then we need g5b5r6g6b6r7g7b7r8g8b8r9g9b9raga
|
||||||
|
// we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks
|
||||||
|
vint tempmaskv = _mm_add_epi8(redmaskv, fivev);
|
||||||
|
redmaskv = _mm_add_epi8(bluemaskv, sixv);
|
||||||
|
bluemaskv = _mm_add_epi8(greenmaskv, fivev);
|
||||||
|
greenmaskv = tempmaskv;
|
||||||
|
destv = _mm_shuffle_epi8(red1v, redmaskv);
|
||||||
|
greenv = _mm_shuffle_epi8(green1v, greenmaskv);
|
||||||
|
destv = _mm_or_si128(destv, greenv);
|
||||||
|
bluev = _mm_shuffle_epi8(blue1v, bluemaskv);
|
||||||
|
destv = _mm_or_si128(destv, bluev);
|
||||||
|
_mm_storeu_si128((__m128i*) & (buffer[ix + 16]), destv);
|
||||||
|
|
||||||
|
// and last one is barbgbbbrcgcbcrdgdbdregeberfgfbf
|
||||||
|
// we can calculate the shuffle masks from previous ones => needs only 6 instead of 9 registers to handle the 9 different shuffle masks
|
||||||
|
tempmaskv = _mm_add_epi8(greenmaskv, fivev);
|
||||||
|
greenmaskv = _mm_add_epi8(redmaskv, fivev);
|
||||||
|
redmaskv = _mm_add_epi8(bluemaskv, sixv);
|
||||||
|
bluemaskv = tempmaskv;
|
||||||
|
destv = _mm_shuffle_epi8(red1v, redmaskv);
|
||||||
|
greenv = _mm_shuffle_epi8(green1v, greenmaskv);
|
||||||
|
destv = _mm_or_si128(destv, greenv);
|
||||||
|
bluev = _mm_shuffle_epi8(blue1v, bluemaskv);
|
||||||
|
destv = _mm_or_si128(destv, bluev);
|
||||||
|
_mm_storeu_si128((__m128i*) & (buffer[ix + 32]), destv);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; i < width; i++) {
|
||||||
buffer[ix++] = r(row, i) >> 8;
|
buffer[ix++] = r(row, i) >> 8;
|
||||||
buffer[ix++] = g(row, i) >> 8;
|
buffer[ix++] = g(row, i) >> 8;
|
||||||
buffer[ix++] = b(row, i) >> 8;
|
buffer[ix++] = b(row, i) >> 8;
|
||||||
@ -80,34 +157,34 @@ void Image16::setScanline (int row, unsigned char* buffer, int bps, float *minVa
|
|||||||
assert(!minValue);
|
assert(!minValue);
|
||||||
|
|
||||||
switch (sampleFormat) {
|
switch (sampleFormat) {
|
||||||
case (IIOSF_UNSIGNED_CHAR): {
|
case (IIOSF_UNSIGNED_CHAR): {
|
||||||
int ix = 0;
|
int ix = 0;
|
||||||
|
|
||||||
for (int i = 0; i < width; i++) {
|
for (int i = 0; i < width; i++) {
|
||||||
r(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
r(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
||||||
g(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
g(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
||||||
b(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
b(row, i) = (unsigned short)(buffer[ix++]) << 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
case (IIOSF_UNSIGNED_SHORT): {
|
||||||
}
|
unsigned short* sbuffer = (unsigned short*) buffer;
|
||||||
|
int ix = 0;
|
||||||
|
|
||||||
case (IIOSF_UNSIGNED_SHORT): {
|
for (int i = 0; i < width; i++) {
|
||||||
unsigned short* sbuffer = (unsigned short*) buffer;
|
r(row, i) = sbuffer[ix++];
|
||||||
int ix = 0;
|
g(row, i) = sbuffer[ix++];
|
||||||
|
b(row, i) = sbuffer[ix++];
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < width; i++) {
|
break;
|
||||||
r(row, i) = sbuffer[ix++];
|
|
||||||
g(row, i) = sbuffer[ix++];
|
|
||||||
b(row, i) = sbuffer[ix++];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
default:
|
||||||
}
|
// Other type are ignored, but could be implemented if necessary
|
||||||
|
break;
|
||||||
default:
|
|
||||||
// Other type are ignored, but could be implemented if necessary
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -41,7 +41,8 @@
|
|||||||
#include "color.h"
|
#include "color.h"
|
||||||
|
|
||||||
#include "jpeg.h"
|
#include "jpeg.h"
|
||||||
|
#define BENCHMARK
|
||||||
|
#include "StopWatch.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace rtengine;
|
using namespace rtengine;
|
||||||
using namespace rtengine::procparams;
|
using namespace rtengine::procparams;
|
||||||
@ -917,7 +918,7 @@ int ImageIO::loadPPMFromMemory(const char* buffer, int width, int height, bool s
|
|||||||
|
|
||||||
int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps)
|
int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps)
|
||||||
{
|
{
|
||||||
|
BENCHFUN
|
||||||
FILE *file = g_fopen_withBinaryAndLock (fname);
|
FILE *file = g_fopen_withBinaryAndLock (fname);
|
||||||
|
|
||||||
if (!file) {
|
if (!file) {
|
||||||
@ -1011,7 +1012,7 @@ int ImageIO::savePNG (Glib::ustring fname, int compression, volatile int bps)
|
|||||||
// Quality 0..100, subsampling: 1=low quality, 2=medium, 3=high
|
// Quality 0..100, subsampling: 1=low quality, 2=medium, 3=high
|
||||||
int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp)
|
int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp)
|
||||||
{
|
{
|
||||||
|
BENCHFUN
|
||||||
FILE *file = g_fopen_withBinaryAndLock (fname);
|
FILE *file = g_fopen_withBinaryAndLock (fname);
|
||||||
|
|
||||||
if (!file) {
|
if (!file) {
|
||||||
@ -1198,7 +1199,7 @@ int ImageIO::saveJPEG (Glib::ustring fname, int quality, int subSamp)
|
|||||||
|
|
||||||
int ImageIO::saveTIFF (Glib::ustring fname, int bps, bool uncompressed)
|
int ImageIO::saveTIFF (Glib::ustring fname, int bps, bool uncompressed)
|
||||||
{
|
{
|
||||||
|
BENCHFUN
|
||||||
//TODO: Handling 32 bits floating point output images!
|
//TODO: Handling 32 bits floating point output images!
|
||||||
bool writeOk = true;
|
bool writeOk = true;
|
||||||
int width = getW ();
|
int width = getW ();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user