Better UNICODE support (UCS-2/UTF-8) for Exif.UserComment (#2017)
- BOM is now checked and correctly handled - auto-detection of a UTF-8 string if no BOM is available, otherwise assume it's a UCS-2/UTF-16 string - try to autodetect the endianness of a UTF-16 string by counting zeros - possibility to enable writing a BOM for this field (disabled for now) - for an undefined charset (empty identifier), RT now assumes that the string is in what glib thinks is the local charset (not tested), and tries to convert it to UTF-8 for display/editing. JIS is still not handled though.
This commit is contained in:
@@ -1928,24 +1928,48 @@ void Tag::initInt (int data, TagType t, int cnt)
|
|||||||
setInt (data, 0, t);
|
setInt (data, 0, t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Swap the two bytes of every 16-bit unit in `buffer`, in place.
 *
 * @param buffer  bytes to process; nothing is done when it is null
 * @param count   size of the buffer in bytes; when it is odd, the trailing
 *                unpaired byte is left untouched
 */
void Tag::swapByteOrder2(char *buffer, int count)
{
    if (!buffer) {
        return;
    }

    // Bound on i + 1 so an odd count never reads or writes past the end of the buffer.
    for (int i = 0; i + 1 < count; i += 2) {
        const char first = buffer[i];
        buffer[i] = buffer[i + 1];
        buffer[i + 1] = first;
    }
}
|
||||||
void Tag::initUserComment (const Glib::ustring &text)
|
void Tag::initUserComment (const Glib::ustring &text)
|
||||||
{
|
{
|
||||||
|
const bool useBOM = false; // set it to true if you want to output BOM in UCS-2/UTF-8 UserComments ; this could be turned to an options entry
|
||||||
type = UNDEFINED;
|
type = UNDEFINED;
|
||||||
if (text.is_ascii()) {
|
if (text.is_ascii()) {
|
||||||
count = 8 + strlen (text.c_str());
|
valuesize = count = 8 + strlen (text.c_str());
|
||||||
valuesize = count;
|
|
||||||
value = new unsigned char[valuesize];
|
value = new unsigned char[valuesize];
|
||||||
strcpy ((char*)value, "ASCII");
|
memcpy(value, "ASCII\0\0\0", 8);
|
||||||
value[5] = value[6] = value[7] = 0;
|
memcpy(value + 8, text.c_str(), valuesize - 8);
|
||||||
strcpy ((char*)value + 8, text.c_str());
|
|
||||||
} else {
|
} else {
|
||||||
wchar_t *commentStr = (wchar_t*)g_utf8_to_utf16 (text.c_str(), -1, NULL, NULL, NULL);
|
wchar_t *commentStr = (wchar_t*)g_utf8_to_utf16 (text.c_str(), -1, nullptr, nullptr, nullptr);
|
||||||
count = 8 + wcslen(commentStr)*2;
|
size_t wcStrSize = wcslen(commentStr);
|
||||||
valuesize = count;
|
valuesize = count = wcStrSize * 2 + 8 + (useBOM ? 2 : 0);
|
||||||
value = (unsigned char*)new char[valuesize];
|
value = new unsigned char[valuesize];
|
||||||
strcpy ((char*)value, "UNICODE");
|
memcpy(value, "UNICODE\0", 8);
|
||||||
value[7] = 0;
|
|
||||||
wcscpy(((wchar_t*)value) + 4, commentStr);
|
if (useBOM) {
|
||||||
|
if (getOrder() == INTEL) { //Little Endian
|
||||||
|
value[8] = 0xFF;
|
||||||
|
value[9] = 0xFE;
|
||||||
|
} else {
|
||||||
|
value[8] = 0xFE;
|
||||||
|
value[9] = 0xFF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Swapping byte order to match the Exif's byte order
|
||||||
|
if (getOrder() != HOSTORDER) {
|
||||||
|
swapByteOrder2((char*)commentStr, wcStrSize * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(value + 8 + (useBOM ? 2 : 0), (char*)commentStr, wcStrSize * 2);
|
||||||
g_free(commentStr);
|
g_free(commentStr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -236,6 +236,8 @@ public:
|
|||||||
void initLongArray (const char* data, int len);
|
void initLongArray (const char* data, int len);
|
||||||
void initRational (int num, int den);
|
void initRational (int num, int den);
|
||||||
|
|
||||||
|
static void swapByteOrder2 (char *buffer, int count);
|
||||||
|
|
||||||
// get basic tag properties
|
// get basic tag properties
|
||||||
int getID () const
|
int getID () const
|
||||||
{
|
{
|
||||||
|
@@ -452,12 +452,109 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
count = std::min (count, 65535); // limit to 65535 chars to avoid crashes in case of corrupted metadata
|
count = std::min (count, 65535); // limit to 65535 chars to avoid crashes in case of corrupted metadata
|
||||||
char *buffer = new char[count - 7];
|
char *buffer = new char[count - 6]; // include 2 ending null chars for UCS-2 string (possibly)
|
||||||
|
char *value = (char*)t->getValue();
|
||||||
|
|
||||||
if (!memcmp ((char*)t->getValue(), "ASCII\0\0\0", 8)) {
|
if (!memcmp(value, "ASCII\0\0\0", 8)) {
|
||||||
strncpy (buffer, (char*)t->getValue() + 8, count - 8);
|
memcpy(buffer, value + 8, count - 8);
|
||||||
buffer[count - 8] = '\0';
|
buffer[count - 8] = '\0';
|
||||||
|
} else if (!memcmp(value, "UNICODE\0", 8)) {
|
||||||
|
memcpy(buffer, value + 8, count - 8);
|
||||||
|
buffer[count - 7] = buffer[count - 8] = '\0';
|
||||||
|
Glib::ustring tmp1(buffer);
|
||||||
|
|
||||||
|
|
||||||
|
bool hasBOM = false;
|
||||||
|
enum ByteOrder bo = UNKNOWN;
|
||||||
|
if (count % 2 || (count >= 11 && (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF))) {
|
||||||
|
// odd string length can only be UTF-8, don't change anything
|
||||||
|
std::string retVal (buffer + 3);
|
||||||
|
delete [] buffer;
|
||||||
|
return retVal;
|
||||||
|
} else if (count >= 10) {
|
||||||
|
if (buffer[0] == 0xFF && buffer[1] == 0xFE) {
|
||||||
|
bo = INTEL; // little endian
|
||||||
|
hasBOM = true;
|
||||||
|
} else if (buffer[0] == 0xFE && buffer[1] == 0xFF) {
|
||||||
|
bo = MOTOROLA; // big endian
|
||||||
|
hasBOM = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bo == UNKNOWN) {
|
||||||
|
// auto-detecting byte order; we still don't know if it's UCS-2 or UTF-8
|
||||||
|
int a = 0, b = 0, c = 0, d = 0;
|
||||||
|
for (int j = 8; j < count; j++) {
|
||||||
|
char cc = value[j];
|
||||||
|
if (!(j%2)) {
|
||||||
|
// counting zeros for first byte
|
||||||
|
if (!cc) {
|
||||||
|
++a;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// counting zeros for second byte
|
||||||
|
if (!cc) {
|
||||||
|
++b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!(cc & 0x80) || ((cc & 0xC0) == 0xC0) || ((cc & 0xC0) == 0x80)) {
|
||||||
|
++c;
|
||||||
|
}
|
||||||
|
if ((cc & 0xC0) == 0x80) {
|
||||||
|
++d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (c == (count - 8) && d) {
|
||||||
|
// this is an UTF-8 string
|
||||||
|
std::string retVal (buffer);
|
||||||
|
delete [] buffer;
|
||||||
|
return retVal;
|
||||||
|
}
|
||||||
|
if ((a || b) && a != b) {
|
||||||
|
bo = a > b ? MOTOROLA : INTEL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bo == UNKNOWN) {
|
||||||
|
// assuming platform's byte order
|
||||||
|
#if __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
|
||||||
|
bo = INTEL;
|
||||||
|
#else
|
||||||
|
bo = MOTOROLA;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// now swapping if necessary
|
||||||
|
if (!hasBOM && bo != HOSTORDER) {
|
||||||
|
if (t->getOrder() != HOSTORDER) {
|
||||||
|
Tag::swapByteOrder2(buffer, count - 8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
glong written;
|
||||||
|
char* utf8Str = g_utf16_to_utf8((unsigned short int*)buffer, -1, nullptr, &written, nullptr);
|
||||||
|
delete [] buffer;
|
||||||
|
buffer = new char[written + 1];
|
||||||
|
memcpy(buffer, utf8Str, written);
|
||||||
|
buffer[written] = 0;
|
||||||
|
} else if (!memcmp(value, "\0\0\0\0\0\0\0\0", 8)) {
|
||||||
|
// local charset string, whatever it is
|
||||||
|
memcpy(buffer, value + 8, count - 8);
|
||||||
|
buffer[count - 7] = buffer[count - 8] = '\0';
|
||||||
|
|
||||||
|
gsize written = 0;
|
||||||
|
char *utf8Str = g_locale_to_utf8(buffer, count - 8, nullptr, &written, nullptr);
|
||||||
|
if (utf8Str && written) {
|
||||||
|
delete [] buffer;
|
||||||
|
size_t length = strlen(utf8Str);
|
||||||
|
buffer = new char[length + 1];
|
||||||
|
strcpy(buffer, utf8Str);
|
||||||
|
} else {
|
||||||
|
buffer[0] = 0;
|
||||||
|
}
|
||||||
|
if (utf8Str) {
|
||||||
|
g_free(utf8Str);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// JIS: unsupported
|
||||||
buffer[0] = 0;
|
buffer[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -467,11 +564,8 @@ public:
|
|||||||
}
|
}
|
||||||
virtual void fromString (Tag* t, const std::string& value)
|
virtual void fromString (Tag* t, const std::string& value)
|
||||||
{
|
{
|
||||||
char *buffer = new char[t->getCount()];
|
Glib::ustring tmpStr(value);
|
||||||
memcpy (buffer, "ASCII\0\0\0", 8);
|
t->userCommentFromString (tmpStr);
|
||||||
strcpy (buffer + 8, value.c_str());
|
|
||||||
t->fromString (buffer, value.size() + 9);
|
|
||||||
delete [] buffer;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
UserCommentInterpreter userCommentInterpreter;
|
UserCommentInterpreter userCommentInterpreter;
|
||||||
|
Reference in New Issue
Block a user