Better UNICODE support (UCS-2/UTF-8) for Exif.UserComment (#2017)

- The BOM is now checked and correctly handled.
- Auto-detection of a UTF-8 string if no BOM is available; otherwise the string is assumed to be UCS-2/UTF-16.
- Try to auto-detect the endianness of a UTF-16 string by counting zeros.
- Possibility to enable writing a BOM for this field (disabled for now).
- For an undefined charset (empty identifier), RT now assumes that the string is in what glib thinks is the local charset (not tested), and tries to convert it to UTF-8 for display/editing.

JIS is still not handled, though.
2018-01-01 14:24:47 +01:00
parent 96863bb956
commit 9d2ccc19d3
3 changed files with 140 additions and 20 deletions
--- a/rtexif/rtexif.cc
+++ b/rtexif/rtexif.cc
@@ -1928,24 +1928,48 @@ void Tag::initInt (int data, TagType t, int cnt)
    setInt (data, 0, t);
 }

+void Tag::swapByteOrder2(char *buffer, int count)
+{
+    char* ptr = buffer;
+    for (int i = 0; i < count; i+=2) {
+        unsigned char c = ptr[0];
+        ptr[0] = ptr[1];
+        ptr[1] = c;
+        ptr += 2;
+    }
+}
 void Tag::initUserComment (const Glib::ustring &text)
 {
+    const bool useBOM = false; // set it to true if you want to output BOM in UCS-2/UTF-8 UserComments ; this could be turned to an options entry
    type = UNDEFINED;
    if (text.is_ascii()) {
-        count = 8 + strlen (text.c_str());
-        valuesize = count;
+        valuesize = count = 8 + strlen (text.c_str());
        value = new unsigned char[valuesize];
-        strcpy ((char*)value, "ASCII");
-        value[5] = value[6] = value[7] = 0;
-        strcpy ((char*)value + 8, text.c_str());
+        memcpy(value, "ASCII\0\0\0", 8);
+        memcpy(value + 8, text.c_str(), valuesize - 8);
    } else {
-        wchar_t *commentStr = (wchar_t*)g_utf8_to_utf16 (text.c_str(), -1, NULL, NULL, NULL);
-        count = 8 + wcslen(commentStr)*2;
-        valuesize = count;
-        value = (unsigned char*)new char[valuesize];
-        strcpy ((char*)value, "UNICODE");
-        value[7] = 0;
-        wcscpy(((wchar_t*)value) + 4, commentStr);
+        wchar_t *commentStr = (wchar_t*)g_utf8_to_utf16 (text.c_str(), -1, nullptr, nullptr, nullptr);
+        size_t wcStrSize = wcslen(commentStr);
+        valuesize = count = wcStrSize * 2 + 8 + (useBOM ? 2 : 0);
+        value = new unsigned char[valuesize];
+        memcpy(value, "UNICODE\0", 8);
+
+        if (useBOM) {
+            if (getOrder() == INTEL) { //Little Endian
+                value[8] = 0xFF;
+                value[9] = 0xFE;
+            } else {
+                value[8] = 0xFE;
+                value[9] = 0xFF;
+            }
+        }
+
+        // Swapping byte order to match the Exif's byte order
+        if (getOrder() != HOSTORDER) {
+            swapByteOrder2((char*)commentStr, wcStrSize * 2);
+        }
+
+        memcpy(value + 8 + (useBOM ? 2 : 0), (char*)commentStr, wcStrSize * 2);
        g_free(commentStr);
    }
 }
--- a/rtexif/rtexif.h
+++ b/rtexif/rtexif.h
@@ -236,6 +236,8 @@ public:
    void initLongArray   (const char* data, int len);
    void initRational    (int num, int den);

+    static void swapByteOrder2 (char *buffer, int count);
+
    // get basic tag properties
    int                  getID          () const
    {
--- a/rtexif/stdattribs.cc
+++ b/rtexif/stdattribs.cc
@@ -452,12 +452,109 @@ public:
        }

        count = std::min (count, 65535); // limit to 65535 chars to avoid crashes in case of corrupted metadata
-        char *buffer = new char[count - 7];
+        char *buffer = new char[count - 6]; // include 2 ending null chars for UCS-2 string (possibly)
+        char *value = (char*)t->getValue();

-        if (!memcmp ((char*)t->getValue(), "ASCII\0\0\0", 8)) {
-            strncpy (buffer, (char*)t->getValue() + 8, count - 8);
+        if (!memcmp(value, "ASCII\0\0\0", 8)) {
+            memcpy(buffer, value + 8, count - 8);
            buffer[count - 8] = '\0';
+        } else if (!memcmp(value, "UNICODE\0", 8)) {
+            memcpy(buffer, value + 8, count - 8);
+            buffer[count - 7] = buffer[count - 8] = '\0';
+            Glib::ustring tmp1(buffer);
+
+
+            bool hasBOM = false;
+            enum ByteOrder bo = UNKNOWN;
+            if (count % 2 || (count >= 11 && (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF))) {
+                // odd string length can only be UTF-8, don't change anything
+                std::string retVal (buffer + 3);
+                delete [] buffer;
+                return retVal;
+            } else if (count >= 10) {
+                if (buffer[0] == 0xFF && buffer[1] == 0xFE) {
+                    bo = INTEL; // little endian
+                    hasBOM = true;
+                } else if (buffer[0] == 0xFE && buffer[1] == 0xFF) {
+                    bo = MOTOROLA; // big endian
+                    hasBOM = true;
+                }
+            }
+            if (bo == UNKNOWN) {
+                // auto-detecting byte order; we still don't know if it's UCS-2 or UTF-8
+                int a = 0, b = 0, c = 0, d = 0;
+                for (int j = 8; j < count; j++) {
+                    char cc = value[j];
+                    if (!(j%2)) {
+                        // counting zeros for first byte
+                        if (!cc) {
+                            ++a;
+                        }
+                    } else {
+                        // counting zeros for second byte
+                        if (!cc) {
+                            ++b;
+                        }
+                    }
+                    if (!(cc & 0x80) || ((cc & 0xC0) == 0xC0) || ((cc & 0xC0) == 0x80)) {
+                        ++c;
+                    }
+                    if ((cc & 0xC0) == 0x80) {
+                        ++d;
+                    }
+                }
+                if (c == (count - 8) && d) {
+                    // this is an UTF-8 string
+                    std::string retVal (buffer);
+                    delete [] buffer;
+                    return retVal;
+                }
+                if ((a || b) && a != b) {
+                    bo = a > b ? MOTOROLA : INTEL;
+                }
+            }
+            if (bo == UNKNOWN) {
+                // assuming platform's byte order
+#if __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
+                bo = INTEL;
+#else
+                bo = MOTOROLA;
+#endif
+            }
+
+            // now swapping if necessary
+            if (!hasBOM && bo != HOSTORDER) {
+                if (t->getOrder() != HOSTORDER) {
+                    Tag::swapByteOrder2(buffer, count - 8);
+                }
+            }
+
+            glong written;
+            char* utf8Str = g_utf16_to_utf8((unsigned short int*)buffer, -1, nullptr, &written, nullptr);
+            delete [] buffer;
+            buffer = new char[written + 1];
+            memcpy(buffer, utf8Str, written);
+            buffer[written] = 0;
+        } else if (!memcmp(value, "\0\0\0\0\0\0\0\0", 8)) {
+            // local charset string, whatever it is
+            memcpy(buffer, value + 8, count - 8);
+            buffer[count - 7] = buffer[count - 8] = '\0';
+
+            gsize written = 0;
+            char *utf8Str = g_locale_to_utf8(buffer, count - 8, nullptr, &written, nullptr);
+            if (utf8Str && written) {
+                delete [] buffer;
+                size_t length = strlen(utf8Str);
+                buffer = new char[length + 1];
+                strcpy(buffer, utf8Str);
+            } else {
+                buffer[0] = 0;
+            }
+            if (utf8Str) {
+                g_free(utf8Str);
+            }
        } else {
+            // JIS: unsupported
            buffer[0] = 0;
        }

@@ -467,11 +564,8 @@ public:
    }
    virtual void fromString (Tag* t, const std::string& value)
    {
-        char *buffer = new char[t->getCount()];
-        memcpy (buffer, "ASCII\0\0\0", 8);
-        strcpy (buffer + 8, value.c_str());
-        t->fromString (buffer, value.size() + 9);
-        delete [] buffer;
+        Glib::ustring tmpStr(value);
+        t->userCommentFromString (tmpStr);
    }
 };
 UserCommentInterpreter userCommentInterpreter;