Rearrange SqliteExtensions code to better support custom lowercase mappings.

2025-01-30 15:32:37 +00:00 · 2023-06-19 13:41:16 -07:00 · 2023-06-19 13:41:16 -07:00 · a918ce29d6
commit a918ce29d6
parent d723f96360
1 changed files with 103 additions and 25 deletions
--- a/src/musikcore/db/SqliteExtensions.cpp
+++ b/src/musikcore/db/SqliteExtensions.cpp
@ -36,6 +36,7 @@

 #pragma warning(push, 0)
 #include <sqlite/sqlite3.h>
+#include <utf8/utf8.h>
 #include <musikcore/db/SqliteExtensions.h>
 #pragma warning(pop)

@ -646,6 +647,52 @@ u32 sqlite3Utf8Read(
    return it != accentToChar.end() ? it->second : c;
 }

+/* Number of bytes the UTF-8 encoding of code point `c` occupies: 1 below 0x80,
+2 below 0x800, 3 below 0x10000, and 4 for everything else. No range check is
+performed, so any larger value also reports 4. */
+static inline int utf8bytes(u32 c) {
+    return (c < 0x80)    ? 1
+         : (c < 0x800)   ? 2
+         : (c < 0x10000) ? 3
+                         : 4;
+}
+
+/* Encodes code point `c` as UTF-8 into `bytes`. The whole 5-byte buffer is
+zeroed first, so the encoded sequence (at most 4 bytes) is always followed by
+a NUL and can be handed to C string functions. `c` is not validated. */
+static inline void utf8fill(u32 c, uint8_t bytes[5]) {
+    for (int i = 0; i < 5; ++i) {
+        bytes[i] = 0;
+    }
+
+    /* single octet: the code point is stored as-is */
+    if (c < 0x80) {
+        bytes[0] = static_cast<uint8_t>(c);
+        return;
+    }
+
+    /* builds a continuation octet (10xxxxxx) from the six bits of `c` that
+    start at bit position `shift` */
+    const auto continuation = [c](int shift) -> uint8_t {
+        return static_cast<uint8_t>(((c >> shift) & 0x3f) | 0x80);
+    };
+
+    /* two octets: 110xxxxx 10xxxxxx */
+    if (c < 0x800) {
+        bytes[0] = static_cast<uint8_t>((c >> 6) | 0xc0);
+        bytes[1] = continuation(0);
+        return;
+    }
+
+    /* three octets: 1110xxxx 10xxxxxx 10xxxxxx */
+    if (c < 0x10000) {
+        bytes[0] = static_cast<uint8_t>((c >> 12) | 0xe0);
+        bytes[1] = continuation(6);
+        bytes[2] = continuation(0);
+        return;
+    }
+
+    /* four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    bytes[0] = static_cast<uint8_t>((c >> 18) | 0xf0);
+    bytes[1] = continuation(12);
+    bytes[2] = continuation(6);
+    bytes[3] = continuation(0);
+}
+
+/* UTF-8 aware analogue of strcspn(): returns the byte offset into `str1` of
+the earliest occurrence of any character contained in `str2`, or the byte
+length of `str1` if none of them occurs.
+
+`str2` is walked one code point at a time with utf8::unchecked::next (which,
+as the name suggests, presumably performs no validation -- `str2` is assumed
+to be well-formed UTF-8). Each code point is re-encoded with utf8fill() into a
+NUL-terminated byte sequence and located in `str1` with strstr().
+
+The smallest offset over ALL needles is returned; stopping at the first
+needle that happens to match would report a position past an earlier match
+of a different needle, which is not what strcspn() semantics promise. */
+static int utf8strcspn(const char* str1, const char* str2) {
+    const int str1len = static_cast<int>(strlen(str1));
+    const char* const needlesEnd = str2 + strlen(str2);
+    const char* it = str2;
+    int earliest = str1len;
+    uint8_t bytes[5];
+    /* an offset of 0 cannot be improved on, so stop scanning once we hit it */
+    while (it < needlesEnd && earliest > 0) {
+        const u32 c = utf8::unchecked::next(it);
+        utf8fill(c, bytes);
+        const char* foundPos = strstr(str1, reinterpret_cast<const char*>(bytes));
+        if (foundPos != nullptr) {
+            const int offset = static_cast<int>(foundPos - str1);
+            if (offset < earliest) {
+                earliest = offset;
+            }
+        }
+    }
+    return earliest;
+}
+
 /*
 ** Compare two UTF-8 strings for equality where the first string is
 ** a GLOB or LIKE expression.  Return values:
@ -736,10 +783,14 @@ static int patternCompare(
            ** c but in the other case and search the input string for either
            ** c or cx.
            */
-            if (c <= 0x80) {
            std::unordered_map<u32, const char*>& charToAccentsMap = charToAccentsInsensitive;
            char zStop[3];
            int bMatch;
+            bool bIsAscii = (c <= 0x80);
+            u32 nByteCount = utf8bytes(c);
+
+            if (bIsAscii || noCase) {
+                if (bIsAscii) {
                    if (noCase) {
                        zStop[0] = sqlite3Toupper(c);
                        zStop[1] = sqlite3Tolower(c);
@ -750,10 +801,20 @@ static int patternCompare(
                        zStop[0] = c;
                        zStop[1] = 0;
                    }
+                }
                while (1) {
+                    auto it = charToAccentsMap.find(c);
+                    /* if we are doing a case insensitive search on a non-ascii character (utf8), then
+                    use our special utf8strcspn to search */
+                    if (!bIsAscii && noCase) {
+                        if (it != charToAccentsMap.end()) {
+                            zString += utf8strcspn((const char*)zString, it->second);
+                        }
+                    }
+                    /* otherwise, we're searching for a regular ascii character... */
+                    else {
                        /* see if we have a mapping from input character to a list of their
                        respective accented characters; if we do, use that mapping */
-                    auto it = charToAccentsMap.find(c);
                        if (it != charToAccentsMap.end()) {
                            const char* stop = it->second;
                            zString += strcspn((const char*)zString, it->second);
@ -763,13 +824,17 @@ static int patternCompare(
                        else {
                            zString += strcspn((const char*)zString, zStop);
                        }
+                    }
                    if (zString[0] == 0) break;
-                    zString++;
+                    zString += nByteCount;
                    bMatch = patternCompare(zPattern, zString, pInfo, matchOther);
                    if (bMatch != SQLITE_NOMATCH) return bMatch;
                }
            }
-            else {
+
+            /* if we fall through to here, and the character we're looking for is a multi-byte
+            utf8 sequence, go ahead and do a regular pattern compare. */
+            if (!bIsAscii) {
                int bMatch;
                while ((c2 = Utf8Read(zString)) != 0) {
                    if (c2 != c) continue;
@ -777,6 +842,7 @@ static int patternCompare(
                    if (bMatch != SQLITE_NOMATCH) return bMatch;
                }
            }
+
            return SQLITE_NOWILDCARDMATCH;
        }
        if (c == matchOther) {
@ -822,9 +888,21 @@ static int patternCompare(
        }
        c2 = Utf8Read(zString);
        if (c == c2) continue;
-        if (noCase && sqlite3Tolower(c) == sqlite3Tolower(c2) && c < 0x80 && c2 < 0x80) {
+        if (noCase) {
+            /* standard sqlite check */
+            if (sqlite3Tolower(c) == sqlite3Tolower(c2) && c < 0x80 && c2 < 0x80) {
                continue;
            }
+            /* check against our internal mappings */
+            auto it = charToAccentsInsensitive.find(c);
+            if (it != charToAccentsInsensitive.end()) {
+                uint8_t bytes[5];
+                utf8fill(c2, bytes);
+                if (utf8strcspn((const char*) bytes, it->second) < strlen((const char*) bytes)) {
+                    continue;
+                }
+            }
+        }
        if (c == matchOne && zPattern != zEscaped && c2 != 0) continue;
        return SQLITE_NOMATCH;
    }