From 78b3f7288a9ddcd924b06dc2635b7e13ef038984 Mon Sep 17 00:00:00 2001
From: elsid
Date: Thu, 1 Jun 2023 11:35:34 +0200
Subject: [PATCH] Support UTF-8 by StringRefId::toDebugString

---
Review notes (text between "---" and the diffstat is not part of the commit):

* StringRefId::toDebugString now builds the quoted debug form directly:
  printable ASCII and well-formed UTF-8 sequences are copied through, any
  other byte is written as a lowercase "\x<hex>" escape by addHex().
* Fixed relative to the reviewed revision: the loop that escapes a malformed
  sequence used `ptr != std::min(end, ptr + octets)` as its condition. The
  bound moved together with ptr, so the loop only stopped at the end of the
  string and escaped all valid text following the first bad byte. The stop
  position is now computed once before the loop, and the byte count (not the
  pointer) is clamped so no pointer past `end` is ever formed.
* Added a test case with valid text after invalid bytes, and <algorithm> for
  std::min.
* The copy under review had its line breaks collapsed and all <...> contents
  removed. Template arguments and system header names were restored from
  usage and hunk line counts; indentation, the three context includes of
  utf8stream.hpp, the author e-mail and the post-image blob ids on the
  "index" lines should be verified against the tree before applying.

 apps/openmw_test_suite/esm/testrefid.cpp |  8 ++-
 components/esm/stringrefid.cpp           | 66 ++++++++++++++++++++----
 components/misc/utf8stream.hpp           | 18 ++++++--
 3 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/apps/openmw_test_suite/esm/testrefid.cpp b/apps/openmw_test_suite/esm/testrefid.cpp
index 15ec9e1766..554964c425 100644
--- a/apps/openmw_test_suite/esm/testrefid.cpp
+++ b/apps/openmw_test_suite/esm/testrefid.cpp
@@ -287,7 +287,13 @@ namespace ESM
             { RefId(), "Empty{}" },
             { RefId::stringRefId("foo"), "\"foo\"" },
             { RefId::stringRefId("BAR"), "\"BAR\"" },
-            { RefId::stringRefId(std::string({ 'a', 0, -1, '\n', '\t' })), "\"a\\x0\\xFF\\xA\\x9\"" },
+            { RefId::stringRefId(std::string({ 'a', 0, -1, '\n', '\t' })), "\"a\\x0\\xff\\xa\\x9\"" },
+            { RefId::stringRefId("Логово дракона"), "\"Логово дракона\"" },
+            { RefId::stringRefId("\xd0\x9b"), "\"Л\"" },
+            { RefId::stringRefId("\xff\x9b"), "\"\\xff\\x9b\"" },
+            { RefId::stringRefId("\xd0\xd0"), "\"\\xd0\\xd0\"" },
+            // Invalid bytes must not force escaping of the valid text that follows them.
+            { RefId::stringRefId("\xd0\xd0 foo"), "\"\\xd0\\xd0 foo\"" },
             { RefId::formIdRefId({ .mIndex = 42, .mContentFile = 0 }), "FormId:0x2a" },
             { RefId::formIdRefId({ .mIndex = 0xffffff, .mContentFile = std::numeric_limits<std::int32_t>::min() }),
                 "FormId:0xff80000000ffffff" },
diff --git a/components/esm/stringrefid.cpp b/components/esm/stringrefid.cpp
index ad7d35903e..2572161853 100644
--- a/components/esm/stringrefid.cpp
+++ b/components/esm/stringrefid.cpp
@@ -1,13 +1,18 @@
 #include "stringrefid.hpp"
+#include "serializerefid.hpp"
 
+#include <algorithm>
+#include <charconv>
 #include <iomanip>
 #include <mutex>
 #include <ostream>
 #include <sstream>
+#include <system_error>
 #include <unordered_set>
 
 #include "components/misc/guarded.hpp"
 #include "components/misc/strings/algorithm.hpp"
+#include "components/misc/utf8stream.hpp"
 
 namespace ESM
 {
@@ -26,6 +31,19 @@ namespace ESM
                 it = locked->emplace(id).first;
             return &*it;
         }
+
+        // Appends value to result as "\x" followed by its lowercase hex digits (std::to_chars, no zero padding).
+        void addHex(unsigned char value, std::string& result)
+        {
+            const std::size_t size = 2 + getHexIntegralSize(value);
+            const std::size_t shift = result.size();
+            result.resize(shift + size);
+            result[shift] = '\\';
+            result[shift + 1] = 'x';
+            const auto [end, ec] = std::to_chars(result.data() + shift + 2, result.data() + result.size(), value, 16);
+            if (ec != std::errc())
+                throw std::system_error(std::make_error_code(ec));
+        }
     }
 
     StringRefId::StringRefId()
@@ -60,20 +78,48 @@ namespace ESM
 
     std::ostream& operator<<(std::ostream& stream, StringRefId value)
     {
-        stream << '"';
-        for (char c : *value.mValue)
-            if (std::isprint(c) && c != '\t' && c != '\n' && c != '\r')
-                stream << c;
-            else
-                stream << "\\x" << std::hex << std::uppercase << static_cast<unsigned>(static_cast<unsigned char>(c));
-        return stream << '"';
+        return stream << value.toDebugString();
     }
 
     std::string StringRefId::toDebugString() const
     {
-        std::ostringstream stream;
-        stream << *this;
-        return stream.str();
+        std::string result;
+        result.reserve(2 + mValue->size());
+        result.push_back('"');
+        const unsigned char* ptr = reinterpret_cast<const unsigned char*>(mValue->data());
+        const unsigned char* const end = reinterpret_cast<const unsigned char*>(mValue->data() + mValue->size());
+        while (ptr != end)
+        {
+            if (Utf8Stream::isAscii(*ptr))
+            {
+                if (std::isprint(*ptr) && *ptr != '\t' && *ptr != '\n' && *ptr != '\r')
+                    result.push_back(*ptr);
+                else
+                    addHex(*ptr, result);
+                ++ptr;
+                continue;
+            }
+            const auto [octets, first] = Utf8Stream::getOctetCount(*ptr);
+            const auto [chr, next] = Utf8Stream::decode(ptr + 1, end, first, octets);
+            if (chr == Utf8Stream::sBadChar())
+            {
+                // Pin the stop position before the loop: a bound derived from the moving ptr is only
+                // reached at the end of the string, which would escape all valid text after a bad byte.
+                // Clamping the count instead of the pointer also avoids forming ptr + octets past end.
+                const std::size_t left = static_cast<std::size_t>(end - ptr);
+                const unsigned char* const stop = ptr + std::min(octets, left);
+                while (ptr != stop)
+                {
+                    addHex(*ptr, result);
+                    ++ptr;
+                }
+                continue;
+            }
+            result.append(ptr, next);
+            ptr = next;
+        }
+        result.push_back('"');
+        return result;
     }
 
     bool StringRefId::startsWith(std::string_view prefix) const
diff --git a/components/misc/utf8stream.hpp b/components/misc/utf8stream.hpp
index a0c26ed762..271376834d 100644
--- a/components/misc/utf8stream.hpp
+++ b/components/misc/utf8stream.hpp
@@ -1,6 +1,7 @@
 #ifndef MISC_UTF8ITER_HPP
 #define MISC_UTF8ITER_HPP
 
+#include <cstddef>
 #include <cstring>
 #include <string>
 #include <string_view>
@@ -63,9 +64,11 @@ public:
         return val;
     }
 
+    static bool isAscii(unsigned char value) { return (value & 0x80) == 0; }
+
     static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
     {
-        if ((*cur & 0x80) == 0)
+        if (isAscii(*cur))
         {
             UnicodeChar chr = *cur++;
 
@@ -75,8 +78,13 @@ public:
         int octets;
         UnicodeChar chr;
 
-        std::tie(octets, chr) = octet_count(*cur++);
+        std::tie(octets, chr) = getOctetCount(*cur++);
 
+        return decode(cur, end, chr, octets);
+    }
+
+    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
+    {
         if (octets > 5)
             return std::make_pair(sBadChar(), cur);
 
@@ -161,10 +169,9 @@ public:
         return out;
     }
 
-private:
-    static std::pair<int, UnicodeChar> octet_count(unsigned char octet)
+    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
     {
-        int octets;
+        std::size_t octets;
 
         unsigned char mark = 0xC0;
         unsigned char mask = 0xE0;
@@ -181,6 +188,7 @@ private:
         return std::make_pair(octets, octet & ~mask);
     }
 
+private:
     void next() { std::tie(val, nxt) = decode(nxt, end); }
 
     Point cur;