From 4a06351c3b0745b7f38955e0ba56f612eaa22462 Mon Sep 17 00:00:00 2001 From: elsid Date: Wed, 16 Feb 2022 10:48:49 +0100 Subject: [PATCH] update to_utf8 and translation to make use of new stateless utf8 --- components/to_utf8/to_utf8.cpp | 286 +++++++++++++------------ components/to_utf8/to_utf8.hpp | 63 +++--- components/translation/translation.cpp | 11 +- 3 files changed, 188 insertions(+), 172 deletions(-) diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 04edfda09d..a193e6375d 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -44,55 +45,62 @@ using namespace ToUTF8; -Utf8Encoder::Utf8Encoder(const FromType sourceEncoding): - mOutput(50*1024) +namespace { - switch (sourceEncoding) + std::string_view::iterator skipAscii(std::string_view input) { - case ToUTF8::WINDOWS_1252: - { - translationArray = ToUTF8::windows_1252; - break; - } - case ToUTF8::WINDOWS_1250: - { - translationArray = ToUTF8::windows_1250; - break; - } - case ToUTF8::WINDOWS_1251: - { - translationArray = ToUTF8::windows_1251; - break; - } - case ToUTF8::CP437: - { - translationArray = ToUTF8::cp437; - break; - } + return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; }); + } - default: + std::basic_string_view getTranslationArray(FromType sourceEncoding) + { + switch (sourceEncoding) { - assert(0); + case ToUTF8::WINDOWS_1252: + return ToUTF8::windows_1252; + case ToUTF8::WINDOWS_1250: + return ToUTF8::windows_1250; + case ToUTF8::WINDOWS_1251: + return ToUTF8::windows_1251; + case ToUTF8::CP437: + return ToUTF8::cp437; + } + throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding)); + } + + // Make sure the output vector is large enough for 'size' bytes, + // including a terminating zero after it. + void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) + { + if (buffer.size() >= size) + return; + + switch (bufferAllocationPolicy) + { + case BufferAllocationPolicy::FitToRequiredSize: + buffer.resize(size); + break; + case BufferAllocationPolicy::UseGrowFactor: + // Add some extra padding to reduce the chance of having to resize + // again later. + buffer.resize(3 * size); + // And make sure the string is zero terminated + buffer[size] = 0; + break; } } } -std::string Utf8Encoder::getUtf8(const char* input, size_t size) +StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding) + : mTranslationArray(getTranslationArray(sourceEncoding)) { - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - assert(input[size] == 0); - - std::string inputString(input, size); - std::string output; - toUtf8(inputString, output, size); - - return output; } -void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size) +std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const { + if (input.empty()) + return input; + // Note: The rest of this function is designed for single-character // input encodings only. It also assumes that the input encoding // shares its first 128 values (0-127) with ASCII. There are no plans @@ -101,45 +109,34 @@ void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size) // Compute output length, and check for pure ascii input at the same // time. - bool ascii; - size_t outlen = getLength(input.c_str(), ascii); + const auto [outlen, ascii] = getLength(input); // If we're pure ascii, then don't bother converting anything. if(ascii) - { - std::swap(input, output); - return; - } + return std::string_view(input.data(), outlen); // Make sure the output is large enough - if (output.size() <= outlen) - // Add some extra padding to reduce the chance of having to resize - // again later. - output.resize(3*outlen); - - // And make sure the string is zero terminated - output[outlen] = 0; - char *in = &input[0]; - char *out = &output[0]; + resize(outlen, bufferAllocationPolicy, buffer); + char *out = buffer.data(); // Translate - while (*in) - copyFromArray(*(in++), out); + for (auto it = input.begin(); it != input.end() && *it != 0; ++it) + copyFromArray(*it, out); // Make sure that we wrote the correct number of bytes - assert((out-&output[0]) == (int)outlen); + assert((out - buffer.data()) == (int)outlen); // And make extra sure the output is null terminated - assert(output.size() > outlen); - assert(output[outlen] == 0); + assert(buffer.size() >= outlen); + assert(buffer[outlen] == 0); + + return std::string_view(buffer.data(), outlen); } -std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size) +std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const { - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - assert(input[size] == 0); + if (input.empty()) + return input; // TODO: The rest of this function is designed for single-character // input encodings only. It also assumes that the input the input @@ -149,43 +146,28 @@ std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size) // Compute output length, and check for pure ascii input at the same // time. - bool ascii; - size_t outlen = getLength2(input, ascii); + const auto [outlen, ascii] = getLengthLegacyEnc(input); // If we're pure ascii, then don't bother converting anything. if(ascii) - return std::string(input, outlen); + return std::string_view(input.data(), outlen); // Make sure the output is large enough - resize(outlen); - char *out = &mOutput[0]; + resize(outlen, bufferAllocationPolicy, buffer); + char *out = buffer.data(); // Translate - while(*input) - copyFromArray2(input, out); + for (auto it = input.begin(); it != input.end() && *it != 0;) + copyFromArrayLegacyEnc(it, input.end(), out); // Make sure that we wrote the correct number of bytes - assert((out-&mOutput[0]) == (int)outlen); + assert((out - buffer.data()) == static_cast(outlen)); // And make extra sure the output is null terminated - assert(mOutput.size() > outlen); - assert(mOutput[outlen] == 0); + assert(buffer.size() >= outlen); + assert(buffer[outlen] == 0); - // Return a string - return std::string(&mOutput[0], outlen); -} - -// Make sure the output vector is large enough for 'size' bytes, -// including a terminating zero after it. -void Utf8Encoder::resize(size_t size) -{ - if (mOutput.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - mOutput.resize(3*size); - - // And make sure the string is zero terminated - mOutput[size] = 0; + return std::string_view(buffer.data(), outlen); } /** Get the total length length needed to decode the given string with @@ -198,39 +180,35 @@ void Utf8Encoder::resize(size_t size) is the case, then the ascii parameter is set to true, and the caller can optimize for this case. */ -size_t Utf8Encoder::getLength(const char* input, bool &ascii) const +std::pair StatelessUtf8Encoder::getLength(std::string_view input) const { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - // Do away with the ascii part of the string first (this is almost // always the entire string.) - while (inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); + auto it = skipAscii(input); // If we're not at the null terminator at this point, then there // were some non-ascii characters to deal with. Go to slow-mode for // the rest of the string. - if (inp) + if (it == input.end() || *it == 0) + return {it - input.begin(), true}; + + std::size_t len = it - input.begin(); + + do { - ascii = false; - while (inp) - { - // Find the translated length of this character in the - // lookup table. - len += translationArray[inp*6]; - inp = *(++ptr); - } + // Find the translated length of this character in the + // lookup table. + len += mTranslationArray[static_cast(*it) * 6]; + ++it; } - return len; + while (it != input.end() && *it != 0); + + return {len, false}; } // Translate one character 'ch' using the translation array 'arr', and // advance the output pointer accordingly. -void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const +void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const { // Optimize for ASCII values if (ch < 128) @@ -239,57 +217,58 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const return; } - const signed char *in = translationArray + ch*6; + const signed char *in = &mTranslationArray[ch * 6]; int len = *(in++); memcpy(out, in, len); out += len; } -size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const +std::pair StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - // Do away with the ascii part of the string first (this is almost // always the entire string.) - while (inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); + auto it = skipAscii(input); // If we're not at the null terminator at this point, then there // were some non-ascii characters to deal with. Go to slow-mode for // the rest of the string. - if (inp) - { - ascii = false; - while(inp) - { - len += 1; - // Find the translated length of this character in the - // lookup table. - switch(inp) - { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; - } + if (it == input.end() || *it == 0) + return {it - input.begin(), true}; - inp = *(++ptr); + std::size_t len = it - input.begin(); + std::size_t symbolLen = 0; + + do + { + symbolLen += 1; + // Find the translated length of this character in the + // lookup table. + switch (static_cast(*it)) + { + case 0xe2: symbolLen -= 2; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: symbolLen -= 1; break; + default: + len += symbolLen; + symbolLen = 0; + break; } + + ++it; } - return len; + while (it != input.end() && *it != 0); + + return {len, false}; } -void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const +void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const { unsigned char ch = *(chp++); // Optimize for ASCII values @@ -320,14 +299,21 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const return; } + if (chp == end) + return; + unsigned char ch2 = *(chp++); unsigned char ch3 = '\0'; if (len == 3) + { + if (chp == end) + return; ch3 = *(chp++); + } for (int i = 128; i < 256; i++) { - unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3]; + unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3]; if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) { *(out++) = (char)i; @@ -340,6 +326,22 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const *(out++) = ch; // Could not find glyph, just put whatever } +Utf8Encoder::Utf8Encoder(FromType sourceEncoding) + : mBuffer(50 * 1024, '\0') + , mImpl(sourceEncoding) +{ +} + +std::string_view Utf8Encoder::getUtf8(std::string_view input) +{ + return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer); +} + +std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) +{ + return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer); +} + ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { if (encodingName == "win1250") diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index 23dc09c06e..037e3ea3bf 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace ToUTF8 { @@ -17,41 +18,55 @@ namespace ToUTF8 CP437 // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files. }; + enum class BufferAllocationPolicy + { + FitToRequiredSize, + UseGrowFactor, + }; + FromType calculateEncoding(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName); - // class + class StatelessUtf8Encoder + { + public: + explicit StatelessUtf8Encoder(FromType sourceEncoding); + + /// Convert to UTF8 from the previously given code page. + /// Returns a view to passed buffer that will be resized to fit output if it's too small. + std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const; + + /// Convert from UTF-8 to sourceEncoding. + /// Returns a view to passed buffer that will be resized to fit output if it's too small. + std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const; + + private: + inline std::pair getLength(std::string_view input) const; + inline void copyFromArray(unsigned char chp, char* &out) const; + inline std::pair getLengthLegacyEnc(std::string_view input) const; + inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const; + + const std::basic_string_view mTranslationArray; + }; class Utf8Encoder { public: - Utf8Encoder(FromType sourceEncoding); + explicit Utf8Encoder(FromType sourceEncoding); - // Convert to UTF8 from the previously given code page. - std::string getUtf8(const char *input, size_t size); - inline std::string getUtf8(const std::string &str) - { - return getUtf8(str.c_str(), str.size()); - } + /// Convert to UTF8 from the previously given code page. + /// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not + /// ASCII-only string. Otherwise returns a view to the input. + std::string_view getUtf8(std::string_view input); - // Convert input to UTF8 to the given output string - void toUtf8(std::string& input, std::string& output, size_t size); - - std::string getLegacyEnc(const char *input, size_t size); - inline std::string getLegacyEnc(const std::string &str) - { - return getLegacyEnc(str.c_str(), str.size()); - } + /// Convert from UTF-8 to sourceEncoding. + /// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not + /// ASCII-only string. Otherwise returns a view to the input. + std::string_view getLegacyEnc(std::string_view input); private: - void resize(size_t size); - size_t getLength(const char* input, bool &ascii) const; - void copyFromArray(unsigned char chp, char* &out) const; - size_t getLength2(const char* input, bool &ascii) const; - void copyFromArray2(const char*& chp, char* &out) const; - - std::vector mOutput; - const signed char* translationArray; + std::string mBuffer; + StatelessUtf8Encoder mImpl; }; } diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp index 37068fd70d..ef0f432075 100644 --- a/components/translation/translation.cpp +++ b/components/translation/translation.cpp @@ -53,14 +53,13 @@ namespace Translation if (!line.empty()) { - std::string utf8Line; - mEncoder->toUtf8(line, utf8Line, line.size()); + line = mEncoder->getUtf8(line); - size_t tab_pos = utf8Line.find('\t'); - if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < utf8Line.size() - 1) + size_t tab_pos = line.find('\t'); + if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) { - std::string key = utf8Line.substr(0, tab_pos); - std::string value = utf8Line.substr(tab_pos + 1); + std::string key = line.substr(0, tab_pos); + std::string value = line.substr(tab_pos + 1); if (!key.empty() && !value.empty()) container.insert(std::make_pair(key, value));