From 4a06351c3b0745b7f38955e0ba56f612eaa22462 Mon Sep 17 00:00:00 2001
From: elsid <elsid.mail@gmail.com>
Date: Wed, 16 Feb 2022 10:48:49 +0100
Subject: [PATCH] update to_utf8 and translation to make use of new stateless
 utf8

---
 components/to_utf8/to_utf8.cpp         | 286 +++++++++++++------------
 components/to_utf8/to_utf8.hpp         |  63 +++---
 components/translation/translation.cpp |  11 +-
 3 files changed, 188 insertions(+), 172 deletions(-)
diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp
index 04edfda09d..a193e6375d 100644
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@@ -3,6 +3,7 @@
 #include <vector>
 #include <cassert>
 #include <stdexcept>
+#include <algorithm>
 
 #include <components/debug/debuglog.hpp>
 
@@ -44,55 +45,62 @@
 
 using namespace ToUTF8;
 
-Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
-    mOutput(50*1024)
+namespace
 {
-    switch (sourceEncoding)
+    std::string_view::iterator skipAscii(std::string_view input)
     {
-        case ToUTF8::WINDOWS_1252:
-        {
-            translationArray = ToUTF8::windows_1252;
-            break;
-        }
-        case ToUTF8::WINDOWS_1250:
-        {
-            translationArray = ToUTF8::windows_1250;
-            break;
-        }
-        case ToUTF8::WINDOWS_1251:
-        {
-            translationArray = ToUTF8::windows_1251;
-            break;
-        }
-        case ToUTF8::CP437:
-        {
-            translationArray = ToUTF8::cp437;
-            break;
-        }
+        return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
+    }
 
-        default:
+    std::basic_string_view<signed char> getTranslationArray(FromType sourceEncoding)
+    {
+        switch (sourceEncoding)
         {
-            assert(0);
+            case ToUTF8::WINDOWS_1252:
+                return ToUTF8::windows_1252;
+            case ToUTF8::WINDOWS_1250:
+                return ToUTF8::windows_1250;
+            case ToUTF8::WINDOWS_1251:
+                return ToUTF8::windows_1251;
+            case ToUTF8::CP437:
+                return ToUTF8::cp437;
+        }
+        throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding));
+    }
+
+    // Make sure the output buffer can hold 'size' bytes and that buffer[size] is
+    // zero, even when a longer buffer still holds data of a previous conversion.
+    void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer)
+    {
+        if (buffer.size() > size)
+        {
+            buffer[size] = 0;
+            return;
+        }
+        switch (bufferAllocationPolicy)
+        {
+            case BufferAllocationPolicy::FitToRequiredSize:
+                buffer.resize(size);
+                break;
+            case BufferAllocationPolicy::UseGrowFactor:
+                // Add extra padding to reduce the chance of having to resize again later.
+                buffer.resize(3 * size);
+                buffer[size] = 0;
+                break;
         }
     }
 }
 
-std::string Utf8Encoder::getUtf8(const char* input, size_t size)
+StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding)
+    : mTranslationArray(getTranslationArray(sourceEncoding))
 {
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    assert(input[size] == 0);
-
-    std::string inputString(input, size);
-    std::string output;
-    toUtf8(inputString, output, size);
-
-    return output;
 }
 
-void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size)
+std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
 {
+    if (input.empty())
+        return input;
+
     // Note: The rest of this function is designed for single-character
     // input encodings only. It also assumes that the input encoding
     // shares its first 128 values (0-127) with ASCII. There are no plans
@@ -101,45 +109,34 @@ void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size)
 
     // Compute output length, and check for pure ascii input at the same
     // time.
-    bool ascii;
-    size_t outlen = getLength(input.c_str(), ascii);
+    const auto [outlen, ascii] = getLength(input);
 
     // If we're pure ascii, then don't bother converting anything.
     if(ascii)
-    {
-        std::swap(input, output);
-        return;
-    }
+        return std::string_view(input.data(), outlen);
 
     // Make sure the output is large enough
-    if (output.size() <= outlen)
-        // Add some extra padding to reduce the chance of having to resize
-        // again later.
-        output.resize(3*outlen);
-
-    // And make sure the string is zero terminated
-    output[outlen] = 0;
-    char *in = &input[0];
-    char *out = &output[0];
+    resize(outlen, bufferAllocationPolicy, buffer);
+    char *out = buffer.data();
 
     // Translate
-    while (*in)
-        copyFromArray(*(in++), out);
+    for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
+        copyFromArray(*it, out);
 
     // Make sure that we wrote the correct number of bytes
-    assert((out-&output[0]) == (int)outlen);
+    assert((out - buffer.data()) == (int)outlen);
 
     // And make extra sure the output is null terminated
-    assert(output.size() > outlen);
-    assert(output[outlen] == 0);
+    assert(buffer.size() >= outlen);
+    assert(buffer[outlen] == 0);
+
+    return std::string_view(buffer.data(), outlen);
 }
 
-std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
+std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
 {
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    assert(input[size] == 0);
+    if (input.empty())
+        return input;
 
     // TODO: The rest of this function is designed for single-character
     // input encodings only. It also assumes that the input the input
@@ -149,43 +146,28 @@ std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
 
     // Compute output length, and check for pure ascii input at the same
     // time.
-    bool ascii;
-    size_t outlen = getLength2(input, ascii);
+    const auto [outlen, ascii] = getLengthLegacyEnc(input);
 
     // If we're pure ascii, then don't bother converting anything.
     if(ascii)
-        return std::string(input, outlen);
+        return std::string_view(input.data(), outlen);
 
     // Make sure the output is large enough
-    resize(outlen);
-    char *out = &mOutput[0];
+    resize(outlen, bufferAllocationPolicy, buffer);
+    char *out = buffer.data();
 
     // Translate
-    while(*input)
-        copyFromArray2(input, out);
+    for (auto it = input.begin(); it != input.end() && *it != 0;)
+        copyFromArrayLegacyEnc(it, input.end(), out);
 
     // Make sure that we wrote the correct number of bytes
-    assert((out-&mOutput[0]) == (int)outlen);
+    assert((out - buffer.data()) == static_cast<int>(outlen));
 
     // And make extra sure the output is null terminated
-    assert(mOutput.size() > outlen);
-    assert(mOutput[outlen] == 0);
+    assert(buffer.size() >= outlen);
+    assert(buffer[outlen] == 0);
 
-    // Return a string
-    return std::string(&mOutput[0], outlen);
-}
-
-// Make sure the output vector is large enough for 'size' bytes,
-// including a terminating zero after it.
-void Utf8Encoder::resize(size_t size)
-{
-    if (mOutput.size() <= size)
-        // Add some extra padding to reduce the chance of having to resize
-        // again later.
-        mOutput.resize(3*size);
-
-    // And make sure the string is zero terminated
-    mOutput[size] = 0;
+    return std::string_view(buffer.data(), outlen);
 }
 
 /** Get the total length length needed to decode the given string with
@@ -198,39 +180,35 @@ void Utf8Encoder::resize(size_t size)
   is the case, then the ascii parameter is set to true, and the
   caller can optimize for this case.
  */
-size_t Utf8Encoder::getLength(const char* input, bool &ascii) const
+std::pair<std::size_t, bool> StatelessUtf8Encoder::getLength(std::string_view input) const
 {
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
     // Do away with the ascii part of the string first (this is almost
     // always the entire string.)
-    while (inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
+    auto it = skipAscii(input);
 
     // If we're not at the null terminator at this point, then there
     // were some non-ascii characters to deal with. Go to slow-mode for
     // the rest of the string.
-    if (inp)
+    if (it == input.end() || *it == 0)
+        return {it - input.begin(), true};
+
+    std::size_t len = it - input.begin();
+
+    do
     {
-        ascii = false;
-        while (inp)
-        {
-            // Find the translated length of this character in the
-            // lookup table.
-            len += translationArray[inp*6];
-            inp = *(++ptr);
-        }
+        // Find the translated length of this character in the
+        // lookup table.
+        len += mTranslationArray[static_cast<unsigned char>(*it) * 6];
+        ++it;
     }
-    return len;
+    while (it != input.end() && *it != 0);
+
+    return {len, false};
 }
 
 // Translate one character 'ch' using the translation array 'arr', and
 // advance the output pointer accordingly.
-void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
+void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const
 {
     // Optimize for ASCII values
     if (ch < 128)
@@ -239,57 +217,58 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
         return;
     }
 
-    const signed char *in = translationArray + ch*6;
+    const signed char *in = &mTranslationArray[ch * 6];
     int len = *(in++);
     memcpy(out, in, len);
     out += len;
 }
 
-size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const
+std::pair<std::size_t, bool> StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const
 {
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
     // Do away with the ascii part of the string first (this is almost
     // always the entire string.)
-    while (inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
+    auto it = skipAscii(input);
 
     // If we're not at the null terminator at this point, then there
     // were some non-ascii characters to deal with. Go to slow-mode for
     // the rest of the string.
-    if (inp)
-    {
-        ascii = false;
-        while(inp)
-        {
-            len += 1;
-            // Find the translated length of this character in the
-            // lookup table.
-            switch(inp)
-            {
-                case 0xe2: len -= 2; break;
-                case 0xc2:
-                case 0xcb:
-                case 0xc4:
-                case 0xc6:
-                case 0xc3:
-                case 0xd0:
-                case 0xd1:
-                case 0xd2:
-                case 0xc5: len -= 1; break;
-            }
+    if (it == input.end() || *it == 0)
+        return {it - input.begin(), true};
 
-            inp = *(++ptr);
+    std::size_t len = it - input.begin();
+    std::size_t symbolLen = 0;
+
+    do
+    {
+        symbolLen += 1;
+        // Find the translated length of this character in the
+        // lookup table.
+        switch (static_cast<unsigned char>(*it))
+        {
+            case 0xe2: symbolLen -= 2; break;
+            case 0xc2:
+            case 0xcb:
+            case 0xc4:
+            case 0xc6:
+            case 0xc3:
+            case 0xd0:
+            case 0xd1:
+            case 0xd2:
+            case 0xc5: symbolLen -= 1; break;
+            default:
+                len += symbolLen;
+                symbolLen = 0;
+                break;
         }
+
+        ++it;
     }
-    return len;
+    while (it != input.end() && *it != 0);
+
+    return {len, false};
 }
 
-void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
+void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
 {
     unsigned char ch = *(chp++);
     // Optimize for ASCII values
@@ -320,14 +299,21 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
         return;
     }
 
+    if (chp == end)
+        return;
+
     unsigned char ch2 = *(chp++);
     unsigned char ch3 = '\0';
     if (len == 3)
+    {
+        if (chp == end)
+            return;
         ch3 = *(chp++);
+    }
 
     for (int i = 128; i < 256; i++)
     {
-        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
+        unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3];
         if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
         {
             *(out++) = (char)i;
@@ -340,6 +326,22 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
     *(out++) = ch; // Could not find glyph, just put whatever
 }
 
+Utf8Encoder::Utf8Encoder(FromType sourceEncoding)
+    : mBuffer(50 * 1024, '\0')
+    , mImpl(sourceEncoding)
+{
+}
+
+std::string_view Utf8Encoder::getUtf8(std::string_view input)
+{
+    return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
+}
+
+std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
+{
+    return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
+}
+
 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
 {
     if (encodingName == "win1250")
diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp
index 23dc09c06e..037e3ea3bf 100644
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@@ -4,6 +4,7 @@
 #include <string>
 #include <cstring>
 #include <vector>
+#include <string_view>
 
 namespace ToUTF8
 {
@@ -17,41 +18,55 @@ namespace ToUTF8
         CP437           // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
     };
 
+    enum class BufferAllocationPolicy
+    {
+        FitToRequiredSize,
+        UseGrowFactor,
+    };
+
     FromType calculateEncoding(const std::string& encodingName);
     std::string encodingUsingMessage(const std::string& encodingName);
 
-    // class
+    class StatelessUtf8Encoder
+    {
+        public:
+            explicit StatelessUtf8Encoder(FromType sourceEncoding);
+
+            /// Convert to UTF-8 from the code page given to the constructor.
+            /// Returns a view to the input if it is empty or ASCII-only, otherwise a view to the passed buffer, which is resized if too small.
+            std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
+
+            /// Convert from UTF-8 to the source encoding given to the constructor.
+            /// Returns a view to the input if it is empty or ASCII-only, otherwise a view to the passed buffer, which is resized if too small.
+            std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
+
+        private:
+            inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
+            inline void copyFromArray(unsigned char chp, char* &out) const;
+            inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
+            inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
+
+            const std::basic_string_view<signed char> mTranslationArray;
+    };
 
     class Utf8Encoder
     {
         public:
-            Utf8Encoder(FromType sourceEncoding);
+            explicit Utf8Encoder(FromType sourceEncoding);
 
-            // Convert to UTF8 from the previously given code page.
-            std::string getUtf8(const char *input, size_t size);
-            inline std::string getUtf8(const std::string &str)
-            {
-                return getUtf8(str.c_str(), str.size());
-            }
+            /// Convert to UTF-8 from the code page given to the constructor.
+            /// If the input is not an ASCII-only string, returns a view to an internal buffer that is invalidated
+            /// by the next getUtf8 or getLegacyEnc call. Otherwise returns a view to the input.
+            std::string_view getUtf8(std::string_view input);
 
-            // Convert input to UTF8 to the given output string
-            void toUtf8(std::string& input, std::string& output, size_t size);
-
-            std::string getLegacyEnc(const char *input, size_t size);
-            inline std::string getLegacyEnc(const std::string &str)
-            {
-                return getLegacyEnc(str.c_str(), str.size());
-            }
+            /// Convert from UTF-8 to the source encoding given to the constructor.
+            /// If the input is not an ASCII-only string, returns a view to an internal buffer that is invalidated
+            /// by the next getUtf8 or getLegacyEnc call. Otherwise returns a view to the input.
+            std::string_view getLegacyEnc(std::string_view input);
 
         private:
-            void resize(size_t size);
-            size_t getLength(const char* input, bool &ascii) const;
-            void copyFromArray(unsigned char chp, char* &out) const;
-            size_t getLength2(const char* input, bool &ascii) const;
-            void copyFromArray2(const char*& chp, char* &out) const;
-
-            std::vector<char> mOutput;
-            const signed char* translationArray;
+            std::string mBuffer;
+            StatelessUtf8Encoder mImpl;
     };
 }
 
diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp
index 37068fd70d..ef0f432075 100644
--- a/components/translation/translation.cpp
+++ b/components/translation/translation.cpp
@@ -53,14 +53,13 @@ namespace Translation
 
             if (!line.empty())
             {
-                std::string utf8Line;
-                mEncoder->toUtf8(line, utf8Line, line.size());
+                line = mEncoder->getUtf8(line);
 
-                size_t tab_pos = utf8Line.find('\t');
-                if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < utf8Line.size() - 1)
+                size_t tab_pos = line.find('\t');
+                if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
                 {
-                    std::string key = utf8Line.substr(0, tab_pos);
-                    std::string value = utf8Line.substr(tab_pos + 1);
+                    std::string key = line.substr(0, tab_pos);
+                    std::string value = line.substr(tab_pos + 1);
 
                     if (!key.empty() && !value.empty())
                         container.insert(std::make_pair(key, value));