1
0
mirror of https://gitlab.com/OpenMW/openmw.git synced 2025-02-21 00:39:58 +00:00

update to_utf8 and translation to make use of new stateless utf8

This commit is contained in:
elsid 2022-02-16 10:48:49 +01:00 committed by Bret Curtis
parent 8113620dce
commit 4a06351c3b
3 changed files with 188 additions and 172 deletions

View File

@ -3,6 +3,7 @@
#include <vector>
#include <cassert>
#include <stdexcept>
#include <algorithm>
#include <components/debug/debuglog.hpp>
@ -44,55 +45,62 @@
using namespace ToUTF8;
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
mOutput(50*1024)
namespace
{
switch (sourceEncoding)
std::string_view::iterator skipAscii(std::string_view input)
{
case ToUTF8::WINDOWS_1252:
{
translationArray = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
translationArray = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
translationArray = ToUTF8::windows_1251;
break;
}
case ToUTF8::CP437:
{
translationArray = ToUTF8::cp437;
break;
}
return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
}
default:
std::basic_string_view<signed char> getTranslationArray(FromType sourceEncoding)
{
switch (sourceEncoding)
{
assert(0);
case ToUTF8::WINDOWS_1252:
return ToUTF8::windows_1252;
case ToUTF8::WINDOWS_1250:
return ToUTF8::windows_1250;
case ToUTF8::WINDOWS_1251:
return ToUTF8::windows_1251;
case ToUTF8::CP437:
return ToUTF8::cp437;
}
throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding));
}
// Make sure the output vector is large enough for 'size' bytes,
// including a terminating zero after it.
void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer)
{
if (buffer.size() >= size)
return;
switch (bufferAllocationPolicy)
{
case BufferAllocationPolicy::FitToRequiredSize:
buffer.resize(size);
break;
case BufferAllocationPolicy::UseGrowFactor:
// Add some extra padding to reduce the chance of having to resize
// again later.
buffer.resize(3 * size);
// And make sure the string is zero terminated
buffer[size] = 0;
break;
}
}
}
std::string Utf8Encoder::getUtf8(const char* input, size_t size)
StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding)
: mTranslationArray(getTranslationArray(sourceEncoding))
{
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[size] == 0);
std::string inputString(input, size);
std::string output;
toUtf8(inputString, output, size);
return output;
}
void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size)
std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
{
if (input.empty())
return input;
// Note: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input encoding
// shares its first 128 values (0-127) with ASCII. There are no plans
@ -101,45 +109,34 @@ void Utf8Encoder::toUtf8(std::string& input, std::string& output, size_t size)
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(input.c_str(), ascii);
const auto [outlen, ascii] = getLength(input);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
{
std::swap(input, output);
return;
}
return std::string_view(input.data(), outlen);
// Make sure the output is large enough
if (output.size() <= outlen)
// Add some extra padding to reduce the chance of having to resize
// again later.
output.resize(3*outlen);
// And make sure the string is zero terminated
output[outlen] = 0;
char *in = &input[0];
char *out = &output[0];
resize(outlen, bufferAllocationPolicy, buffer);
char *out = buffer.data();
// Translate
while (*in)
copyFromArray(*(in++), out);
for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
copyFromArray(*it, out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
assert((out - buffer.data()) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
assert(buffer.size() >= outlen);
assert(buffer[outlen] == 0);
return std::string_view(buffer.data(), outlen);
}
std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
{
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[size] == 0);
if (input.empty())
return input;
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
@ -149,43 +146,28 @@ std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(input, ascii);
const auto [outlen, ascii] = getLengthLegacyEnc(input);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
return std::string_view(input.data(), outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
resize(outlen, bufferAllocationPolicy, buffer);
char *out = buffer.data();
// Translate
while(*input)
copyFromArray2(input, out);
for (auto it = input.begin(); it != input.end() && *it != 0;)
copyFromArrayLegacyEnc(it, input.end(), out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
assert((out - buffer.data()) == static_cast<int>(outlen));
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
assert(buffer.size() >= outlen);
assert(buffer[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
}
// Make sure the output vector is large enough for 'size' bytes,
// including a terminating zero after it.
void Utf8Encoder::resize(size_t size)
{
if (mOutput.size() <= size)
// Add some extra padding to reduce the chance of having to resize
// again later.
mOutput.resize(3*size);
// And make sure the string is zero terminated
mOutput[size] = 0;
return std::string_view(buffer.data(), outlen);
}
/** Get the total length length needed to decode the given string with
@ -198,39 +180,35 @@ void Utf8Encoder::resize(size_t size)
is the case, then the ascii parameter is set to true, and the
caller can optimize for this case.
*/
size_t Utf8Encoder::getLength(const char* input, bool &ascii) const
std::pair<std::size_t, bool> StatelessUtf8Encoder::getLength(std::string_view input) const
{
ascii = true;
size_t len = 0;
const char* ptr = input;
unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost
// always the entire string.)
while (inp && inp < 128)
inp = *(++ptr);
len += (ptr-input);
auto it = skipAscii(input);
// If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string.
if (inp)
if (it == input.end() || *it == 0)
return {it - input.begin(), true};
std::size_t len = it - input.begin();
do
{
ascii = false;
while (inp)
{
// Find the translated length of this character in the
// lookup table.
len += translationArray[inp*6];
inp = *(++ptr);
}
// Find the translated length of this character in the
// lookup table.
len += mTranslationArray[static_cast<unsigned char>(*it) * 6];
++it;
}
return len;
while (it != input.end() && *it != 0);
return {len, false};
}
// Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly.
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const
{
// Optimize for ASCII values
if (ch < 128)
@ -239,57 +217,58 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
return;
}
const signed char *in = translationArray + ch*6;
const signed char *in = &mTranslationArray[ch * 6];
int len = *(in++);
memcpy(out, in, len);
out += len;
}
size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const
std::pair<std::size_t, bool> StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const
{
ascii = true;
size_t len = 0;
const char* ptr = input;
unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost
// always the entire string.)
while (inp && inp < 128)
inp = *(++ptr);
len += (ptr-input);
auto it = skipAscii(input);
// If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string.
if (inp)
{
ascii = false;
while(inp)
{
len += 1;
// Find the translated length of this character in the
// lookup table.
switch(inp)
{
case 0xe2: len -= 2; break;
case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len -= 1; break;
}
if (it == input.end() || *it == 0)
return {it - input.begin(), true};
inp = *(++ptr);
std::size_t len = it - input.begin();
std::size_t symbolLen = 0;
do
{
symbolLen += 1;
// Find the translated length of this character in the
// lookup table.
switch (static_cast<unsigned char>(*it))
{
case 0xe2: symbolLen -= 2; break;
case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: symbolLen -= 1; break;
default:
len += symbolLen;
symbolLen = 0;
break;
}
++it;
}
return len;
while (it != input.end() && *it != 0);
return {len, false};
}
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
{
unsigned char ch = *(chp++);
// Optimize for ASCII values
@ -320,14 +299,21 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
return;
}
if (chp == end)
return;
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
{
if (chp == end)
return;
ch3 = *(chp++);
}
for (int i = 128; i < 256; i++)
{
unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
@ -340,6 +326,22 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
*(out++) = ch; // Could not find glyph, just put whatever
}
Utf8Encoder::Utf8Encoder(FromType sourceEncoding)
: mBuffer(50 * 1024, '\0')
, mImpl(sourceEncoding)
{
}
std::string_view Utf8Encoder::getUtf8(std::string_view input)
{
return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
}
std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
{
return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
}
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
if (encodingName == "win1250")

View File

@ -4,6 +4,7 @@
#include <string>
#include <cstring>
#include <vector>
#include <string_view>
namespace ToUTF8
{
@ -17,41 +18,55 @@ namespace ToUTF8
CP437 // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
};
enum class BufferAllocationPolicy
{
FitToRequiredSize,
UseGrowFactor,
};
FromType calculateEncoding(const std::string& encodingName);
std::string encodingUsingMessage(const std::string& encodingName);
// class
class StatelessUtf8Encoder
{
public:
explicit StatelessUtf8Encoder(FromType sourceEncoding);
/// Convert to UTF8 from the previously given code page.
/// Returns a view to passed buffer that will be resized to fit output if it's too small.
std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
/// Convert from UTF-8 to sourceEncoding.
/// Returns a view to passed buffer that will be resized to fit output if it's too small.
std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
private:
inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
inline void copyFromArray(unsigned char chp, char* &out) const;
inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
const std::basic_string_view<signed char> mTranslationArray;
};
class Utf8Encoder
{
public:
Utf8Encoder(FromType sourceEncoding);
explicit Utf8Encoder(FromType sourceEncoding);
// Convert to UTF8 from the previously given code page.
std::string getUtf8(const char *input, size_t size);
inline std::string getUtf8(const std::string &str)
{
return getUtf8(str.c_str(), str.size());
}
/// Convert to UTF8 from the previously given code page.
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
/// ASCII-only string. Otherwise returns a view to the input.
std::string_view getUtf8(std::string_view input);
// Convert input to UTF8 to the given output string
void toUtf8(std::string& input, std::string& output, size_t size);
std::string getLegacyEnc(const char *input, size_t size);
inline std::string getLegacyEnc(const std::string &str)
{
return getLegacyEnc(str.c_str(), str.size());
}
/// Convert from UTF-8 to sourceEncoding.
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
/// ASCII-only string. Otherwise returns a view to the input.
std::string_view getLegacyEnc(std::string_view input);
private:
void resize(size_t size);
size_t getLength(const char* input, bool &ascii) const;
void copyFromArray(unsigned char chp, char* &out) const;
size_t getLength2(const char* input, bool &ascii) const;
void copyFromArray2(const char*& chp, char* &out) const;
std::vector<char> mOutput;
const signed char* translationArray;
std::string mBuffer;
StatelessUtf8Encoder mImpl;
};
}

View File

@ -53,14 +53,13 @@ namespace Translation
if (!line.empty())
{
std::string utf8Line;
mEncoder->toUtf8(line, utf8Line, line.size());
line = mEncoder->getUtf8(line);
size_t tab_pos = utf8Line.find('\t');
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < utf8Line.size() - 1)
size_t tab_pos = line.find('\t');
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
{
std::string key = utf8Line.substr(0, tab_pos);
std::string value = utf8Line.substr(tab_pos + 1);
std::string key = line.substr(0, tab_pos);
std::string value = line.substr(tab_pos + 1);
if (!key.empty() && !value.empty())
container.insert(std::make_pair(key, value));