mirror of
https://gitlab.com/OpenMW/openmw.git
synced 2025-02-10 12:39:53 +00:00
Merge branch 'utf8_encoder_tests' into 'master'
Add tests for Utf8Encoder See merge request OpenMW/openmw!1654
This commit is contained in:
commit
149ef56b60
@ -70,6 +70,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
|
|||||||
esmloader/esmdata.cpp
|
esmloader/esmdata.cpp
|
||||||
|
|
||||||
files/hash.cpp
|
files/hash.cpp
|
||||||
|
|
||||||
|
toutf8/toutf8.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES})
|
source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES})
|
||||||
@ -93,6 +95,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
|
|||||||
EXPECTED_MD5 bf3691034a38611534c74c3b89a7d2c3
|
EXPECTED_MD5 bf3691034a38611534c74c3b89a7d2c3
|
||||||
)
|
)
|
||||||
|
|
||||||
target_compile_definitions(openmw_test_suite PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data")
|
target_compile_definitions(openmw_test_suite
|
||||||
|
PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data"
|
||||||
|
OPENMW_TEST_SUITE_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
139
apps/openmw_test_suite/toutf8/toutf8.cpp
Normal file
139
apps/openmw_test_suite/toutf8/toutf8.cpp
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
#include <components/to_utf8/to_utf8.hpp>
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
#ifndef OPENMW_TEST_SUITE_SOURCE_DIR
|
||||||
|
#define OPENMW_TEST_SUITE_SOURCE_DIR ""
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
using namespace testing;
|
||||||
|
using namespace ToUTF8;
|
||||||
|
|
||||||
|
struct Params
|
||||||
|
{
|
||||||
|
FromType mLegacyEncoding;
|
||||||
|
std::string mLegacyEncodingFileName;
|
||||||
|
std::string mUtf8FileName;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string readContent(const std::string& fileName)
|
||||||
|
{
|
||||||
|
std::ifstream file;
|
||||||
|
file.exceptions(std::ios::failbit | std::ios::badbit);
|
||||||
|
file.open(std::string(OPENMW_TEST_SUITE_SOURCE_DIR) + "/toutf8/data/" + fileName);
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << file.rdbuf();
|
||||||
|
return buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Value-parameterized fixture: the TEST_P cases read their Params entry through GetParam().
struct Utf8EncoderTest : TestWithParam<Params>
{
};
|
||||||
|
|
||||||
|
// An empty view passed to getUtf8 must come back as an empty view.
TEST(Utf8EncoderTest, getUtf8ShouldReturnEmptyAsIs)
{
    const std::string_view empty;
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getUtf8(empty), empty);
}
|
||||||
|
|
||||||
|
// Input made only of the non-zero ASCII bytes must be returned without conversion:
// the result has to view the caller's buffer itself, with the same length.
TEST(Utf8EncoderTest, getUtf8ShouldReturnAsciiOnlyAsIs)
{
    std::string input;
    // ASCII ends at 0x7F. The bound is spelled out because
    // std::numeric_limits<char>::max() is 255 where plain char is unsigned,
    // which would put non-ASCII bytes into the input.
    for (int c = 1; c <= 0x7F; ++c)
        input.push_back(static_cast<char>(c));
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getUtf8(input);
    EXPECT_EQ(result.data(), input.data());
    EXPECT_EQ(result.size(), input.size());
}
|
||||||
|
|
||||||
|
// getUtf8 must stop at the first zero byte even when the view continues past it.
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilZero)
{
    // The length is given explicitly: building std::string from the bare literal
    // stops at the embedded '\0' and would leave only "a", so the zero byte
    // would never reach the encoder.
    const std::string input("a\0b", 3);
    ASSERT_EQ(input.size(), 3u);
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getUtf8(input);
    EXPECT_EQ(result, "a");
}
|
||||||
|
|
||||||
|
// For ASCII input, getUtf8 must honour the length of the view and not read on
// into the rest of the underlying buffer.
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForAscii)
{
    const std::string source("abc");
    const std::string_view firstTwo(source.data(), 2);
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getUtf8(firstTwo), "ab");
}
|
||||||
|
|
||||||
|
// For input with a non-ASCII byte, getUtf8 must honour the length of the view:
// only "a" and the 0x92 byte are converted, the trailing "b" lies outside the view.
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForNonAscii)
{
    const std::string source("a\x92" "b");
    const std::string_view firstTwo(source.data(), 2);
    Utf8Encoder encoder(FromType::WINDOWS_1252);
    // 0x92 is expected to expand to the three bytes E2 80 99.
    EXPECT_EQ(encoder.getUtf8(firstTwo), "a\xE2\x80\x99");
}
|
||||||
|
|
||||||
|
// Converting the legacy-encoded data file must reproduce the UTF-8 data file.
TEST_P(Utf8EncoderTest, getUtf8ShouldConvertFromLegacyEncodingToUtf8)
{
    const Params& params = GetParam();
    const std::string legacyText = readContent(params.mLegacyEncodingFileName);
    const std::string utf8Text = readContent(params.mUtf8FileName);
    Utf8Encoder encoder(params.mLegacyEncoding);
    EXPECT_EQ(encoder.getUtf8(legacyText), utf8Text);
}
|
||||||
|
|
||||||
|
// An empty view passed to getLegacyEnc must come back as an empty view.
TEST(Utf8EncoderTest, getLegacyEncShouldReturnEmptyAsIs)
{
    const std::string_view empty;
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getLegacyEnc(empty), empty);
}
|
||||||
|
|
||||||
|
// Input made only of the non-zero ASCII bytes must be returned without conversion:
// the result has to view the caller's buffer itself, with the same length.
TEST(Utf8EncoderTest, getLegacyEncShouldReturnAsciiOnlyAsIs)
{
    std::string input;
    // ASCII ends at 0x7F. The bound is spelled out because
    // std::numeric_limits<char>::max() is 255 where plain char is unsigned,
    // which would put non-ASCII bytes into the input.
    for (int c = 1; c <= 0x7F; ++c)
        input.push_back(static_cast<char>(c));
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getLegacyEnc(input);
    EXPECT_EQ(result.data(), input.data());
    EXPECT_EQ(result.size(), input.size());
}
|
||||||
|
|
||||||
|
// getLegacyEnc must stop at the first zero byte even when the view continues past it.
TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilZero)
{
    // The length is given explicitly: building std::string from the bare literal
    // stops at the embedded '\0' and would leave only "a", so the zero byte
    // would never reach the encoder.
    const std::string input("a\0b", 3);
    ASSERT_EQ(input.size(), 3u);
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getLegacyEnc(input);
    EXPECT_EQ(result, "a");
}
|
||||||
|
|
||||||
|
// For ASCII input, getLegacyEnc must honour the length of the view and not read
// on into the rest of the underlying buffer.
TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilEndOfInputForAscii)
{
    const std::string source("abc");
    const std::string_view firstTwo(source.data(), 2);
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getLegacyEnc(firstTwo), "ab");
}
|
||||||
|
|
||||||
|
// A multi-byte sequence cut off by the end of the view must be dropped from the output.
TEST(Utf8EncoderTest, getLegacyEncShouldStripIncompleteCharacters)
{
    const std::string source("a\xc3\xa2\xe2\x80\x99");
    // Five bytes: "a", the complete pair C3 A2, and only E2 80 of the final
    // three-byte sequence.
    const std::string_view truncated(source.data(), 5);
    Utf8Encoder encoder(FromType::WINDOWS_1252);
    EXPECT_EQ(encoder.getLegacyEnc(truncated), "a\xe2");
}
|
||||||
|
|
||||||
|
// Converting the UTF-8 data file must reproduce the legacy-encoded data file.
TEST_P(Utf8EncoderTest, getLegacyEncShouldConvertFromUtf8ToLegacyEncoding)
{
    const Params& params = GetParam();
    const std::string utf8Text = readContent(params.mUtf8FileName);
    const std::string legacyText = readContent(params.mLegacyEncodingFileName);
    Utf8Encoder encoder(params.mLegacyEncoding);
    EXPECT_EQ(encoder.getLegacyEnc(utf8Text), legacyText);
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(Files, Utf8EncoderTest, Values(
|
||||||
|
Params {ToUTF8::WINDOWS_1251, "russian-win1251.txt", "russian-utf8.txt"},
|
||||||
|
Params {ToUTF8::WINDOWS_1252, "french-win1252.txt", "french-utf8.txt"}
|
||||||
|
));
|
||||||
|
}
|
1
components/to_utf8/tests/.gitignore
vendored
1
components/to_utf8/tests/.gitignore
vendored
@ -1 +0,0 @@
|
|||||||
*_test
|
|
@ -1,4 +0,0 @@
|
|||||||
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
|
||||||
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
|
||||||
original: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
|
||||||
converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
|
@ -1,18 +0,0 @@
|
|||||||
#!/bin/bash
# Builds the test programs, then for every *_test binary either compares its
# output with the stored output/<name>.out or, when none is stored yet,
# records the output and stages it with git.

make || exit

mkdir -p output

# Without nullglob an unmatched pattern stays as the literal string "*_test"
# and the loop would try to run a program of that name.
shopt -s nullglob

for a in *_test; do
    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        "./$a" | diff "output/$a.out" -
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done
|
|
@ -1,59 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <cassert>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "../to_utf8.hpp"
|
|
||||||
|
|
||||||
std::string getFirstLine(const std::string &filename);
|
|
||||||
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
|
|
||||||
const std::string &utf8File);
|
|
||||||
|
|
||||||
/// Test character encoding conversion to and from UTF-8
|
|
||||||
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
|
|
||||||
const std::string &utf8File)
|
|
||||||
{
|
|
||||||
// get some test data
|
|
||||||
std::string legacyEncLine = getFirstLine(legacyEncFile);
|
|
||||||
std::string utf8Line = getFirstLine(utf8File);
|
|
||||||
|
|
||||||
// create an encoder for specified character encoding
|
|
||||||
ToUTF8::Utf8Encoder encoder (encoding);
|
|
||||||
|
|
||||||
// convert text to UTF-8
|
|
||||||
std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);
|
|
||||||
|
|
||||||
std::cout << "original: " << utf8Line << std::endl;
|
|
||||||
std::cout << "converted: " << convertedUtf8Line << std::endl;
|
|
||||||
|
|
||||||
// check correctness
|
|
||||||
assert(convertedUtf8Line == utf8Line);
|
|
||||||
|
|
||||||
// convert UTF-8 text to legacy encoding
|
|
||||||
std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);
|
|
||||||
// check correctness
|
|
||||||
assert(convertedLegacyEncLine == legacyEncLine);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the first line of the file \a filename, without its line terminator.
/// Throws std::runtime_error when the file cannot be opened.
std::string getFirstLine(const std::string &filename)
{
    std::ifstream stream(filename.c_str());
    if (!stream.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(stream, firstLine);
    stream.close();
    return firstLine;
}
|
|
||||||
|
|
||||||
int main()
|
|
||||||
{
|
|
||||||
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
|
|
||||||
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -3,6 +3,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#include <components/debug/debuglog.hpp>
|
#include <components/debug/debuglog.hpp>
|
||||||
|
|
||||||
@ -44,6 +45,14 @@
|
|||||||
|
|
||||||
using namespace ToUTF8;
|
using namespace ToUTF8;
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
// Returns an iterator to the first byte of `input` that is either zero or has
// its high bit set (value >= 128), or input.end() when there is no such byte.
std::string_view::iterator skipAscii(std::string_view input)
{
    const auto endsAsciiRun = [] (unsigned char byte)
    {
        return byte == 0 || (byte & 0x80u) != 0;
    };
    return std::find_if(input.begin(), input.end(), endsAsciiRun);
}
|
||||||
|
}
|
||||||
|
|
||||||
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
|
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
|
||||||
mOutput(50*1024)
|
mOutput(50*1024)
|
||||||
{
|
{
|
||||||
@ -82,11 +91,6 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
|||||||
if (input.empty())
|
if (input.empty())
|
||||||
return input;
|
return input;
|
||||||
|
|
||||||
// Double check that the input string stops at some point (it might
|
|
||||||
// contain zero terminators before this, inside its own data, which
|
|
||||||
// is also ok.)
|
|
||||||
assert(input[input.size()] == 0);
|
|
||||||
|
|
||||||
// Note: The rest of this function is designed for single-character
|
// Note: The rest of this function is designed for single-character
|
||||||
// input encodings only. It also assumes that the input encoding
|
// input encodings only. It also assumes that the input encoding
|
||||||
// shares its first 128 values (0-127) with ASCII. There are no plans
|
// shares its first 128 values (0-127) with ASCII. There are no plans
|
||||||
@ -95,8 +99,7 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
|||||||
|
|
||||||
// Compute output length, and check for pure ascii input at the same
|
// Compute output length, and check for pure ascii input at the same
|
||||||
// time.
|
// time.
|
||||||
bool ascii;
|
const auto [outlen, ascii] = getLength(input);
|
||||||
size_t outlen = getLength(input.data(), ascii);
|
|
||||||
|
|
||||||
// If we're pure ascii, then don't bother converting anything.
|
// If we're pure ascii, then don't bother converting anything.
|
||||||
if(ascii)
|
if(ascii)
|
||||||
@ -107,8 +110,8 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
|||||||
char *out = &mOutput[0];
|
char *out = &mOutput[0];
|
||||||
|
|
||||||
// Translate
|
// Translate
|
||||||
for (const char* ptr = input.data(); *ptr;)
|
for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
|
||||||
copyFromArray(*(ptr++), out);
|
copyFromArray(*it, out);
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
// Make sure that we wrote the correct number of bytes
|
||||||
assert((out-&mOutput[0]) == (int)outlen);
|
assert((out-&mOutput[0]) == (int)outlen);
|
||||||
@ -125,11 +128,6 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
|||||||
if (input.empty())
|
if (input.empty())
|
||||||
return input;
|
return input;
|
||||||
|
|
||||||
// Double check that the input string stops at some point (it might
|
|
||||||
// contain zero terminators before this, inside its own data, which
|
|
||||||
// is also ok.)
|
|
||||||
assert(input[input.size()] == 0);
|
|
||||||
|
|
||||||
// TODO: The rest of this function is designed for single-character
|
// TODO: The rest of this function is designed for single-character
|
||||||
// input encodings only. It also assumes that the input
|
// input encodings only. It also assumes that the input
|
||||||
// encoding shares its first 128 values (0-127) with ASCII. These
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
||||||
@ -138,8 +136,7 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
|||||||
|
|
||||||
// Compute output length, and check for pure ascii input at the same
|
// Compute output length, and check for pure ascii input at the same
|
||||||
// time.
|
// time.
|
||||||
bool ascii;
|
const auto [outlen, ascii] = getLengthLegacyEnc(input);
|
||||||
size_t outlen = getLength2(input.data(), ascii);
|
|
||||||
|
|
||||||
// If we're pure ascii, then don't bother converting anything.
|
// If we're pure ascii, then don't bother converting anything.
|
||||||
if(ascii)
|
if(ascii)
|
||||||
@ -150,8 +147,8 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
|||||||
char *out = &mOutput[0];
|
char *out = &mOutput[0];
|
||||||
|
|
||||||
// Translate
|
// Translate
|
||||||
for (const char* ptr = input.data(); *ptr;)
|
for (auto it = input.begin(); it != input.end() && *it != 0;)
|
||||||
copyFromArray2(ptr, out);
|
copyFromArrayLegacyEnc(it, input.end(), out);
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
// Make sure that we wrote the correct number of bytes
|
||||||
assert((out-&mOutput[0]) == (int)outlen);
|
assert((out-&mOutput[0]) == (int)outlen);
|
||||||
@ -186,34 +183,30 @@ void Utf8Encoder::resize(size_t size)
|
|||||||
is the case, then the ascii parameter is set to true, and the
|
is the case, then the ascii parameter is set to true, and the
|
||||||
caller can optimize for this case.
|
caller can optimize for this case.
|
||||||
*/
|
*/
|
||||||
size_t Utf8Encoder::getLength(const char* input, bool &ascii) const
|
std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const
|
||||||
{
|
{
|
||||||
ascii = true;
|
|
||||||
size_t len = 0;
|
|
||||||
const char* ptr = input;
|
|
||||||
unsigned char inp = *ptr;
|
|
||||||
|
|
||||||
// Do away with the ascii part of the string first (this is almost
|
// Do away with the ascii part of the string first (this is almost
|
||||||
// always the entire string.)
|
// always the entire string.)
|
||||||
while (inp && inp < 128)
|
auto it = skipAscii(input);
|
||||||
inp = *(++ptr);
|
|
||||||
len += (ptr-input);
|
|
||||||
|
|
||||||
// If we're not at the null terminator at this point, then there
|
// If we're not at the null terminator at this point, then there
|
||||||
// were some non-ascii characters to deal with. Go to slow-mode for
|
// were some non-ascii characters to deal with. Go to slow-mode for
|
||||||
// the rest of the string.
|
// the rest of the string.
|
||||||
if (inp)
|
if (it == input.end() || *it == 0)
|
||||||
{
|
return {it - input.begin(), true};
|
||||||
ascii = false;
|
|
||||||
while (inp)
|
std::size_t len = it - input.begin();
|
||||||
|
|
||||||
|
do
|
||||||
{
|
{
|
||||||
// Find the translated length of this character in the
|
// Find the translated length of this character in the
|
||||||
// lookup table.
|
// lookup table.
|
||||||
len += translationArray[inp*6];
|
len += translationArray[static_cast<unsigned char>(*it) * 6];
|
||||||
inp = *(++ptr);
|
++it;
|
||||||
}
|
}
|
||||||
}
|
while (it != input.end() && *it != 0);
|
||||||
return len;
|
|
||||||
|
return {len, false};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Translate one character 'ch' using the translation array 'arr', and
|
// Translate one character 'ch' using the translation array 'arr', and
|
||||||
@ -233,33 +226,29 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
|
|||||||
out += len;
|
out += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const
|
std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const
|
||||||
{
|
{
|
||||||
ascii = true;
|
|
||||||
size_t len = 0;
|
|
||||||
const char* ptr = input;
|
|
||||||
unsigned char inp = *ptr;
|
|
||||||
|
|
||||||
// Do away with the ascii part of the string first (this is almost
|
// Do away with the ascii part of the string first (this is almost
|
||||||
// always the entire string.)
|
// always the entire string.)
|
||||||
while (inp && inp < 128)
|
auto it = skipAscii(input);
|
||||||
inp = *(++ptr);
|
|
||||||
len += (ptr-input);
|
|
||||||
|
|
||||||
// If we're not at the null terminator at this point, then there
|
// If we're not at the null terminator at this point, then there
|
||||||
// were some non-ascii characters to deal with. Go to slow-mode for
|
// were some non-ascii characters to deal with. Go to slow-mode for
|
||||||
// the rest of the string.
|
// the rest of the string.
|
||||||
if (inp)
|
if (it == input.end() || *it == 0)
|
||||||
|
return {it - input.begin(), true};
|
||||||
|
|
||||||
|
std::size_t len = it - input.begin();
|
||||||
|
std::size_t symbolLen = 0;
|
||||||
|
|
||||||
|
do
|
||||||
{
|
{
|
||||||
ascii = false;
|
symbolLen += 1;
|
||||||
while(inp)
|
|
||||||
{
|
|
||||||
len += 1;
|
|
||||||
// Find the translated length of this character in the
|
// Find the translated length of this character in the
|
||||||
// lookup table.
|
// lookup table.
|
||||||
switch(inp)
|
switch (static_cast<unsigned char>(*it))
|
||||||
{
|
{
|
||||||
case 0xe2: len -= 2; break;
|
case 0xe2: symbolLen -= 2; break;
|
||||||
case 0xc2:
|
case 0xc2:
|
||||||
case 0xcb:
|
case 0xcb:
|
||||||
case 0xc4:
|
case 0xc4:
|
||||||
@ -268,16 +257,21 @@ size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const
|
|||||||
case 0xd0:
|
case 0xd0:
|
||||||
case 0xd1:
|
case 0xd1:
|
||||||
case 0xd2:
|
case 0xd2:
|
||||||
case 0xc5: len -= 1; break;
|
case 0xc5: symbolLen -= 1; break;
|
||||||
|
default:
|
||||||
|
len += symbolLen;
|
||||||
|
symbolLen = 0;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
inp = *(++ptr);
|
++it;
|
||||||
}
|
}
|
||||||
}
|
while (it != input.end() && *it != 0);
|
||||||
return len;
|
|
||||||
|
return {len, false};
|
||||||
}
|
}
|
||||||
|
|
||||||
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
|
void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
|
||||||
{
|
{
|
||||||
unsigned char ch = *(chp++);
|
unsigned char ch = *(chp++);
|
||||||
// Optimize for ASCII values
|
// Optimize for ASCII values
|
||||||
@ -308,10 +302,17 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (chp == end)
|
||||||
|
return;
|
||||||
|
|
||||||
unsigned char ch2 = *(chp++);
|
unsigned char ch2 = *(chp++);
|
||||||
unsigned char ch3 = '\0';
|
unsigned char ch3 = '\0';
|
||||||
if (len == 3)
|
if (len == 3)
|
||||||
|
{
|
||||||
|
if (chp == end)
|
||||||
|
return;
|
||||||
ch3 = *(chp++);
|
ch3 = *(chp++);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 128; i < 256; i++)
|
for (int i = 128; i < 256; i++)
|
||||||
{
|
{
|
||||||
|
@ -38,11 +38,11 @@ namespace ToUTF8
|
|||||||
std::string_view getLegacyEnc(std::string_view input);
|
std::string_view getLegacyEnc(std::string_view input);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void resize(size_t size);
|
inline void resize(std::size_t size);
|
||||||
size_t getLength(const char* input, bool &ascii) const;
|
inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
|
||||||
void copyFromArray(unsigned char chp, char* &out) const;
|
inline void copyFromArray(unsigned char chp, char* &out) const;
|
||||||
size_t getLength2(const char* input, bool &ascii) const;
|
inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
|
||||||
void copyFromArray2(const char*& chp, char* &out) const;
|
inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
|
||||||
|
|
||||||
std::vector<char> mOutput;
|
std::vector<char> mOutput;
|
||||||
const signed char* translationArray;
|
const signed char* translationArray;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user