From 6197054a3cbf624136b94348bd7ede2b4e894712 Mon Sep 17 00:00:00 2001
From: David Capello
Date: Sun, 5 Jan 2014 15:33:09 -0300
Subject: [PATCH] Add base::to/from_utf8 impl for Unix-like platforms

---
Review notes (this section is ignored by `git am`):
 * Non-Windows to_utf8() calls insert_utf8_char() twice per character:
   once with a NULL result to measure the encoded size, then again to
   append the bytes to the reserved string.
 * Non-Windows from_utf8() decodes through utf8_const_iterator into a
   buffer of utf8_length(src) + 1 elements and builds the result from
   the buffer's C-string view, so the result stops at the first zero
   element.
 * NOTE(review): the header names in the first string.cpp hunk
   (<vector>, <cassert>, <cctype>, <windows.h>) and the indentation of
   unchanged context lines are assumed -- confirm against the repository.

 src/base/string.cpp          | 86 +++++++++++++++++++++++++++++++++++-
 src/base/string.h            |  1 +
 src/base/string_unittest.cpp | 14 ++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/src/base/string.cpp b/src/base/string.cpp
index 21ec1a0e3..b6b16f4d5 100644
--- a/src/base/string.cpp
+++ b/src/base/string.cpp
@@ -9,8 +9,9 @@
 #endif
 
 #include "base/string.h"
-#include <vector>
+#include <cassert>
 #include <cctype>
+#include <vector>
 
 #ifdef WIN32
 #include <windows.h>
@@ -79,7 +80,88 @@ std::wstring from_utf8(const string& src)
   return std::wstring(&buf[0]);
 }
 
-#endif // WIN32
+#else
+
+// Based on Allegro Unicode code (allegro/src/unicode.c)
+static size_t insert_utf8_char(string* result, wchar_t chr)
+{
+  int size, bits, b, i;
+
+  if (chr < 128) {
+    if (result)
+      result->push_back(chr);
+    return 1;
+  }
+
+  bits = 7;
+  while (chr >= (1<<bits))
+    bits++;
+
+  size = 2;
+  b = 11;
+
+  while (b < bits) {
+    size++;
+    b += 5;
+  }
+
+  if (result) {
+    b -= (7-size);
+    int firstbyte = chr>>b;
+    for (i=0; i<size; i++)
+      firstbyte |= (0x80>>i);
+
+    result->push_back(firstbyte);
+
+    for (i=1; i<size; i++) {
+      b -= 6;
+      result->push_back(0x80 | ((chr>>b)&0x3F));
+    }
+  }
+
+  return size;
+}
+
+string to_utf8(const std::wstring& src)
+{
+  std::wstring::const_iterator it, begin = src.begin();
+  std::wstring::const_iterator end = src.end();
+
+  // Get required size to reserve a string so string::push_back()
+  // doesn't need to reallocate its data.
+  size_t required_size = 0;
+  for (it = begin; it != end; ++it)
+    required_size += insert_utf8_char(NULL, *it);
+  if (!required_size)
+    return "";
+
+  string result;
+  result.reserve(++required_size);
+  for (it = begin; it != end; ++it)
+    insert_utf8_char(&result, *it);
+  return result;
+}
+
+std::wstring from_utf8(const string& src)
+{
+  int required_size = utf8_length(src);
+  std::vector<wchar_t> buf(++required_size);
+  std::vector<wchar_t>::iterator buf_it = buf.begin();
+  std::vector<wchar_t>::iterator buf_end = buf.end();
+  utf8_const_iterator it(src.begin());
+  utf8_const_iterator end(src.end());
+
+  while (it != end) {
+    assert(buf_it != buf_end);
+    *buf_it = *it;
+    ++buf_it;
+    ++it;
+  }
+
+  return std::wstring(&buf[0]);
+}
+
+#endif
 
 int utf8_length(const string& utf8string)
 {
diff --git a/src/base/string.h b/src/base/string.h
index 4fe8c0c35..107c0fd3e 100644
--- a/src/base/string.h
+++ b/src/base/string.h
@@ -35,6 +35,7 @@ namespace base {
       : m_internal(it) {
     }
 
+    // Based on Allegro Unicode code (allegro/src/unicode.c)
     utf8_iteratorT& operator++() {
       int c = *m_internal;
       ++m_internal;
diff --git a/src/base/string_unittest.cpp b/src/base/string_unittest.cpp
index 606212eef..70ffa8c90 100644
--- a/src/base/string_unittest.cpp
+++ b/src/base/string_unittest.cpp
@@ -14,6 +14,20 @@ using namespace base;
 
 bool all(int) { return true; }
 
+TEST(String, Utf8Conversion)
+{
+  std::string a = "\xE6\xBC\xA2\xE5\xAD\x97"; // 漢字
+  ASSERT_EQ(6, a.size());
+
+  std::wstring b = from_utf8(a);
+  ASSERT_EQ(2, b.size());
+  ASSERT_EQ(0x6f22, b[0]);
+  ASSERT_EQ(0x5b57, b[1]);
+
+  std::string c = to_utf8(b);
+  ASSERT_EQ(a, c);
+}
+
 TEST(String, Utf8Iterator)
 {
   string a = "Hello";