2013-01-27 10:43:41 -08:00
|
|
|
#ifndef MISC_UTF8ITER_HPP
|
|
|
|
#define MISC_UTF8ITER_HPP
|
|
|
|
|
2023-06-01 11:35:34 +02:00
|
|
|
#include <cstdint>
|
2018-05-24 12:53:06 +04:00
|
|
|
#include <cstring>
|
2021-11-05 09:53:52 +00:00
|
|
|
#include <string>
|
2021-11-10 21:37:29 +01:00
|
|
|
#include <string_view>
|
2017-05-06 13:29:47 +02:00
|
|
|
#include <tuple>
|
2013-01-27 10:43:41 -08:00
|
|
|
|
2013-05-01 10:28:59 +02:00
|
|
|
/// Forward-only decoder over a byte range that holds UTF-8 encoded text.
///
/// The stream only stores pointers into the caller's buffer; it never copies or owns the bytes, so the
/// buffer has to outlive the stream.
class Utf8Stream
{
public:
    using UnicodeChar = std::uint32_t;
    using Point = unsigned char const*;

    // static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    /// Sentinel returned instead of a code point whenever the input cannot be decoded.
    static UnicodeChar sBadChar() { return UnicodeChar(0xFFFFFFFF); }

    /// Streams over the bytes in [begin, end).
    Utf8Stream(Point begin, Point end)
        : cur(begin)
        , nxt(begin)
        , end(end)
        , val(Utf8Stream::sBadChar())
    {
    }

    /// Streams over a NUL-terminated string; the terminator itself is not part of the range.
    Utf8Stream(const char* str)
        : cur(reinterpret_cast<const unsigned char*>(str))
        , nxt(reinterpret_cast<const unsigned char*>(str))
        , end(reinterpret_cast<const unsigned char*>(str) + std::strlen(str))
        , val(Utf8Stream::sBadChar())
    {
    }

    /// Streams over the bytes in [range.first, range.second).
    Utf8Stream(std::pair<Point, Point> range)
        : cur(range.first)
        , nxt(range.first)
        , end(range.second)
        , val(Utf8Stream::sBadChar())
    {
    }

    /// Streams over the bytes viewed by str.
    Utf8Stream(std::string_view str)
        : Utf8Stream(reinterpret_cast<Point>(str.data()), reinterpret_cast<Point>(str.data() + str.size()))
    {
    }

    /// True once the read position has reached the end of the range.
    bool eof() const { return cur == end; }

    /// Position of the character that peek() / consume() will return next.
    Point current() const { return cur; }

    /// Returns the character at the current position without moving past it.
    /// Returns sBadChar() if the bytes there cannot be decoded or the stream is exhausted.
    UnicodeChar peek()
    {
        // cur == nxt means the character at cur has not been decoded yet
        if (cur == nxt)
            next();
        return val;
    }

    /// Returns the character at the current position and moves past it.
    /// Returns sBadChar() if the bytes there cannot be decoded or the stream is exhausted.
    UnicodeChar consume()
    {
        if (cur == nxt)
            next();
        cur = nxt;
        return val;
    }

    /// True if value has its top bit clear, i.e. it is a complete single-byte character.
    static bool isAscii(unsigned char value) { return (value & 0x80) == 0; }

    /// Decodes the character that starts at cur.
    /// @return the decoded character (or sBadChar()) and the position at which decoding stopped.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
    {
        // Nothing left to read: report a bad character instead of dereferencing past the end
        if (cur >= end)
            return std::make_pair(sBadChar(), cur);

        if (isAscii(*cur))
        {
            const UnicodeChar chr = *cur++;
            return std::make_pair(chr, cur);
        }

        const auto [octets, chr] = getOctetCount(*cur++);

        return decode(cur, end, chr, octets);
    }

    /// Decodes the continuation bytes of a multi-byte character.
    /// @param cur position of the first continuation byte
    /// @param end end of the readable range
    /// @param chr payload bits taken from the lead byte
    /// @param octets number of continuation bytes expected
    /// @return the decoded character (or sBadChar()) and the position at which decoding stopped.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
    {
        if (octets > 5)
            return std::make_pair(sBadChar(), cur);

        // Compare against the bytes that are actually left rather than forming cur + octets first,
        // which could point outside the buffer
        if (cur > end || static_cast<std::size_t>(end - cur) < octets)
            return std::make_pair(sBadChar(), cur);

        const Point eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair(sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar((*cur++) & 0x3F);
        }

        return std::make_pair(chr, cur);
    }

    /// Maps an upper case character to its lower case counterpart for the handful of alphabets handled
    /// below; every other value is returned unchanged.
    static UnicodeChar toLowerUtf8(UnicodeChar ch)
    {
        // Russian alphabet
        if (ch >= 0x0410 && ch < 0x0430)
            return ch + 0x20;

        // Cyrillic IO character
        if (ch == 0x0401)
            return ch + 0x50;

        // Latin alphabet: 'A'..'Z' only, the punctuation that follows 'Z' (0x5b..0x5f) has no lower case
        if (ch >= 0x41 && ch <= 0x5a)
            return ch + 0x20;

        // German characters
        if (ch == 0xc4 || ch == 0xd6 || ch == 0xdc)
            return ch + 0x20;
        if (ch == 0x1e9e)
            return 0xdf;

        // TODO: probably we will need to support characters from other languages

        return ch;
    }

    /// Returns a copy of str in which every character toLowerUtf8() knows about is lower-cased.
    /// Bytes belonging to characters that are not changed, including input that cannot be decoded,
    /// are copied to the result as they are.
    static std::string lowerCaseUtf8(std::string_view str)
    {
        if (str.empty())
            return std::string{ str };

        std::string out;
        out.reserve(str.length());

        Utf8Stream stream(str);
        while (!stream.eof())
        {
            const Point start = stream.current();
            const UnicodeChar original = stream.consume();
            const UnicodeChar lowered = toLowerUtf8(original);

            if (lowered == original)
            {
                // Unchanged (sBadChar() always lands here): keep the source bytes instead of re-encoding,
                // so undecodable input is not replaced by an encoding of the sentinel value
                out.append(reinterpret_cast<const char*>(start), reinterpret_cast<const char*>(stream.current()));
            }
            else
            {
                appendUtf8(out, lowered);
            }
        }

        return out;
    }

    /// Inspects the first byte of a multi-byte sequence.
    /// @return the number of continuation bytes announced by octet (1 to 5) together with the payload
    /// bits octet carries; the count is 6 and the payload 0 if octet matches none of the lead patterns.
    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
    {
        std::size_t octets;

        // Lead byte patterns are 110xxxxx, 1110xxxx, 11110xxx, ...: each round extends both the expected
        // marker and the mask selecting it by one more leading 1 bit
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            mark = (mark >> 1) | 0x80;
            mask = (mask >> 1) | 0x80;
        }

        return std::make_pair(octets, static_cast<UnicodeChar>(octet & ~mask));
    }

private:
    /// Decodes the character at nxt into val and moves nxt past it.
    void next() { std::tie(val, nxt) = decode(nxt, end); }

    /// Appends character to out using one to four UTF-8 bytes, chosen by the size of the value.
    static void appendUtf8(std::string& out, UnicodeChar character)
    {
        if (character <= 0x7f)
        {
            out.push_back(static_cast<char>(character));
        }
        else if (character <= 0x7ff)
        {
            out.push_back(static_cast<char>(0xc0 | ((character >> 6) & 0x1f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else if (character <= 0xffff)
        {
            out.push_back(static_cast<char>(0xe0 | ((character >> 12) & 0x0f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else
        {
            out.push_back(static_cast<char>(0xf0 | ((character >> 18) & 0x07)));
            out.push_back(static_cast<char>(0x80 | ((character >> 12) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
    }

    Point cur; // start of the character that is returned next
    Point nxt; // equal to cur until that character has been decoded, afterwards just past it
    Point end; // end of the readable range
    UnicodeChar val; // most recently decoded character
};
|
|
|
|
|
2013-02-01 22:48:13 -08:00
|
|
|
#endif
|