1
0
mirror of https://gitlab.com/OpenMW/openmw.git synced 2025-01-09 21:42:13 +00:00
OpenMW/components/misc/utf8stream.hpp
AnyOldName3 28131fd62b Fixes for a whole bunch of warnings
These warnings were always enabled, but we didn't see them due to https://gitlab.com/OpenMW/openmw/-/issues/7882.
I do not fully understand the cause of 7882 as I can't repro it in a minimal CMake project.

Some of these fixes are thought through.
Some are sensible best guesses.
Some are kind of a stab in the dark as I don't know whether there was a
possible bug the warning was telling me about that I've done nothing to
help by introducing a static_cast.

Nearly all of these warnings were about some kind of narrowing
conversion, so I'm not sure why they weren't firing with GCC and Clang,
which have -Wall -Wextra -pedantic set, which should imply -Wnarrowing,
and they can't have been affected by 7882.

There were also some warnings being triggered from Boost code.
The vast majority of library headers that do questionable things weren't
firing warnings off, but for some reason, /external:I wasn't putting
these Boost headers into external mode.

We need these warnings dealt with one way or another so we can switch
the default Windows CI from MSBuild (which doesn't do ccache) to Ninja
(which does).
I have the necessary magic for that on a branch, but the branch won't
build because of these warnings.
2024-03-14 23:39:33 +00:00

201 lines
5.1 KiB
C++

#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>
#include <tuple>
/// Forward-only decoder over a range of UTF-8 encoded bytes, together with
/// static helpers for decoding a single code point and for lower-casing
/// UTF-8 text.
///
/// The stream only stores pointers into the byte range it was given; it never
/// copies or owns the bytes, so the range must outlive the stream.
class Utf8Stream
{
public:
    using UnicodeChar = uint32_t;
    using Point = unsigned char const*;

    /// Sentinel returned when a sequence cannot be decoded or when reading at
    /// the end of input.
    // Kept as a function: `static const UnicodeChar sBadChar = 0xFFFFFFFF;`
    // was reported as something gcc can't handle.
    static UnicodeChar sBadChar() { return UnicodeChar(0xFFFFFFFF); }

    /// Reads the bytes in [begin, end).
    Utf8Stream(Point begin, Point end)
        : cur(begin)
        , nxt(begin)
        , end(end)
        , val(Utf8Stream::sBadChar())
    {
    }

    /// Reads a NUL-terminated string; the terminator is not part of the range.
    Utf8Stream(const char* str)
        : Utf8Stream(reinterpret_cast<Point>(str), reinterpret_cast<Point>(str) + std::strlen(str))
    {
    }

    /// Reads the bytes in [range.first, range.second).
    Utf8Stream(std::pair<Point, Point> range)
        : Utf8Stream(range.first, range.second)
    {
    }

    /// Reads the bytes viewed by @p str.
    Utf8Stream(std::string_view str)
        : Utf8Stream(reinterpret_cast<Point>(str.data()), reinterpret_cast<Point>(str.data() + str.size()))
    {
    }

    /// @return true once every byte of the range has been consumed.
    bool eof() const { return cur == end; }

    /// @return pointer to the first byte that has not been consumed yet.
    Point current() const { return cur; }

    /// Decodes the code point at the current position without consuming it.
    /// @return the code point, or sBadChar() for an undecodable sequence or
    ///         when the stream is at its end.
    UnicodeChar peek()
    {
        // cur == nxt means nothing has been decoded for this position yet.
        if (cur == nxt)
            next();
        return val;
    }

    /// Decodes the code point at the current position and moves past it.
    /// @return the code point, or sBadChar() for an undecodable sequence or
    ///         when the stream is at its end (the position then stays put).
    UnicodeChar consume()
    {
        if (cur == nxt)
            next();
        cur = nxt;
        return val;
    }

    /// @return true if @p value is a single-byte (7-bit) character.
    static bool isAscii(unsigned char value) { return (value & 0x80) == 0; }

    /// Decodes one code point starting at @p cur, never reading at or past @p end.
    /// @return the decoded code point (or sBadChar()) paired with the position
    ///         where decoding stopped. Nothing is read and the position is
    ///         returned unchanged when @p cur is already at @p end.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
    {
        // Guard against an empty range: dereferencing `end` is out of bounds.
        if (cur >= end)
            return std::make_pair(sBadChar(), cur);

        if (isAscii(*cur))
        {
            const UnicodeChar chr = *cur++;
            return std::make_pair(chr, cur);
        }

        const auto [octets, chr] = getOctetCount(*cur++);
        return decode(cur, end, chr, octets);
    }

    /// Decodes the continuation bytes of a multi-byte sequence.
    /// @param cur    first continuation byte (the byte after the lead byte)
    /// @param end    one past the last readable byte
    /// @param chr    payload bits already extracted from the lead byte
    /// @param octets number of continuation bytes expected
    /// @return the assembled code point and the position after the sequence,
    ///         or sBadChar() and the position where decoding gave up.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
    {
        // getOctetCount() reports 6 when the lead byte matches no known pattern.
        if (octets > 5)
            return std::make_pair(sBadChar(), cur);

        // Compare against the remaining length instead of forming cur + octets,
        // which could point further than one past the end of the buffer.
        if (cur > end || static_cast<std::size_t>(end - cur) < octets)
            return std::make_pair(sBadChar(), cur);

        for (const Point eoc = cur + octets; cur != eoc; ++cur)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair(sBadChar(), cur);
            chr = (chr << 6) | UnicodeChar(*cur & 0x3F);
        }
        return std::make_pair(chr, cur);
    }

    /// Maps an upper-case code point to its lower-case counterpart for the
    /// small set of alphabets handled here; every other value is returned as is.
    static UnicodeChar toLowerUtf8(UnicodeChar ch)
    {
        // Russian alphabet
        if (ch >= 0x0410 && ch < 0x0430)
            return ch + 0x20;

        // Cyrillic IO character
        if (ch == 0x0401)
            return ch + 0x50;

        // Latin alphabet: 'A'..'Z' only. The punctuation that follows 'Z'
        // (0x5B..0x5F: [ \ ] ^ _) has no lower-case form and must not be shifted.
        if (ch >= 0x41 && ch <= 0x5A)
            return ch + 0x20;

        // German characters
        if (ch == 0xc4 || ch == 0xd6 || ch == 0xdc)
            return ch + 0x20;
        if (ch == 0x1e9e)
            return 0xdf;

        // TODO: probably we will need to support characters from other languages
        return ch;
    }

    /// Returns a lower-cased copy of the UTF-8 string @p str.
    ///
    /// Each code point is decoded, passed through toLowerUtf8() and encoded
    /// again. Sequences that fail to decode yield sBadChar(), which the
    /// four-byte branch below writes out as the bytes F7 BF BF BF.
    static std::string lowerCaseUtf8(std::string_view str)
    {
        if (str.empty())
            return std::string{ str };

        // Decode string as utf8 characters, convert to lower case and pack them to string
        std::string out;
        out.reserve(str.length());
        for (Utf8Stream stream(str); !stream.eof(); stream.consume())
        {
            const UnicodeChar character = toLowerUtf8(stream.peek());

            if (character <= 0x7f)
            {
                out.push_back(static_cast<char>(character));
            }
            else if (character <= 0x7ff)
            {
                out.push_back(static_cast<char>(0xc0 | ((character >> 6) & 0x1f)));
                out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
            }
            else if (character <= 0xffff)
            {
                out.push_back(static_cast<char>(0xe0 | ((character >> 12) & 0x0f)));
                out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
                out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
            }
            else
            {
                out.push_back(static_cast<char>(0xf0 | ((character >> 18) & 0x07)));
                out.push_back(static_cast<char>(0x80 | ((character >> 12) & 0x3f)));
                out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
                out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
            }
        }
        return out;
    }

    /// Classifies the lead byte of a multi-byte sequence.
    /// @return the number of continuation bytes that follow (1..5) paired with
    ///         the payload bits of @p octet; the count is 6 when @p octet
    ///         matches none of the lead-byte patterns.
    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
    {
        // Start with the two-byte lead pattern 110xxxxx and widen the prefix
        // by one bit per iteration (1110xxxx, 11110xxx, ...).
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        std::size_t octets = 1;
        while (octets <= 5 && (octet & mask) != mark)
        {
            mark = static_cast<unsigned char>((mark >> 1) | 0x80);
            mask = static_cast<unsigned char>((mask >> 1) | 0x80);
            ++octets;
        }

        const unsigned char payload = static_cast<unsigned char>(octet & ~mask);
        return std::make_pair(octets, UnicodeChar(payload));
    }

private:
    /// Decodes the code point at nxt, caching it in val and advancing nxt.
    void next() { std::tie(val, nxt) = decode(nxt, end); }

    Point cur; ///< first byte not yet consumed
    Point nxt; ///< position after the cached code point; equals cur when nothing is cached
    Point end; ///< one past the last byte of the range
    UnicodeChar val; ///< code point most recently decoded by next()
};
#endif