tomlplusplus/include/toml++/impl/unicode.h
Mark Gillard 1c26ce1dcf fixed UB in internal unicode machinery (closes #144)
also:
- updated conformance tests
- added ubsan options to meson.build
2022-02-26 18:07:11 +02:00

184 lines
5.1 KiB
C++

//# This file is a part of toml++ and is subject to the the terms of the MIT license.
//# Copyright (c) Mark Gillard <mark.gillard@outlook.com.au>
//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
// SPDX-License-Identifier: MIT
#pragma once
#include "unicode_autogenerated.h"
#include "header_start.h"
/// \cond
TOML_IMPL_NAMESPACE_START
{
TOML_CONST_GETTER
constexpr bool is_string_delimiter(char32_t c) noexcept
{
return c == U'"' || c == U'\'';
}
TOML_CONST_GETTER
constexpr bool is_ascii_letter(char32_t c) noexcept
{
return (c >= U'a' && c <= U'z') || (c >= U'A' && c <= U'Z');
}
TOML_CONST_GETTER
constexpr bool is_binary_digit(char32_t c) noexcept
{
return c == U'0' || c == U'1';
}
TOML_CONST_GETTER
constexpr bool is_octal_digit(char32_t c) noexcept
{
return (c >= U'0' && c <= U'7');
}
TOML_CONST_GETTER
constexpr bool is_decimal_digit(char32_t c) noexcept
{
return (c >= U'0' && c <= U'9');
}
TOML_CONST_GETTER
constexpr bool is_hexadecimal_digit(char32_t c) noexcept
{
return U'0' <= c && c <= U'f' && (1ull << (static_cast<uint_least64_t>(c) - 0x30u)) & 0x7E0000007E03FFull;
}
template <typename T>
TOML_CONST_GETTER
constexpr uint_least32_t hex_to_dec(const T c) noexcept
{
if constexpr (std::is_same_v<remove_cvref<T>, uint_least32_t>)
return c >= 0x41u // >= 'A'
? 10u + (c | 0x20u) - 0x61u // - 'a'
: c - 0x30u // - '0'
;
else
return hex_to_dec(static_cast<uint_least32_t>(c));
}
TOML_CONST_GETTER
constexpr bool is_bare_key_character(char32_t c) noexcept
{
return is_ascii_bare_key_character(c)
#if TOML_LANG_UNRELEASED // toml/issues/687 (unicode bare keys)
|| is_non_ascii_bare_key_character(c)
#endif
;
}
TOML_CONST_GETTER
constexpr bool is_value_terminator(char32_t c) noexcept
{
return is_whitespace(c) || c == U']' || c == U'}' || c == U',' || c == U'#';
}
TOML_CONST_GETTER
constexpr bool is_control_character(char c) noexcept
{
return c <= '\u001F' || c == '\u007F';
}
TOML_CONST_GETTER
constexpr bool is_control_character(char32_t c) noexcept
{
return c <= U'\u001F' || c == U'\u007F';
}
TOML_CONST_GETTER
constexpr bool is_nontab_control_character(char32_t c) noexcept
{
return c <= U'\u0008' || (c >= U'\u000A' && c <= U'\u001F') || c == U'\u007F';
}
TOML_CONST_GETTER
constexpr bool is_unicode_surrogate(char32_t c) noexcept
{
return c >= 0xD800u && c <= 0xDFFF;
}
struct utf8_decoder
{
// utf8_decoder based on this: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
uint_least32_t state{};
char32_t codepoint{};
static constexpr uint8_t state_table[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6,
6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12,
12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12,
12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12,
36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
};
TOML_PURE_INLINE_GETTER
constexpr bool error() const noexcept
{
return state == uint_least32_t{ 12u };
}
TOML_PURE_INLINE_GETTER
constexpr bool has_code_point() const noexcept
{
return state == uint_least32_t{};
}
TOML_PURE_INLINE_GETTER
constexpr bool needs_more_input() const noexcept
{
return !has_code_point() && !error();
}
constexpr void operator()(uint8_t byte) noexcept
{
TOML_ASSERT_ASSUME(!error());
const auto type = state_table[byte];
codepoint = static_cast<char32_t>(has_code_point() ? (uint_least32_t{ 255u } >> type) & byte
: (byte & uint_least32_t{ 63u })
| (static_cast<uint_least32_t>(codepoint) << 6));
state = state_table[state + uint_least32_t{ 256u } + type];
}
TOML_ALWAYS_INLINE
constexpr void operator()(char c) noexcept
{
operator()(static_cast<uint8_t>(c));
}
TOML_ALWAYS_INLINE
constexpr void reset() noexcept
{
state = {};
}
};
TOML_PURE_GETTER
TOML_ATTR(nonnull)
bool is_ascii(const char* str, size_t len) noexcept;
}
TOML_IMPL_NAMESPACE_END;
#if TOML_GCC && TOML_GCC < 9
#pragma GCC pop_options
#endif
/// \endcond
#include "header_end.h"