diff --git a/CHANGELOG.md b/CHANGELOG.md index 0733495..4a0e1cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,9 @@ code changes at callsites or in build systems are indicated with ⚠️. - fixed missing `TOML_API` on interfaces - fixed parser not correctly round-tripping the format of binary and octal integers in some cases - fixed strong exception guarantee edge-cases in `toml::table` and `toml::array` +- fixed some incorrect unicode scalar sequence transformations (#125) (@moorereason) +- fixed extended-precision fractional times causing parse error instead of truncating per the spec (#127) (@moorereason) +- fixed some non-spec vertical whitespace being accepted as line breaks (#128) (@moorereason) #### Additions: - added `operator->` to `toml::value` for class types @@ -48,6 +51,7 @@ code changes at callsites or in build systems are indicated with ⚠️. - added `toml::format_flags::allow_hexadecimal_integers` - added `toml::format_flags::allow_octal_integers` - added `toml::format_flags::allow_real_tabs_in_strings` +- added `toml::format_flags::allow_unicode_strings` - added `toml::format_flags::indent_array_elements` - added `toml::format_flags::indent_sub_tables` - added `toml::format_flags::quote_infinities_and_nans` diff --git a/examples/error_printer.cpp b/examples/error_printer.cpp index 92b582c..9ac99f3 100644 --- a/examples/error_printer.cpp +++ b/examples/error_printer.cpp @@ -6,7 +6,7 @@ // This example shows the error messages the library produces by forcing a set of specific parsing // failures and printing their results. 
-#include "examples.hpp" +#include "examples.h" #define TOML_EXCEPTIONS 0 #define TOML_ENABLE_UNRELEASED_FEATURES 0 @@ -17,10 +17,11 @@ using namespace std::string_view_literals; namespace { inline constexpr auto invalid_parses = std::array{ - "########## comments"sv, + "########## comments and whitespace"sv, "# bar\rkek"sv, "# bar\bkek"sv, "# \xf1\x63"sv, + "# val1 = 1\fval2 = 2"sv, "########## inline tables"sv, "val = {,}"sv, diff --git a/examples/error_printer.vcxproj b/examples/error_printer.vcxproj index d866aa5..a7ad5b6 100644 --- a/examples/error_printer.vcxproj +++ b/examples/error_printer.vcxproj @@ -55,7 +55,7 @@ - + \ No newline at end of file diff --git a/examples/examples.hpp b/examples/examples.h similarity index 100% rename from examples/examples.hpp rename to examples/examples.h diff --git a/examples/parse_benchmark.cpp b/examples/parse_benchmark.cpp index 836df92..6d97471 100644 --- a/examples/parse_benchmark.cpp +++ b/examples/parse_benchmark.cpp @@ -5,7 +5,7 @@ // This example is just a short-n-shiny benchmark. -#include "examples.hpp" +#include "examples.h" #include using namespace std::string_view_literals; diff --git a/examples/parse_benchmark.vcxproj b/examples/parse_benchmark.vcxproj index 4621812..927aa4e 100644 --- a/examples/parse_benchmark.vcxproj +++ b/examples/parse_benchmark.vcxproj @@ -56,7 +56,7 @@ - + \ No newline at end of file diff --git a/examples/simple_parser.cpp b/examples/simple_parser.cpp index 5784725..1e369ef 100644 --- a/examples/simple_parser.cpp +++ b/examples/simple_parser.cpp @@ -5,7 +5,7 @@ // This example demonstrates how to parse TOML from a file or stdin and re-serialize it (print it out) to stdout. 
-#include "examples.hpp" +#include "examples.h" #define TOML_ENABLE_UNRELEASED_FEATURES 1 #include diff --git a/examples/simple_parser.vcxproj b/examples/simple_parser.vcxproj index 0cbea0b..14d2b0f 100644 --- a/examples/simple_parser.vcxproj +++ b/examples/simple_parser.vcxproj @@ -56,7 +56,7 @@ - + \ No newline at end of file diff --git a/examples/toml_generator.cpp b/examples/toml_generator.cpp index d1809ae..2825ce8 100644 --- a/examples/toml_generator.cpp +++ b/examples/toml_generator.cpp @@ -5,7 +5,7 @@ // This example demonstrates the use of some more advanced features to generate a tree of random TOML data. -#include "examples.hpp" +#include "examples.h" #define TOML_ENABLE_PARSER 0 #include diff --git a/examples/toml_generator.vcxproj b/examples/toml_generator.vcxproj index b902a40..3b4e0b2 100644 --- a/examples/toml_generator.vcxproj +++ b/examples/toml_generator.vcxproj @@ -56,7 +56,7 @@ - + \ No newline at end of file diff --git a/examples/toml_to_json_transcoder.cpp b/examples/toml_to_json_transcoder.cpp index b43287d..d785344 100644 --- a/examples/toml_to_json_transcoder.cpp +++ b/examples/toml_to_json_transcoder.cpp @@ -5,7 +5,7 @@ // This example demonstrates how to use the toml::json_formatter to re-serialize TOML data as JSON. 
-#include "examples.hpp" +#include "examples.h" #define TOML_ENABLE_UNRELEASED_FEATURES 1 #include diff --git a/examples/toml_to_json_transcoder.vcxproj b/examples/toml_to_json_transcoder.vcxproj index f6d3984..6ff8538 100644 --- a/examples/toml_to_json_transcoder.vcxproj +++ b/examples/toml_to_json_transcoder.vcxproj @@ -56,7 +56,7 @@ - + \ No newline at end of file diff --git a/include/toml++/impl/formatter.h b/include/toml++/impl/formatter.h index d4be068..6b85b85 100644 --- a/include/toml++/impl/formatter.h +++ b/include/toml++/impl/formatter.h @@ -106,6 +106,24 @@ TOML_IMPL_NAMESPACE_START return !!(config_.flags & format_flags::allow_literal_strings); } + TOML_PURE_INLINE_GETTER + bool multi_line_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_multi_line_strings); + } + + TOML_PURE_INLINE_GETTER + bool real_tabs_in_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_real_tabs_in_strings); + } + + TOML_PURE_INLINE_GETTER + bool unicode_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_unicode_strings); + } + TOML_API void attach(std::ostream& stream) noexcept; diff --git a/include/toml++/impl/formatter.inl b/include/toml++/impl/formatter.inl index c35e1e5..ce26391 100644 --- a/include/toml++/impl/formatter.inl +++ b/include/toml++/impl/formatter.inl @@ -14,15 +14,29 @@ #include "formatter.h" #include "print_to_stream.h" -#include "utf8.h" #include "value.h" #include "table.h" #include "array.h" +#include "unicode.h" #include "parse_result.h" #include "header_start.h" TOML_IMPL_NAMESPACE_START { + enum class formatted_string_traits : unsigned + { + none, + line_breaks = 1u << 0, // \n + tabs = 1u << 1, // \t + control_chars = 1u << 2, // also includes non-ascii vertical whitespace + single_quotes = 1u << 3, + non_bare = 1u << 4, // anything not satisfying "is bare key character" + non_ascii = 1u << 5, // any codepoint >= 128 + + all = (non_ascii << 1u) - 1u 
+ }; + TOML_MAKE_FLAGS(formatted_string_traits); + TOML_EXTERNAL_LINKAGE formatter::formatter(const node* source_node, const parse_result* source_pr, @@ -101,70 +115,236 @@ TOML_IMPL_NAMESPACE_START TOML_EXTERNAL_LINKAGE void formatter::print_string(std::string_view str, bool allow_multi_line, bool allow_bare) { - auto literal = literal_strings_allowed(); if (str.empty()) { - print_to_stream(*stream_, literal ? "''"sv : "\"\""sv); - naked_newline_ = false; + print_unformatted(literal_strings_allowed() ? "''"sv : "\"\""sv); return; } - bool multi_line = allow_multi_line && !!(config_.flags & format_flags::allow_multi_line_strings); - const bool treat_raw_tab_as_control_char = !(config_.flags & format_flags::allow_real_tabs_in_strings); - if (multi_line || literal || treat_raw_tab_as_control_char || allow_bare) + // pre-scan the string to determine how we should output it + formatted_string_traits traits = {}; + + if (!allow_bare) + traits |= formatted_string_traits::non_bare; + bool unicode_allowed = unicode_strings_allowed(); + + // ascii fast path + if (is_ascii(str.data(), str.length())) { - utf8_decoder decoder; - bool has_line_breaks = false; - bool has_control_chars = false; - bool has_single_quotes = false; - for (size_t i = 0; i < str.length(); i++) + for (auto c : str) { - decoder(static_cast(str[i])); - if (decoder.error()) + switch (c) { - has_line_breaks = false; - has_control_chars = true; // force "" - has_single_quotes = true; - allow_bare = false; - break; - } - else if (decoder.has_code_point()) - { - if (decoder.codepoint == U'\n') + case '\n': traits |= formatted_string_traits::line_breaks; break; + case '\t': traits |= formatted_string_traits::tabs; break; + case '\'': traits |= formatted_string_traits::single_quotes; break; + default: { - has_line_breaks = true; - if (!multi_line) - has_control_chars = true; + if (is_control_character(c)) + traits |= formatted_string_traits::control_chars; + + if (!is_ascii_bare_key_character(static_cast(c))) + 
traits |= formatted_string_traits::non_bare; + break; } - else if (is_nontab_control_character(decoder.codepoint) - || (treat_raw_tab_as_control_char && decoder.codepoint == U'\t') - || is_vertical_whitespace(decoder.codepoint)) - has_control_chars = true; - else if (decoder.codepoint == U'\'') - has_single_quotes = true; - if (allow_bare) - allow_bare = is_bare_key_character(decoder.codepoint); } - if (has_line_breaks && has_control_chars && has_single_quotes && !allow_bare) + static constexpr auto all_ascii_traits = + formatted_string_traits::all & ~formatted_string_traits::non_ascii; + if (traits == all_ascii_traits) break; } - multi_line = multi_line && has_line_breaks; - literal = literal && !has_control_chars && !(!multi_line && has_single_quotes); } - if (allow_bare) - print_to_stream(*stream_, str); - else if (literal) - print_to_stream_bookended(*stream_, str, multi_line ? "'''"sv : "'"sv); + // unicode slow path else { - const auto quot = multi_line ? R"(""")"sv : R"(")"sv; - print_to_stream(*stream_, quot); - print_to_stream_with_escapes(*stream_, str); - print_to_stream(*stream_, quot); + traits |= formatted_string_traits::non_ascii; + utf8_decoder decoder; + + // if the unicode is malformed just treat the string as a single-line non-literal and + // escape all non-ascii characters (to ensure round-tripping and help with diagnostics) + const auto bad_unicode = [&]() noexcept + { + traits &= ~formatted_string_traits::line_breaks; + traits |= formatted_string_traits::control_chars | formatted_string_traits::non_bare; + unicode_allowed = false; + }; + + for (auto c : str) + { + decoder(c); + + if TOML_UNLIKELY(decoder.error()) + { + bad_unicode(); + break; + } + + if (!decoder.has_code_point()) + continue; + + switch (decoder.codepoint) + { + case U'\n': traits |= formatted_string_traits::line_breaks; break; + case U'\t': traits |= formatted_string_traits::tabs; break; + case U'\'': traits |= formatted_string_traits::single_quotes; break; + default: + { + 
if (is_control_character(decoder.codepoint) + || is_non_ascii_vertical_whitespace(decoder.codepoint)) + traits |= formatted_string_traits::control_chars; + + if (!is_bare_key_character(decoder.codepoint)) + traits |= formatted_string_traits::non_bare; + break; + } + } + } + + if (decoder.needs_more_input()) + bad_unicode(); } - naked_newline_ = false; + + // if the string meets the requirements of being 'bare' we can emit a bare string + // (bare strings are composed of letters and numbers; no whitespace, control chars, quotes, etc) + if (!(traits & formatted_string_traits::non_bare) + && (!(traits & formatted_string_traits::non_ascii) || unicode_allowed)) + { + print_unformatted(str); + return; + } + + // determine if this should be a multi-line string (triple-quotes) + const auto multi_line = allow_multi_line // + && multi_line_strings_allowed() // + && !!(traits & formatted_string_traits::line_breaks); + + // determine if this should be a literal string (single-quotes with no escaping) + const auto literal = literal_strings_allowed() // + && !(traits & formatted_string_traits::control_chars) // + && (!(traits & formatted_string_traits::single_quotes) || multi_line) // + && (!(traits & formatted_string_traits::tabs) || real_tabs_in_strings_allowed()) // + && (!(traits & formatted_string_traits::non_ascii) || unicode_allowed); + + // literal strings (single quotes, no escape codes) + if (literal) + { + const auto quot = multi_line ? R"(''')"sv : R"(')"sv; + print_unformatted(quot); + print_unformatted(str); + print_unformatted(quot); + return; + } + + // anything from here down is a non-literal string, so requires iteration and escaping. + print_unformatted(multi_line ? 
R"(""")"sv : R"(")"sv); + + const auto real_tabs_allowed = real_tabs_in_strings_allowed(); + + // ascii fast path + if (!(traits & formatted_string_traits::non_ascii)) + { + for (auto c : str) + { + switch (c) + { + case '"': print_to_stream(*stream_, R"(\")"sv); break; + case '\\': print_to_stream(*stream_, R"(\\)"sv); break; + case '\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break; + case '\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break; + case '\n': print_to_stream(*stream_, multi_line ? "\n"sv : R"(\n)"sv); break; + default: + { + // control characters from lookup table + if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F') + print_to_stream(*stream_, control_char_escapes[c]); + + // regular characters + else + print_to_stream(*stream_, c); + } + } + } + } + + // unicode slow path + else + { + utf8_decoder decoder; + const char* cp_start = str.data(); + const char* cp_end = cp_start; + for (auto c : str) + { + decoder(c); + cp_end++; + + // if the decoder encounters malformed unicode, emit the pending raw bytes as \u00XX escapes and reset the decoder + if (decoder.error()) + { + while (cp_start != cp_end) + { + print_to_stream(*stream_, R"(\u00)"sv); + print_to_stream(*stream_, + static_cast(*cp_start), + value_flags::format_as_hexadecimal, + 2); + cp_start++; + } + decoder.reset(); + continue; + } + + if (!decoder.has_code_point()) + continue; + + switch (decoder.codepoint) + { + case U'"': print_to_stream(*stream_, R"(\")"sv); break; + case U'\\': print_to_stream(*stream_, R"(\\)"sv); break; + case U'\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break; + case U'\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break; + case U'\n': print_to_stream(*stream_, multi_line ? 
"\n"sv : R"(\n)"sv); break; + default: + { + // control characters from lookup table + if TOML_UNLIKELY(decoder.codepoint <= U'\x1F') + print_to_stream(*stream_, + control_char_escapes[static_cast(decoder.codepoint)]); + + // escaped unicode characters + else if (decoder.codepoint > U'\x7F' + && (!unicode_allowed || is_non_ascii_vertical_whitespace(decoder.codepoint))) + { + if (static_cast(decoder.codepoint) > 0xFFFFu) + { + print_to_stream(*stream_, R"(\U)"sv); + print_to_stream(*stream_, + static_cast(decoder.codepoint), + value_flags::format_as_hexadecimal, + 8); + } + else + { + print_to_stream(*stream_, R"(\u)"sv); + print_to_stream(*stream_, + static_cast(decoder.codepoint), + value_flags::format_as_hexadecimal, + 4); + } + } + + // regular characters + else + print_to_stream(*stream_, cp_start, static_cast(cp_end - cp_start)); + } + } + + cp_start = cp_end; + } + } + + print_unformatted(multi_line ? R"(""")"sv : R"(")"sv); } TOML_EXTERNAL_LINKAGE diff --git a/include/toml++/impl/forward_declarations.h b/include/toml++/impl/forward_declarations.h index 710c7cd..d6d25e9 100644 --- a/include/toml++/impl/forward_declarations.h +++ b/include/toml++/impl/forward_declarations.h @@ -119,7 +119,7 @@ TOML_IMPL_NAMESPACE_START // clang-format off - inline constexpr std::string_view low_character_escape_table[] = + inline constexpr std::string_view control_char_escapes[] = { "\\u0000"sv, "\\u0001"sv, @@ -309,20 +309,23 @@ TOML_NAMESPACE_START // abi namespace /// \brief Allow real tab characters in string literals (as opposed to the escaped form `\t`). allow_real_tabs_in_strings = (1ull << 4), + /// \brief Allow non-ASCII characters in strings (as opposed to their escaped form, e.g. `\u00DA`). + allow_unicode_strings = (1ull << 5), + /// \brief Allow integers with #value_flags::format_as_binary to be emitted as binary. 
- allow_binary_integers = (1ull << 5), + allow_binary_integers = (1ull << 6), /// \brief Allow integers with #value_flags::format_as_octal to be emitted as octal. - allow_octal_integers = (1ull << 6), + allow_octal_integers = (1ull << 7), /// \brief Allow integers with #value_flags::format_as_hexadecimal to be emitted as hexadecimal. - allow_hexadecimal_integers = (1ull << 7), + allow_hexadecimal_integers = (1ull << 8), /// \brief Apply indentation to tables nested within other tables/arrays. - indent_sub_tables = (1ull << 8), + indent_sub_tables = (1ull << 9), /// \brief Apply indentation to array elements when the array is forced to wrap over multiple lines. - indent_array_elements = (1ull << 9), + indent_array_elements = (1ull << 10), /// \brief Combination mask of all indentation-enabling flags. indentation = indent_sub_tables | indent_array_elements, diff --git a/include/toml++/impl/json_formatter.h b/include/toml++/impl/json_formatter.h index 784f50f..e169cd2 100644 --- a/include/toml++/impl/json_formatter.h +++ b/include/toml++/impl/json_formatter.h @@ -75,6 +75,7 @@ TOML_NAMESPACE_START /// \brief The default flags for a json_formatter. static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::quote_infinities_and_nans // + | format_flags::allow_unicode_strings // | format_flags::indentation; /// \brief Constructs a JSON formatter and binds it to a TOML object. diff --git a/include/toml++/impl/key.h b/include/toml++/impl/key.h index 0a43d5a..0f79b33 100644 --- a/include/toml++/impl/key.h +++ b/include/toml++/impl/key.h @@ -35,6 +35,12 @@ TOML_NAMESPACE_START source_region source_; public: + /// A const iterator for iterating over the characters in the key. + using const_iterator = const char*; + + /// A const iterator for iterating over the characters in the key. + using iterator = const_iterator; + /// \brief Default constructor. 
TOML_NODISCARD_CTOR key() noexcept = default; @@ -287,6 +293,40 @@ TOML_NAMESPACE_START /// @} + /// \name Iterators + /// @{ + + TOML_PURE_INLINE_GETTER + const_iterator begin() const noexcept + { + return key_.data(); + } + + TOML_PURE_INLINE_GETTER + const_iterator end() const noexcept + { + return key_.data() + key_.length(); + } + + /// @} + + /// \name Iterators (ADL) + /// @{ + + TOML_PURE_INLINE_GETTER + friend const_iterator begin(const key& k) noexcept + { + return k.begin(); + } + + TOML_PURE_INLINE_GETTER + friend const_iterator end(const key& k) noexcept + { + return k.end(); + } + + /// @} + /// \brief Prints the key's underlying string out to the stream. friend std::ostream& operator<<(std::ostream& lhs, const key& rhs) { diff --git a/include/toml++/impl/parser.inl b/include/toml++/impl/parser.inl index a25100e..66137d4 100644 --- a/include/toml++/impl/parser.inl +++ b/include/toml++/impl/parser.inl @@ -12,15 +12,15 @@ //# }} #if TOML_ENABLE_PARSER -#include "std_optional.h" #include "parser.h" +#include "std_optional.h" #include "source_region.h" #include "parse_error.h" -#include "utf8.h" #include "date_time.h" #include "value.h" #include "array.h" #include "table.h" +#include "unicode.h" TOML_DISABLE_WARNINGS; #include #include @@ -34,7 +34,6 @@ TOML_DISABLE_WARNINGS; #include #endif TOML_ENABLE_WARNINGS; -#include "simd.h" #include "header_start.h" //#--------------------------------------------------------------------------------------------------------------------- @@ -43,48 +42,6 @@ TOML_ENABLE_WARNINGS; TOML_ANON_NAMESPACE_START { - template - TOML_PURE_GETTER - TOML_ATTR(nonnull) - TOML_INTERNAL_LINKAGE - bool is_ascii(const T* str, size_t size) noexcept - { - static_assert(sizeof(T) == 1); - - const T* const end = str + size; - -#if TOML_HAS_SSE2 && (128 % CHAR_BIT) == 0 - { - constexpr size_t chars_per_vector = 128 / CHAR_BIT; - - if (const size_t simdable = size - (size % chars_per_vector)) - { - __m128i mask = _mm_setzero_si128(); - for 
(const T* const e = str + simdable; str < e; str += chars_per_vector) - { - const __m128i current_bytes = _mm_loadu_si128(reinterpret_cast(str)); - mask = _mm_or_si128(mask, current_bytes); - } - const __m128i has_error = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); - -#if TOML_HAS_SSE4_1 - if (!_mm_testz_si128(has_error, has_error)) - return false; -#else - if (_mm_movemask_epi8(_mm_cmpeq_epi8(has_error, _mm_setzero_si128())) != 0xFFFF) - return false; -#endif - } - } -#endif - - for (; str < end; str++) - if (static_cast(*str) > 127u) - return false; - - return true; - } - template class utf8_byte_stream; @@ -363,7 +320,7 @@ TOML_ANON_NAMESPACE_START auto& cp = codepoints_.buffer[i]; cp.position = next_pos_; - if (impl::is_vertical_whitespace_excl_cr(cp)) + if (cp == U'\n') { next_pos_.line++; next_pos_.column = source_index{ 1 }; @@ -374,7 +331,7 @@ TOML_ANON_NAMESPACE_START }; // decide whether we need to use the UTF-8 decoder or if we can treat this block as plain ASCII - const auto ascii_fast_path = !decoder_.needs_more_input() && is_ascii(raw_bytes, raw_bytes_read); + const auto ascii_fast_path = !decoder_.needs_more_input() && impl::is_ascii(raw_bytes, raw_bytes_read); // ASCII fast-path if (ascii_fast_path) @@ -726,7 +683,7 @@ TOML_ANON_NAMESPACE_START std::string_view to_sv(const utf8_codepoint& cp) noexcept { if TOML_UNLIKELY(cp.value <= U'\x1F') - return impl::low_character_escape_table[cp.value]; + return impl::control_char_escapes[cp.value]; else if TOML_UNLIKELY(cp.value == U'\x7F') return "\\u007F"sv; else @@ -1183,8 +1140,9 @@ TOML_IMPL_NAMESPACE_START { return_if_error_or_eof({}); - if (!is_vertical_whitespace(*cp)) - return false; + if TOML_UNLIKELY(is_match(*cp, U'\v', U'\f')) + set_error_and_return_default( + R"(vertical tabs '\v' and form-feeds '\f' are not legal whitespace in TOML.)"sv); if (*cp == U'\r') { @@ -1192,10 +1150,14 @@ TOML_IMPL_NAMESPACE_START if (is_eof()) return true; // eof after \r is 'fine' - else if (*cp != U'\n') + + if (*cp 
!= U'\n') set_error_and_return_default("expected \\n, saw '"sv, to_sv(*cp), "'"sv); } - advance_and_return_if_error({}); // skip \n (or other single-character line ending) + else if (*cp != U'\n') + return false; + + advance_and_return_if_error({}); // skip \n return true; } @@ -1205,7 +1167,7 @@ TOML_IMPL_NAMESPACE_START do { - if (is_vertical_whitespace(*cp)) + if (is_ascii_vertical_whitespace(*cp)) return consume_line_break(); else advance(); @@ -1349,7 +1311,7 @@ TOML_IMPL_NAMESPACE_START continue; } - bool skipped_escaped_codepoint = false; + bool skip_escaped_codepoint = true; assert_not_eof(); switch (const auto escaped_codepoint = *cp) { @@ -1373,9 +1335,9 @@ TOML_IMPL_NAMESPACE_START case U'u': [[fallthrough]]; case U'U': { - push_parse_scope("unicode scalar escape sequence"sv); + push_parse_scope("unicode scalar sequence"sv); advance_and_return_if_error_or_eof({}); - skipped_escaped_codepoint = true; + skip_escaped_codepoint = false; uint32_t place_value = escaped_codepoint == U'U' ? 0x10000000u : (escaped_codepoint == U'u' ? 
0x1000u : 0x10u); @@ -1395,25 +1357,28 @@ TOML_IMPL_NAMESPACE_START "unicode surrogates (U+D800 - U+DFFF) are explicitly prohibited"sv); else if (sequence_value > 0x10FFFFu) set_error_and_return_default("values greater than U+10FFFF are invalid"sv); - else if (sequence_value <= 0x7Fu) // ascii - str += static_cast(sequence_value & 0x7Fu); - else if (sequence_value <= 0x7FFu) + + if (sequence_value < 0x80) { - str += static_cast(0xC0u | ((sequence_value >> 6) & 0x1Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast(sequence_value); } - else if (sequence_value <= 0xFFFFu) + else if (sequence_value < 0x800u) { - str += static_cast(0xE0u | ((sequence_value >> 12) & 0x0Fu)); - str += static_cast(0x80u | ((sequence_value >> 6) & 0x1Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast((sequence_value >> 6) | 0xC0u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); } - else + else if (sequence_value < 0x10000u) { - str += static_cast(0xF0u | ((sequence_value >> 18) & 0x07u)); - str += static_cast(0x80u | ((sequence_value >> 12) & 0x3Fu)); - str += static_cast(0x80u | ((sequence_value >> 6) & 0x3Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast((sequence_value >> 12) | 0xE0u); + str += static_cast(((sequence_value >> 6) & 0x3Fu) | 0x80u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); + } + else if (sequence_value < 0x110000u) + { + str += static_cast((sequence_value >> 18) | 0xF0u); + str += static_cast(((sequence_value >> 12) & 0x3Fu) | 0x80u); + str += static_cast(((sequence_value >> 6) & 0x3Fu) | 0x80u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); } break; } @@ -1422,8 +1387,7 @@ TOML_IMPL_NAMESPACE_START default: set_error_and_return_default("unknown escape sequence '\\"sv, to_sv(*cp), "'"sv); } - // skip the escaped character - if (!skipped_escaped_codepoint) + if (skip_escaped_codepoint) advance_and_return_if_error_or_eof({}); } else @@ -1492,7 
+1456,7 @@ TOML_IMPL_NAMESPACE_START } // handle line endings in multi-line mode - if (multi_line && is_vertical_whitespace(*cp)) + if (multi_line && is_ascii_vertical_whitespace(*cp)) { consume_line_break(); return_if_error({}); @@ -1608,7 +1572,7 @@ TOML_IMPL_NAMESPACE_START } // handle line endings in multi-line mode - if (multi_line && is_vertical_whitespace(*cp)) + if (multi_line && is_ascii_vertical_whitespace(*cp)) { consume_line_break(); return_if_error({}); @@ -2275,7 +2239,8 @@ TOML_IMPL_NAMESPACE_START TOML_ASSERT_ASSUME(is_decimal_digit(*cp)); push_parse_scope("time"sv); - static constexpr size_t max_digits = 9; + static constexpr size_t max_digits = 64; // far more than necessary but needed to allow fractional + // millisecond truncation per the spec uint32_t digits[max_digits]; // "HH" @@ -2344,15 +2309,14 @@ TOML_IMPL_NAMESPACE_START else if (!part_of_datetime && !is_value_terminator(*cp)) set_error_and_return_default("expected value-terminator, saw '"sv, to_sv(*cp), "'"sv); } - uint32_t value = 0u; uint32_t place = 1u; - for (auto i = digit_count; i-- > 0u;) + for (auto i = impl::min(digit_count, 9u); i-- > 0u;) { value += digits[i] * place; place *= 10u; } - for (auto i = digit_count; i < max_digits; i++) // implicit zeros + for (auto i = digit_count; i < 9u; i++) // implicit zeros value *= 10u; time.nanosecond = value; return time; diff --git a/include/toml++/impl/print_to_stream.h b/include/toml++/impl/print_to_stream.h index 65986eb..b1a0bad 100644 --- a/include/toml++/impl/print_to_stream.h +++ b/include/toml++/impl/print_to_stream.h @@ -30,28 +30,28 @@ TOML_IMPL_NAMESPACE_START void print_to_stream(std::ostream&, char); TOML_API - void print_to_stream(std::ostream&, int8_t, value_flags = {}); + void print_to_stream(std::ostream&, int8_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, int16_t, value_flags = {}); + void print_to_stream(std::ostream&, int16_t, value_flags = {}, size_t min_digits = 0); 
TOML_API - void print_to_stream(std::ostream&, int32_t, value_flags = {}); + void print_to_stream(std::ostream&, int32_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, int64_t, value_flags = {}); + void print_to_stream(std::ostream&, int64_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint8_t, value_flags = {}); + void print_to_stream(std::ostream&, uint8_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint16_t, value_flags = {}); + void print_to_stream(std::ostream&, uint16_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint32_t, value_flags = {}); + void print_to_stream(std::ostream&, uint32_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint64_t, value_flags = {}); + void print_to_stream(std::ostream&, uint64_t, value_flags = {}, size_t min_digits = 0); TOML_API void print_to_stream(std::ostream&, float, value_flags = {}); @@ -111,24 +111,6 @@ TOML_IMPL_NAMESPACE_START #endif - template - inline void print_to_stream_with_escapes(std::ostream & stream, const T& str) - { - for (auto c : str) - { - if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F') - print_to_stream(stream, low_character_escape_table[c]); - else if TOML_UNLIKELY(c == '\x7F') - print_to_stream(stream, "\\u007F"sv); - else if TOML_UNLIKELY(c == '"') - print_to_stream(stream, "\\\""sv); - else if TOML_UNLIKELY(c == '\\') - print_to_stream(stream, "\\\\"sv); - else - print_to_stream(stream, c); - } - } - template inline void print_to_stream_bookended(std::ostream & stream, const T& val, const U& bookend) { diff --git a/include/toml++/impl/print_to_stream.inl b/include/toml++/impl/print_to_stream.inl index e3d9397..195d40d 100644 --- a/include/toml++/impl/print_to_stream.inl +++ b/include/toml++/impl/print_to_stream.inl @@ -69,11 +69,16 @@ TOML_ANON_NAMESPACE_START template 
TOML_INTERNAL_LINKAGE - void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {}) + void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {}, size_t min_digits = 0) { if (!val) { - stream.put('0'); + if (!min_digits) + min_digits = 1; + + for (size_t i = 0; i < min_digits; i++) + stream.put('0'); + return; } @@ -82,7 +87,7 @@ TOML_ANON_NAMESPACE_START format &= value_flags_mask; int base = 10; - if (format != value_flags::none && val >= T{}) + if (format != value_flags::none && val > T{}) { switch (format) { @@ -98,6 +103,8 @@ TOML_ANON_NAMESPACE_START char buf[(sizeof(T) * CHAR_BIT)]; const auto res = std::to_chars(buf, buf + sizeof(buf), val, base); const auto len = static_cast(res.ptr - buf); + for (size_t i = len; i < min_digits; i++) + stream.put('0'); if (base == 16) { for (size_t i = 0; i < len; i++) @@ -111,12 +118,16 @@ TOML_ANON_NAMESPACE_START using unsigned_type = std::conditional_t<(sizeof(T) > sizeof(unsigned)), std::make_unsigned_t, unsigned>; using cast_type = std::conditional_t, std::make_signed_t, unsigned_type>; - if TOML_UNLIKELY(format == value_flags::format_as_binary) + if (base == 2) { + const auto len = sizeof(T) * CHAR_BIT; + for (size_t i = len; i < min_digits; i++) + stream.put('0'); + bool found_one = false; const auto v = static_cast(val); - unsigned_type mask = unsigned_type{ 1 } << (sizeof(unsigned_type) * CHAR_BIT - 1u); - for (unsigned i = 0; i < sizeof(unsigned_type) * CHAR_BIT; i++) + unsigned_type mask = unsigned_type{ 1 } << (len - 1u); + for (size_t i = 0; i < len; i++) { if ((v & mask)) { @@ -133,6 +144,8 @@ TOML_ANON_NAMESPACE_START std::ostringstream ss; ss.imbue(std::locale::classic()); ss << std::uppercase << std::setbase(base); + if (min_digits) + ss << std::setfill('0') << std::setw(static_cast(min_digits)); ss << static_cast(val); const auto str = std::move(ss).str(); impl::print_to_stream(stream, str); @@ -194,31 +207,6 @@ TOML_ANON_NAMESPACE_START default: 
TOML_UNREACHABLE; } } - - template - TOML_INTERNAL_LINKAGE - void print_integer_leftpad_zeros(std::ostream & stream, T val, size_t min_digits) - { -#if TOML_INT_CHARCONV - - char buf[charconv_buffer_length]; - const auto res = std::to_chars(buf, buf + sizeof(buf), val); - const auto len = static_cast(res.ptr - buf); - for (size_t i = len; i < min_digits; i++) - stream.put('0'); - impl::print_to_stream(stream, buf, static_cast(res.ptr - buf)); - -#else - - std::ostringstream ss; - ss.imbue(std::locale::classic()); - using cast_type = std::conditional_t, int64_t, uint64_t>; - ss << std::setfill('0') << std::setw(static_cast(min_digits)) << static_cast(val); - const auto str = std::move(ss).str(); - impl::print_to_stream(stream, str); - -#endif - } } TOML_ANON_NAMESPACE_END; @@ -250,51 +238,51 @@ TOML_IMPL_NAMESPACE_START } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int8_t val, value_flags format) + void print_to_stream(std::ostream & stream, int8_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int16_t val, value_flags format) + void print_to_stream(std::ostream & stream, int16_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int32_t val, value_flags format) + void print_to_stream(std::ostream & stream, int32_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int64_t val, value_flags format) + void 
print_to_stream(std::ostream & stream, int64_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint8_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint8_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint16_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint16_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint32_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint32_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint64_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint64_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE @@ -318,21 +306,21 @@ TOML_IMPL_NAMESPACE_START TOML_EXTERNAL_LINKAGE void print_to_stream(std::ostream & stream, const toml::date& val) { - print_integer_leftpad_zeros(stream, val.year, 4u); + print_to_stream(stream, val.year, {}, 4); stream.put('-'); - print_integer_leftpad_zeros(stream, val.month, 2u); + 
print_to_stream(stream, val.month, {}, 2); stream.put('-'); - print_integer_leftpad_zeros(stream, val.day, 2u); + print_to_stream(stream, val.day, {}, 2); } TOML_EXTERNAL_LINKAGE void print_to_stream(std::ostream & stream, const toml::time& val) { - print_integer_leftpad_zeros(stream, val.hour, 2u); + print_to_stream(stream, val.hour, {}, 2); stream.put(':'); - print_integer_leftpad_zeros(stream, val.minute, 2u); + print_to_stream(stream, val.minute, {}, 2); stream.put(':'); - print_integer_leftpad_zeros(stream, val.second, 2u); + print_to_stream(stream, val.second, {}, 2); if (val.nanosecond && val.nanosecond <= 999999999u) { stream.put('.'); @@ -343,7 +331,7 @@ TOML_IMPL_NAMESPACE_START ns /= 10u; digits--; } - print_integer_leftpad_zeros(stream, ns, digits); + print_to_stream(stream, ns, {}, digits); } } @@ -367,13 +355,13 @@ TOML_IMPL_NAMESPACE_START const auto hours = mins / 60; if (hours) { - print_integer_leftpad_zeros(stream, static_cast(hours), 2u); + print_to_stream(stream, static_cast(hours), {}, 2); mins -= hours * 60; } else print_to_stream(stream, "00"sv); stream.put(':'); - print_integer_leftpad_zeros(stream, static_cast(mins), 2u); + print_to_stream(stream, static_cast(mins), {}, 2); } TOML_EXTERNAL_LINKAGE diff --git a/include/toml++/impl/toml_formatter.h b/include/toml++/impl/toml_formatter.h index affdfb8..8cadaff 100644 --- a/include/toml++/impl/toml_formatter.h +++ b/include/toml++/impl/toml_formatter.h @@ -68,8 +68,8 @@ TOML_NAMESPACE_START TOML_API void print(); - static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory flags - format_flags::none, // ignored flags + static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory + format_flags::none, // ignored "inf"sv, "-inf"sv, "nan"sv, @@ -83,6 +83,7 @@ TOML_NAMESPACE_START static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::allow_literal_strings // | format_flags::allow_multi_line_strings // 
+ | format_flags::allow_unicode_strings // | format_flags::allow_real_tabs_in_strings // | format_flags::allow_binary_integers // | format_flags::allow_octal_integers // diff --git a/include/toml++/impl/toml_formatter.inl b/include/toml++/impl/toml_formatter.inl index a167065..d2196a4 100644 --- a/include/toml++/impl/toml_formatter.inl +++ b/include/toml++/impl/toml_formatter.inl @@ -14,10 +14,10 @@ #include "toml_formatter.h" #include "print_to_stream.h" -#include "utf8.h" #include "value.h" #include "table.h" #include "array.h" +#include "unicode.h" #include "header_start.h" TOML_DISABLE_ARITHMETIC_WARNINGS; diff --git a/include/toml++/impl/utf8.h b/include/toml++/impl/unicode.h similarity index 98% rename from include/toml++/impl/utf8.h rename to include/toml++/impl/unicode.h index a66119b..a4cda53 100644 --- a/include/toml++/impl/utf8.h +++ b/include/toml++/impl/unicode.h @@ -102,15 +102,21 @@ TOML_IMPL_NAMESPACE_START } TOML_CONST_GETTER - constexpr bool is_vertical_whitespace(char32_t c) noexcept + constexpr bool is_ascii_vertical_whitespace(char32_t c) noexcept { - return (U'\n' <= c && c <= U'\r') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + return U'\n' <= c && c <= U'\r'; } TOML_CONST_GETTER - constexpr bool is_vertical_whitespace_excl_cr(char32_t c) noexcept + constexpr bool is_non_ascii_vertical_whitespace(char32_t c) noexcept { - return (U'\n' <= c && c <= U'\f') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + return (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + } + + TOML_CONST_GETTER + constexpr bool is_vertical_whitespace(char32_t c) noexcept + { + return is_ascii_vertical_whitespace(c) || is_non_ascii_vertical_whitespace(c); } TOML_CONST_GETTER @@ -122,6 +128,11 @@ TOML_IMPL_NAMESPACE_START TOML_CONST_GETTER constexpr bool is_ascii_bare_key_character(char32_t c) noexcept { +#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) + if (c == U'+') + return true; +#endif + if (c < U'-' || c > U'z') return false; @@ 
-861,8 +872,7 @@ TOML_IMPL_NAMESPACE_START constexpr bool is_bare_key_character(char32_t c) noexcept { return is_ascii_bare_key_character(c) -#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys) - || c == U'+' // +#if TOML_LANG_UNRELEASED // toml/issues/687 (unicode bare keys) || is_non_ascii_bare_key_character(c) #endif ; @@ -874,6 +884,12 @@ TOML_IMPL_NAMESPACE_START return is_whitespace(c) || c == U']' || c == U'}' || c == U',' || c == U'#'; } + TOML_CONST_GETTER + constexpr bool is_control_character(char c) noexcept + { + return c <= '\u001F' || c == '\u007F'; + } + TOML_CONST_GETTER constexpr bool is_control_character(char32_t c) noexcept { @@ -949,12 +965,22 @@ TOML_IMPL_NAMESPACE_START state = state_table[state + uint_least32_t{ 256u } + type]; } + TOML_ALWAYS_INLINE + constexpr void operator()(char c) noexcept + { + operator()(static_cast(c)); + } + TOML_ALWAYS_INLINE constexpr void reset() noexcept { state = {}; } }; + + TOML_PURE_GETTER + TOML_ATTR(nonnull) + bool is_ascii(const char* str, size_t len) noexcept; } TOML_IMPL_NAMESPACE_END; diff --git a/include/toml++/impl/unicode.inl b/include/toml++/impl/unicode.inl new file mode 100644 index 0000000..2fda84e --- /dev/null +++ b/include/toml++/impl/unicode.inl @@ -0,0 +1,59 @@ +//# This file is a part of toml++ and is subject to the the terms of the MIT license. +//# Copyright (c) Mark Gillard +//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text. +// SPDX-License-Identifier: MIT +#pragma once + +//# {{ +#include "preprocessor.h" +#if !TOML_IMPLEMENTATION +#error This is an implementation-only header. 
+#endif +//# }} + +#include "unicode.h" +#include "simd.h" +#include "header_start.h" + +TOML_IMPL_NAMESPACE_START +{ + TOML_EXTERNAL_LINKAGE + bool is_ascii(const char* str, size_t len) noexcept + { + const char* const end = str + len; + +#if TOML_HAS_SSE2 && (128 % CHAR_BIT) == 0 + { + constexpr size_t chars_per_vector = 128u / CHAR_BIT; + + if (const size_t simdable = len - (len % chars_per_vector)) + { + __m128i mask = _mm_setzero_si128(); + for (const char* const e = str + simdable; str < e; str += chars_per_vector) + { + const __m128i current_bytes = _mm_loadu_si128(reinterpret_cast(str)); + mask = _mm_or_si128(mask, current_bytes); + } + const __m128i has_error = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); + +#if TOML_HAS_SSE4_1 + if (!_mm_testz_si128(has_error, has_error)) + return false; +#else + if (_mm_movemask_epi8(_mm_cmpeq_epi8(has_error, _mm_setzero_si128())) != 0xFFFF) + return false; +#endif + } + } +#endif + + for (; str < end; str++) + if (static_cast(*str) > 127u) + return false; + + return true; + } +} +TOML_IMPL_NAMESPACE_END; + +#include "header_end.h" diff --git a/include/toml++/impl/yaml_formatter.h b/include/toml++/impl/yaml_formatter.h index c803620..8757e05 100644 --- a/include/toml++/impl/yaml_formatter.h +++ b/include/toml++/impl/yaml_formatter.h @@ -73,6 +73,7 @@ TOML_NAMESPACE_START /// \brief The default flags for a yaml_formatter. 
static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::allow_literal_strings // + | format_flags::allow_unicode_strings // | format_flags::allow_octal_integers // | format_flags::allow_hexadecimal_integers; diff --git a/include/toml++/toml.h b/include/toml++/toml.h index 04fb81b..12c489e 100644 --- a/include/toml++/toml.h +++ b/include/toml++/toml.h @@ -39,7 +39,7 @@ TOML_DISABLE_SUGGEST_ATTR_WARNINGS; #include "impl/array.h" #include "impl/key.h" #include "impl/table.h" -#include "impl/utf8.h" +#include "impl/unicode.h" #include "impl/parse_error.h" #include "impl/parse_result.h" #include "impl/parser.h" @@ -57,6 +57,7 @@ TOML_DISABLE_SUGGEST_ATTR_WARNINGS; #include "impl/value.inl" #include "impl/array.inl" #include "impl/table.inl" +#include "impl/unicode.inl" #include "impl/parser.inl" #include "impl/formatter.inl" #include "impl/toml_formatter.inl" diff --git a/tests/formatters.cpp b/tests/formatters.cpp index e2367ad..5ce073f 100644 --- a/tests/formatters.cpp +++ b/tests/formatters.cpp @@ -24,7 +24,7 @@ namespace friend std::ostream& operator<<(std::ostream& os, const char32_printer& p) { if (p.value <= U'\x1F') - return os << '\'' << impl::low_character_escape_table[static_cast(p.value)] << '\''; + return os << '\'' << impl::control_char_escapes[static_cast(p.value)] << '\''; else if (p.value == U'\x7F') return os << "'\\u007F'"sv; else if (p.value < 127u) @@ -52,6 +52,7 @@ namespace { string_difference diff{ { 1u, 1u } }; impl::utf8_decoder a, b; + for (size_t i = 0, e = std::min(str_a.length(), str_b.length()); i < e; i++, diff.index++) { a(static_cast(str_a[i])); @@ -76,7 +77,7 @@ namespace return diff; } - if (impl::is_vertical_whitespace_excl_cr(a.codepoint)) + if (a.codepoint == U'\n') { diff.position.line++; diff.position.column = 1u; diff --git a/tests/tests.h b/tests/tests.h index 98fa4c8..8818943 100644 --- a/tests/tests.h +++ b/tests/tests.h @@ -198,9 +198,9 @@ inline bool parse_expected_value(std::string_view 
test_file, if (!decoder.has_code_point()) continue; - if (impl::is_vertical_whitespace_excl_cr(decoder.codepoint)) + if (impl::is_ascii_vertical_whitespace(decoder.codepoint)) { - if (decoder.codepoint != U'\r') + if (decoder.codepoint == U'\n') { pos.line++; pos.column = source_index{ 1 }; diff --git a/tests/user_feedback.cpp b/tests/user_feedback.cpp index b1f9e31..aaf8d61 100644 --- a/tests/user_feedback.cpp +++ b/tests/user_feedback.cpp @@ -192,4 +192,42 @@ b = [] )", 4); } + + SECTION("github/issues/125") // https://github.com/marzer/tomlplusplus/issues/125 + { + parse_expected_value(FILE_LINE_ARGS, R"("\u0800")"sv, "\xE0\xA0\x80"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u7840")"sv, "\xE7\xA1\x80"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\uAA23")"sv, "\xEA\xA8\xA3"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\uA928")"sv, "\xEA\xA4\xA8"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u9CBF")"sv, "\xE9\xB2\xBF"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u2247")"sv, "\xE2\x89\x87"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u13D9")"sv, "\xE1\x8F\x99"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u69FC")"sv, "\xE6\xA7\xBC"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u8DE5")"sv, "\xE8\xB7\xA5"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u699C")"sv, "\xE6\xA6\x9C"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u8CD4")"sv, "\xE8\xB3\x94"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u4ED4")"sv, "\xE4\xBB\x94"sv); + parse_expected_value(FILE_LINE_ARGS, R"("\u2597")"sv, "\xE2\x96\x97"sv); + } + + SECTION("github/issues/127") // https://github.com/marzer/tomlplusplus/issues/127 + { + parse_expected_value(FILE_LINE_ARGS, + "12:34:56.11122233345678"sv, + toml::time{ + 12, + 34, + 56, + 111222333u // should truncate the .45678 part + }); + } + + SECTION("github/issues/128") // https://github.com/marzer/tomlplusplus/issues/128 + { + parsing_should_fail(FILE_LINE_ARGS, "\f"sv); + parsing_should_fail(FILE_LINE_ARGS, "\v"sv); + 
parsing_should_succeed(FILE_LINE_ARGS, " "sv); + parsing_should_succeed(FILE_LINE_ARGS, "\t"sv); + parsing_should_succeed(FILE_LINE_ARGS, "\n"sv); + } } diff --git a/toml++.vcxproj b/toml++.vcxproj index 2c317a6..cc04f39 100644 --- a/toml++.vcxproj +++ b/toml++.vcxproj @@ -72,7 +72,7 @@ - + @@ -95,6 +95,7 @@ + diff --git a/toml++.vcxproj.filters b/toml++.vcxproj.filters index b14e691..14ef797 100644 --- a/toml++.vcxproj.filters +++ b/toml++.vcxproj.filters @@ -61,7 +61,7 @@ include\impl - + include\impl @@ -214,6 +214,9 @@ tools + + include\impl + diff --git a/toml-test/tt.hpp b/toml-test/tt.h similarity index 100% rename from toml-test/tt.hpp rename to toml-test/tt.h diff --git a/toml-test/tt_decoder.cpp b/toml-test/tt_decoder.cpp index 036ea98..293efc5 100644 --- a/toml-test/tt_decoder.cpp +++ b/toml-test/tt_decoder.cpp @@ -3,7 +3,7 @@ //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text. // SPDX-License-Identifier: MIT -#include "tt.hpp" +#include "tt.h" using nlohmann::json; using namespace std::string_view_literals; @@ -85,30 +85,29 @@ TOML_NAMESPACE_END; int main() { - json j; try { const std::string str(std::istreambuf_iterator{ std::cin }, std::istreambuf_iterator{}); - j = toml::parse(str, "stdin"sv); + json j = toml::parse(str, "stdin"sv); + + std::cout << j << "\n"; } catch (const toml::parse_error& err) { - std::cerr << err << "\n"; + std::cerr << "\n\n" << err << "\n"; return 1; } catch (const std::exception& exc) { - std::cerr << exc.what() << "\n"; + std::cerr << "\n\n" << exc.what() << "\n"; return 1; } catch (...) 
{ - std::cerr << "An unspecified error occurred.\n"; + std::cerr << "\n\nAn unspecified error occurred.\n"; return 1; } - std::cout << j << "\n"; - return 0; } diff --git a/toml-test/tt_decoder.vcxproj b/toml-test/tt_decoder.vcxproj index a11941f..5219826 100644 --- a/toml-test/tt_decoder.vcxproj +++ b/toml-test/tt_decoder.vcxproj @@ -53,7 +53,7 @@ - + \ No newline at end of file diff --git a/toml-test/tt_encoder.cpp b/toml-test/tt_encoder.cpp index 211641b..d202007 100644 --- a/toml-test/tt_encoder.cpp +++ b/toml-test/tt_encoder.cpp @@ -3,7 +3,7 @@ //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text. // SPDX-License-Identifier: MIT -#include "tt.hpp" +#include "tt.h" using nlohmann::json; using namespace std::string_view_literals; @@ -191,12 +191,13 @@ TOML_NAMESPACE_END; int main() { - toml::table tbl; try { const std::string str(std::istreambuf_iterator{ std::cin }, std::istreambuf_iterator{}); - tbl = json::parse(str); + toml::table tbl = json::parse(str); + + std::cout << tbl << "\n"; } catch (const std::exception& exc) { @@ -209,7 +210,5 @@ int main() return 1; } - std::cout << tbl << "\n"; - return 0; } diff --git a/toml-test/tt_encoder.vcxproj b/toml-test/tt_encoder.vcxproj index 19d6121..137050d 100644 --- a/toml-test/tt_encoder.vcxproj +++ b/toml-test/tt_encoder.vcxproj @@ -53,7 +53,7 @@ - + \ No newline at end of file diff --git a/toml.hpp b/toml.hpp index f249df7..9f453ad 100644 --- a/toml.hpp +++ b/toml.hpp @@ -1138,7 +1138,7 @@ TOML_IMPL_NAMESPACE_START // clang-format off - inline constexpr std::string_view low_character_escape_table[] = + inline constexpr std::string_view control_char_escapes[] = { "\\u0000"sv, "\\u0001"sv, @@ -1262,11 +1262,12 @@ TOML_NAMESPACE_START // abi namespace allow_literal_strings = (1ull << 2), allow_multi_line_strings = (1ull << 3), allow_real_tabs_in_strings = (1ull << 4), - allow_binary_integers = (1ull << 5), - allow_octal_integers = (1ull << 6), - allow_hexadecimal_integers = 
(1ull << 7), - indent_sub_tables = (1ull << 8), - indent_array_elements = (1ull << 9), + allow_unicode_strings = (1ull << 5), + allow_binary_integers = (1ull << 6), + allow_octal_integers = (1ull << 7), + allow_hexadecimal_integers = (1ull << 8), + indent_sub_tables = (1ull << 9), + indent_array_elements = (1ull << 10), indentation = indent_sub_tables | indent_array_elements, }; TOML_MAKE_FLAGS(format_flags); @@ -1994,28 +1995,28 @@ TOML_IMPL_NAMESPACE_START void print_to_stream(std::ostream&, char); TOML_API - void print_to_stream(std::ostream&, int8_t, value_flags = {}); + void print_to_stream(std::ostream&, int8_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, int16_t, value_flags = {}); + void print_to_stream(std::ostream&, int16_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, int32_t, value_flags = {}); + void print_to_stream(std::ostream&, int32_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, int64_t, value_flags = {}); + void print_to_stream(std::ostream&, int64_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint8_t, value_flags = {}); + void print_to_stream(std::ostream&, uint8_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint16_t, value_flags = {}); + void print_to_stream(std::ostream&, uint16_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint32_t, value_flags = {}); + void print_to_stream(std::ostream&, uint32_t, value_flags = {}, size_t min_digits = 0); TOML_API - void print_to_stream(std::ostream&, uint64_t, value_flags = {}); + void print_to_stream(std::ostream&, uint64_t, value_flags = {}, size_t min_digits = 0); TOML_API void print_to_stream(std::ostream&, float, value_flags = {}); @@ -2075,24 +2076,6 @@ TOML_IMPL_NAMESPACE_START #endif - template - inline void 
print_to_stream_with_escapes(std::ostream & stream, const T& str) - { - for (auto c : str) - { - if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F') - print_to_stream(stream, low_character_escape_table[c]); - else if TOML_UNLIKELY(c == '\x7F') - print_to_stream(stream, "\\u007F"sv); - else if TOML_UNLIKELY(c == '"') - print_to_stream(stream, "\\\""sv); - else if TOML_UNLIKELY(c == '\\') - print_to_stream(stream, "\\\\"sv); - else - print_to_stream(stream, c); - } - } - template inline void print_to_stream_bookended(std::ostream & stream, const T& val, const U& bookend) { @@ -5895,6 +5878,10 @@ TOML_NAMESPACE_START public: + using const_iterator = const char*; + + using iterator = const_iterator; + TOML_NODISCARD_CTOR key() noexcept = default; @@ -6094,6 +6081,30 @@ TOML_NAMESPACE_START return lhs >= rhs.key_; } + TOML_PURE_INLINE_GETTER + const_iterator begin() const noexcept + { + return key_.data(); + } + + TOML_PURE_INLINE_GETTER + const_iterator end() const noexcept + { + return key_.data() + key_.length(); + } + + TOML_PURE_INLINE_GETTER + friend const_iterator begin(const key& k) noexcept + { + return k.begin(); + } + + TOML_PURE_INLINE_GETTER + friend const_iterator end(const key& k) noexcept + { + return k.end(); + } + friend std::ostream& operator<<(std::ostream& lhs, const key& rhs) { impl::print_to_stream(lhs, rhs.key_); @@ -7132,7 +7143,7 @@ TOML_NAMESPACE_END; #endif TOML_POP_WARNINGS; -//******** impl/utf8.h *********************************************************************************************** +//******** impl/unicode.h ******************************************************************************************** TOML_PUSH_WARNINGS; #ifdef _MSC_VER @@ -7236,15 +7247,21 @@ TOML_IMPL_NAMESPACE_START } TOML_CONST_GETTER - constexpr bool is_vertical_whitespace(char32_t c) noexcept + constexpr bool is_ascii_vertical_whitespace(char32_t c) noexcept { - return (U'\n' <= c && c <= U'\r') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + return U'\n' <= 
c && c <= U'\r'; } TOML_CONST_GETTER - constexpr bool is_vertical_whitespace_excl_cr(char32_t c) noexcept + constexpr bool is_non_ascii_vertical_whitespace(char32_t c) noexcept { - return (U'\n' <= c && c <= U'\f') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + return (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85'; + } + + TOML_CONST_GETTER + constexpr bool is_vertical_whitespace(char32_t c) noexcept + { + return is_ascii_vertical_whitespace(c) || is_non_ascii_vertical_whitespace(c); } TOML_CONST_GETTER @@ -7256,6 +7273,11 @@ TOML_IMPL_NAMESPACE_START TOML_CONST_GETTER constexpr bool is_ascii_bare_key_character(char32_t c) noexcept { +#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) + if (c == U'+') + return true; +#endif + if (c < U'-' || c > U'z') return false; @@ -7989,8 +8011,7 @@ TOML_IMPL_NAMESPACE_START constexpr bool is_bare_key_character(char32_t c) noexcept { return is_ascii_bare_key_character(c) -#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys) - || c == U'+' // +#if TOML_LANG_UNRELEASED // toml/issues/687 (unicode bare keys) || is_non_ascii_bare_key_character(c) #endif ; @@ -8002,6 +8023,12 @@ TOML_IMPL_NAMESPACE_START return is_whitespace(c) || c == U']' || c == U'}' || c == U',' || c == U'#'; } + TOML_CONST_GETTER + constexpr bool is_control_character(char c) noexcept + { + return c <= '\u001F' || c == '\u007F'; + } + TOML_CONST_GETTER constexpr bool is_control_character(char32_t c) noexcept { @@ -8076,12 +8103,22 @@ TOML_IMPL_NAMESPACE_START state = state_table[state + uint_least32_t{ 256u } + type]; } + TOML_ALWAYS_INLINE + constexpr void operator()(char c) noexcept + { + operator()(static_cast(c)); + } + TOML_ALWAYS_INLINE constexpr void reset() noexcept { state = {}; } }; + + TOML_PURE_GETTER + TOML_ATTR(nonnull) + bool is_ascii(const char* str, size_t len) noexcept; } TOML_IMPL_NAMESPACE_END; @@ -8722,6 +8759,24 @@ TOML_IMPL_NAMESPACE_START return !!(config_.flags & 
format_flags::allow_literal_strings); } + TOML_PURE_INLINE_GETTER + bool multi_line_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_multi_line_strings); + } + + TOML_PURE_INLINE_GETTER + bool real_tabs_in_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_real_tabs_in_strings); + } + + TOML_PURE_INLINE_GETTER + bool unicode_strings_allowed() const noexcept + { + return !!(config_.flags & format_flags::allow_unicode_strings); + } + TOML_API void attach(std::ostream& stream) noexcept; @@ -8826,8 +8881,8 @@ TOML_NAMESPACE_START TOML_API void print(); - static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory flags - format_flags::none, // ignored flags + static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory + format_flags::none, // ignored "inf"sv, "-inf"sv, "nan"sv, @@ -8839,6 +8894,7 @@ TOML_NAMESPACE_START static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::allow_literal_strings // | format_flags::allow_multi_line_strings // + | format_flags::allow_unicode_strings // | format_flags::allow_real_tabs_in_strings // | format_flags::allow_binary_integers // | format_flags::allow_octal_integers // @@ -8927,6 +8983,7 @@ TOML_NAMESPACE_START static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::quote_infinities_and_nans // + | format_flags::allow_unicode_strings // | format_flags::indentation; TOML_NODISCARD_CTOR @@ -9014,6 +9071,7 @@ TOML_NAMESPACE_START static constexpr format_flags default_flags = constants.mandatory_flags // | format_flags::allow_literal_strings // + | format_flags::allow_unicode_strings // | format_flags::allow_octal_integers // | format_flags::allow_hexadecimal_integers; @@ -9217,11 +9275,16 @@ TOML_ANON_NAMESPACE_START template TOML_INTERNAL_LINKAGE - void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {}) + 
void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {}, size_t min_digits = 0) { if (!val) { - stream.put('0'); + if (!min_digits) + min_digits = 1; + + for (size_t i = 0; i < min_digits; i++) + stream.put('0'); + return; } @@ -9230,7 +9293,7 @@ TOML_ANON_NAMESPACE_START format &= value_flags_mask; int base = 10; - if (format != value_flags::none && val >= T{}) + if (format != value_flags::none && val > T{}) { switch (format) { @@ -9246,6 +9309,8 @@ TOML_ANON_NAMESPACE_START char buf[(sizeof(T) * CHAR_BIT)]; const auto res = std::to_chars(buf, buf + sizeof(buf), val, base); const auto len = static_cast(res.ptr - buf); + for (size_t i = len; i < min_digits; i++) + stream.put('0'); if (base == 16) { for (size_t i = 0; i < len; i++) @@ -9259,12 +9324,16 @@ TOML_ANON_NAMESPACE_START using unsigned_type = std::conditional_t<(sizeof(T) > sizeof(unsigned)), std::make_unsigned_t, unsigned>; using cast_type = std::conditional_t, std::make_signed_t, unsigned_type>; - if TOML_UNLIKELY(format == value_flags::format_as_binary) + if (base == 2) { + const auto len = sizeof(T) * CHAR_BIT; + for (size_t i = len; i < min_digits; i++) + stream.put('0'); + bool found_one = false; const auto v = static_cast(val); - unsigned_type mask = unsigned_type{ 1 } << (sizeof(unsigned_type) * CHAR_BIT - 1u); - for (unsigned i = 0; i < sizeof(unsigned_type) * CHAR_BIT; i++) + unsigned_type mask = unsigned_type{ 1 } << (len - 1u); + for (size_t i = 0; i < len; i++) { if ((v & mask)) { @@ -9281,6 +9350,8 @@ TOML_ANON_NAMESPACE_START std::ostringstream ss; ss.imbue(std::locale::classic()); ss << std::uppercase << std::setbase(base); + if (min_digits) + ss << std::setfill('0') << std::setw(static_cast(min_digits)); ss << static_cast(val); const auto str = std::move(ss).str(); impl::print_to_stream(stream, str); @@ -9342,31 +9413,6 @@ TOML_ANON_NAMESPACE_START default: TOML_UNREACHABLE; } } - - template - TOML_INTERNAL_LINKAGE - void print_integer_leftpad_zeros(std::ostream 
& stream, T val, size_t min_digits) - { -#if TOML_INT_CHARCONV - - char buf[charconv_buffer_length]; - const auto res = std::to_chars(buf, buf + sizeof(buf), val); - const auto len = static_cast(res.ptr - buf); - for (size_t i = len; i < min_digits; i++) - stream.put('0'); - impl::print_to_stream(stream, buf, static_cast(res.ptr - buf)); - -#else - - std::ostringstream ss; - ss.imbue(std::locale::classic()); - using cast_type = std::conditional_t, int64_t, uint64_t>; - ss << std::setfill('0') << std::setw(static_cast(min_digits)) << static_cast(val); - const auto str = std::move(ss).str(); - impl::print_to_stream(stream, str); - -#endif - } } TOML_ANON_NAMESPACE_END; @@ -9398,51 +9444,51 @@ TOML_IMPL_NAMESPACE_START } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int8_t val, value_flags format) + void print_to_stream(std::ostream & stream, int8_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int16_t val, value_flags format) + void print_to_stream(std::ostream & stream, int16_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int32_t val, value_flags format) + void print_to_stream(std::ostream & stream, int32_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, int64_t val, value_flags format) + void print_to_stream(std::ostream & stream, int64_t val, value_flags format, size_t min_digits) { - 
TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint8_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint8_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint16_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint16_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint32_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint32_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE - void print_to_stream(std::ostream & stream, uint64_t val, value_flags format) + void print_to_stream(std::ostream & stream, uint64_t val, value_flags format, size_t min_digits) { - TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format); + TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits); } TOML_EXTERNAL_LINKAGE @@ -9466,21 +9512,21 @@ TOML_IMPL_NAMESPACE_START TOML_EXTERNAL_LINKAGE void print_to_stream(std::ostream & stream, const toml::date& val) { - print_integer_leftpad_zeros(stream, val.year, 4u); + print_to_stream(stream, val.year, {}, 4); stream.put('-'); - print_integer_leftpad_zeros(stream, val.month, 2u); + print_to_stream(stream, val.month, {}, 2); stream.put('-'); - print_integer_leftpad_zeros(stream, 
val.day, 2u); + print_to_stream(stream, val.day, {}, 2); } TOML_EXTERNAL_LINKAGE void print_to_stream(std::ostream & stream, const toml::time& val) { - print_integer_leftpad_zeros(stream, val.hour, 2u); + print_to_stream(stream, val.hour, {}, 2); stream.put(':'); - print_integer_leftpad_zeros(stream, val.minute, 2u); + print_to_stream(stream, val.minute, {}, 2); stream.put(':'); - print_integer_leftpad_zeros(stream, val.second, 2u); + print_to_stream(stream, val.second, {}, 2); if (val.nanosecond && val.nanosecond <= 999999999u) { stream.put('.'); @@ -9491,7 +9537,7 @@ TOML_IMPL_NAMESPACE_START ns /= 10u; digits--; } - print_integer_leftpad_zeros(stream, ns, digits); + print_to_stream(stream, ns, {}, digits); } } @@ -9515,13 +9561,13 @@ TOML_IMPL_NAMESPACE_START const auto hours = mins / 60; if (hours) { - print_integer_leftpad_zeros(stream, static_cast(hours), 2u); + print_to_stream(stream, static_cast(hours), {}, 2); mins -= hours * 60; } else print_to_stream(stream, "00"sv); stream.put(':'); - print_integer_leftpad_zeros(stream, static_cast(mins), 2u); + print_to_stream(stream, static_cast(mins), {}, 2); } TOML_EXTERNAL_LINKAGE @@ -10498,23 +10544,7 @@ TOML_NAMESPACE_END; #endif TOML_POP_WARNINGS; -//******** impl/parser.inl ******************************************************************************************* - -#if TOML_ENABLE_PARSER - -TOML_DISABLE_WARNINGS; -#include -#include -#if TOML_INT_CHARCONV || TOML_FLOAT_CHARCONV -#include -#endif -#if !TOML_INT_CHARCONV || !TOML_FLOAT_CHARCONV -#include -#endif -#if !TOML_INT_CHARCONV -#include -#endif -TOML_ENABLE_WARNINGS; +//******** impl/unicode.inl ****************************************************************************************** #if TOML_ENABLE_SIMD @@ -10552,26 +10582,21 @@ TOML_PUSH_WARNINGS; #undef max #endif -TOML_ANON_NAMESPACE_START +TOML_IMPL_NAMESPACE_START { - template - TOML_PURE_GETTER - TOML_ATTR(nonnull) - TOML_INTERNAL_LINKAGE - bool is_ascii(const T* str, size_t size) noexcept + 
TOML_EXTERNAL_LINKAGE + bool is_ascii(const char* str, size_t len) noexcept { - static_assert(sizeof(T) == 1); - - const T* const end = str + size; + const char* const end = str + len; #if TOML_HAS_SSE2 && (128 % CHAR_BIT) == 0 { - constexpr size_t chars_per_vector = 128 / CHAR_BIT; + constexpr size_t chars_per_vector = 128u / CHAR_BIT; - if (const size_t simdable = size - (size % chars_per_vector)) + if (const size_t simdable = len - (len % chars_per_vector)) { __m128i mask = _mm_setzero_si128(); - for (const T* const e = str + simdable; str < e; str += chars_per_vector) + for (const char* const e = str + simdable; str < e; str += chars_per_vector) { const __m128i current_bytes = _mm_loadu_si128(reinterpret_cast(str)); mask = _mm_or_si128(mask, current_bytes); @@ -10595,7 +10620,42 @@ TOML_ANON_NAMESPACE_START return true; } +} +TOML_IMPL_NAMESPACE_END; +#ifdef _MSC_VER +#pragma pop_macro("min") +#pragma pop_macro("max") +#endif +TOML_POP_WARNINGS; + +//******** impl/parser.inl ******************************************************************************************* + +#if TOML_ENABLE_PARSER + +TOML_DISABLE_WARNINGS; +#include +#include +#if TOML_INT_CHARCONV || TOML_FLOAT_CHARCONV +#include +#endif +#if !TOML_INT_CHARCONV || !TOML_FLOAT_CHARCONV +#include +#endif +#if !TOML_INT_CHARCONV +#include +#endif +TOML_ENABLE_WARNINGS; +TOML_PUSH_WARNINGS; +#ifdef _MSC_VER +#pragma push_macro("min") +#pragma push_macro("max") +#undef min +#undef max +#endif + +TOML_ANON_NAMESPACE_START +{ template class utf8_byte_stream; @@ -10874,7 +10934,7 @@ TOML_ANON_NAMESPACE_START auto& cp = codepoints_.buffer[i]; cp.position = next_pos_; - if (impl::is_vertical_whitespace_excl_cr(cp)) + if (cp == U'\n') { next_pos_.line++; next_pos_.column = source_index{ 1 }; @@ -10885,7 +10945,7 @@ TOML_ANON_NAMESPACE_START }; // decide whether we need to use the UTF-8 decoder or if we can treat this block as plain ASCII - const auto ascii_fast_path = !decoder_.needs_more_input() && 
is_ascii(raw_bytes, raw_bytes_read); + const auto ascii_fast_path = !decoder_.needs_more_input() && impl::is_ascii(raw_bytes, raw_bytes_read); // ASCII fast-path if (ascii_fast_path) @@ -11233,7 +11293,7 @@ TOML_ANON_NAMESPACE_START std::string_view to_sv(const utf8_codepoint& cp) noexcept { if TOML_UNLIKELY(cp.value <= U'\x1F') - return impl::low_character_escape_table[cp.value]; + return impl::control_char_escapes[cp.value]; else if TOML_UNLIKELY(cp.value == U'\x7F') return "\\u007F"sv; else @@ -11690,8 +11750,9 @@ TOML_IMPL_NAMESPACE_START { return_if_error_or_eof({}); - if (!is_vertical_whitespace(*cp)) - return false; + if TOML_UNLIKELY(is_match(*cp, U'\v', U'\f')) + set_error_and_return_default( + R"(vertical tabs '\v' and form-feeds '\f' are not legal whitespace in TOML.)"sv); if (*cp == U'\r') { @@ -11699,10 +11760,14 @@ TOML_IMPL_NAMESPACE_START if (is_eof()) return true; // eof after \r is 'fine' - else if (*cp != U'\n') + + if (*cp != U'\n') set_error_and_return_default("expected \\n, saw '"sv, to_sv(*cp), "'"sv); } - advance_and_return_if_error({}); // skip \n (or other single-character line ending) + else if (*cp != U'\n') + return false; + + advance_and_return_if_error({}); // skip \n return true; } @@ -11712,7 +11777,7 @@ TOML_IMPL_NAMESPACE_START do { - if (is_vertical_whitespace(*cp)) + if (is_ascii_vertical_whitespace(*cp)) return consume_line_break(); else advance(); @@ -11856,7 +11921,7 @@ TOML_IMPL_NAMESPACE_START continue; } - bool skipped_escaped_codepoint = false; + bool skip_escaped_codepoint = true; assert_not_eof(); switch (const auto escaped_codepoint = *cp) { @@ -11880,9 +11945,9 @@ TOML_IMPL_NAMESPACE_START case U'u': [[fallthrough]]; case U'U': { - push_parse_scope("unicode scalar escape sequence"sv); + push_parse_scope("unicode scalar sequence"sv); advance_and_return_if_error_or_eof({}); - skipped_escaped_codepoint = true; + skip_escaped_codepoint = false; uint32_t place_value = escaped_codepoint == U'U' ? 
0x10000000u : (escaped_codepoint == U'u' ? 0x1000u : 0x10u); @@ -11902,25 +11967,28 @@ TOML_IMPL_NAMESPACE_START "unicode surrogates (U+D800 - U+DFFF) are explicitly prohibited"sv); else if (sequence_value > 0x10FFFFu) set_error_and_return_default("values greater than U+10FFFF are invalid"sv); - else if (sequence_value <= 0x7Fu) // ascii - str += static_cast(sequence_value & 0x7Fu); - else if (sequence_value <= 0x7FFu) + + if (sequence_value < 0x80) { - str += static_cast(0xC0u | ((sequence_value >> 6) & 0x1Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast(sequence_value); } - else if (sequence_value <= 0xFFFFu) + else if (sequence_value < 0x800u) { - str += static_cast(0xE0u | ((sequence_value >> 12) & 0x0Fu)); - str += static_cast(0x80u | ((sequence_value >> 6) & 0x1Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast((sequence_value >> 6) | 0xC0u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); } - else + else if (sequence_value < 0x10000u) { - str += static_cast(0xF0u | ((sequence_value >> 18) & 0x07u)); - str += static_cast(0x80u | ((sequence_value >> 12) & 0x3Fu)); - str += static_cast(0x80u | ((sequence_value >> 6) & 0x3Fu)); - str += static_cast(0x80u | (sequence_value & 0x3Fu)); + str += static_cast((sequence_value >> 12) | 0xE0u); + str += static_cast(((sequence_value >> 6) & 0x3Fu) | 0x80u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); + } + else if (sequence_value < 0x110000u) + { + str += static_cast((sequence_value >> 18) | 0xF0u); + str += static_cast(((sequence_value >> 12) & 0x3Fu) | 0x80u); + str += static_cast(((sequence_value >> 6) & 0x3Fu) | 0x80u); + str += static_cast((sequence_value & 0x3Fu) | 0x80u); } break; } @@ -11929,8 +11997,7 @@ TOML_IMPL_NAMESPACE_START default: set_error_and_return_default("unknown escape sequence '\\"sv, to_sv(*cp), "'"sv); } - // skip the escaped character - if (!skipped_escaped_codepoint) + if (skip_escaped_codepoint) 
advance_and_return_if_error_or_eof({}); } else @@ -11999,7 +12066,7 @@ TOML_IMPL_NAMESPACE_START } // handle line endings in multi-line mode - if (multi_line && is_vertical_whitespace(*cp)) + if (multi_line && is_ascii_vertical_whitespace(*cp)) { consume_line_break(); return_if_error({}); @@ -12115,7 +12182,7 @@ TOML_IMPL_NAMESPACE_START } // handle line endings in multi-line mode - if (multi_line && is_vertical_whitespace(*cp)) + if (multi_line && is_ascii_vertical_whitespace(*cp)) { consume_line_break(); return_if_error({}); @@ -12782,7 +12849,8 @@ TOML_IMPL_NAMESPACE_START TOML_ASSERT_ASSUME(is_decimal_digit(*cp)); push_parse_scope("time"sv); - static constexpr size_t max_digits = 9; + static constexpr size_t max_digits = 64; // far more than necessary but needed to allow fractional + // millisecond truncation per the spec uint32_t digits[max_digits]; // "HH" @@ -12851,15 +12919,14 @@ TOML_IMPL_NAMESPACE_START else if (!part_of_datetime && !is_value_terminator(*cp)) set_error_and_return_default("expected value-terminator, saw '"sv, to_sv(*cp), "'"sv); } - uint32_t value = 0u; uint32_t place = 1u; - for (auto i = digit_count; i-- > 0u;) + for (auto i = impl::min(digit_count, 9u); i-- > 0u;) { value += digits[i] * place; place *= 10u; } - for (auto i = digit_count; i < max_digits; i++) // implicit zeros + for (auto i = digit_count; i < 9u; i++) // implicit zeros value *= 10u; time.nanosecond = value; return time; @@ -14336,6 +14403,20 @@ TOML_PUSH_WARNINGS; TOML_IMPL_NAMESPACE_START { + enum class formatted_string_traits : unsigned + { + none, + line_breaks = 1u << 0, // \n + tabs = 1u << 1, // \t + control_chars = 1u << 2, // also includes non-ascii vertical whitespace + single_quotes = 1u << 3, + non_bare = 1u << 4, // anything not satisfying "is bare key character" + non_ascii = 1u << 5, // any codepoint >= 128 + + all = (non_ascii << 1u) - 1u + }; + TOML_MAKE_FLAGS(formatted_string_traits); + TOML_EXTERNAL_LINKAGE formatter::formatter(const node* source_node, 
const parse_result* source_pr, @@ -14414,70 +14495,236 @@ TOML_IMPL_NAMESPACE_START TOML_EXTERNAL_LINKAGE void formatter::print_string(std::string_view str, bool allow_multi_line, bool allow_bare) { - auto literal = literal_strings_allowed(); if (str.empty()) { - print_to_stream(*stream_, literal ? "''"sv : "\"\""sv); - naked_newline_ = false; + print_unformatted(literal_strings_allowed() ? "''"sv : "\"\""sv); return; } - bool multi_line = allow_multi_line && !!(config_.flags & format_flags::allow_multi_line_strings); - const bool treat_raw_tab_as_control_char = !(config_.flags & format_flags::allow_real_tabs_in_strings); - if (multi_line || literal || treat_raw_tab_as_control_char || allow_bare) + // pre-scan the string to determine how we should output it + formatted_string_traits traits = {}; + + if (!allow_bare) + traits |= formatted_string_traits::non_bare; + bool unicode_allowed = unicode_strings_allowed(); + + // ascii fast path + if (is_ascii(str.data(), str.length())) { - utf8_decoder decoder; - bool has_line_breaks = false; - bool has_control_chars = false; - bool has_single_quotes = false; - for (size_t i = 0; i < str.length(); i++) + for (auto c : str) { - decoder(static_cast(str[i])); - if (decoder.error()) + switch (c) { - has_line_breaks = false; - has_control_chars = true; // force "" - has_single_quotes = true; - allow_bare = false; - break; - } - else if (decoder.has_code_point()) - { - if (decoder.codepoint == U'\n') + case '\n': traits |= formatted_string_traits::line_breaks; break; + case '\t': traits |= formatted_string_traits::tabs; break; + case '\'': traits |= formatted_string_traits::single_quotes; break; + default: { - has_line_breaks = true; - if (!multi_line) - has_control_chars = true; + if (is_control_character(c)) + traits |= formatted_string_traits::control_chars; + + if (!is_ascii_bare_key_character(static_cast(c))) + traits |= formatted_string_traits::non_bare; + break; } - else if (is_nontab_control_character(decoder.codepoint) - 
|| (treat_raw_tab_as_control_char && decoder.codepoint == U'\t') - || is_vertical_whitespace(decoder.codepoint)) - has_control_chars = true; - else if (decoder.codepoint == U'\'') - has_single_quotes = true; - if (allow_bare) - allow_bare = is_bare_key_character(decoder.codepoint); } - if (has_line_breaks && has_control_chars && has_single_quotes && !allow_bare) + static constexpr auto all_ascii_traits = + formatted_string_traits::all & ~formatted_string_traits::non_ascii; + if (traits == all_ascii_traits) break; } - multi_line = multi_line && has_line_breaks; - literal = literal && !has_control_chars && !(!multi_line && has_single_quotes); } - if (allow_bare) - print_to_stream(*stream_, str); - else if (literal) - print_to_stream_bookended(*stream_, str, multi_line ? "'''"sv : "'"sv); + // unicode slow path else { - const auto quot = multi_line ? R"(""")"sv : R"(")"sv; - print_to_stream(*stream_, quot); - print_to_stream_with_escapes(*stream_, str); - print_to_stream(*stream_, quot); + traits |= formatted_string_traits::non_ascii; + utf8_decoder decoder; + + // if the unicode is malformed just treat the string as a single-line non-literal and + // escape all non-ascii characters (to ensure round-tripping and help with diagnostics) + const auto bad_unicode = [&]() noexcept + { + traits &= ~formatted_string_traits::line_breaks; + traits |= formatted_string_traits::control_chars | formatted_string_traits::non_bare; + unicode_allowed = false; + }; + + for (auto c : str) + { + decoder(c); + + if TOML_UNLIKELY(decoder.error()) + { + bad_unicode(); + break; + } + + if (!decoder.has_code_point()) + continue; + + switch (decoder.codepoint) + { + case U'\n': traits |= formatted_string_traits::line_breaks; break; + case U'\t': traits |= formatted_string_traits::tabs; break; + case U'\'': traits |= formatted_string_traits::single_quotes; break; + default: + { + if (is_control_character(decoder.codepoint) + || is_non_ascii_vertical_whitespace(decoder.codepoint)) + traits |= 
formatted_string_traits::control_chars; + + if (!is_bare_key_character(decoder.codepoint)) + traits |= formatted_string_traits::non_bare; + break; + } + } + } + + if (decoder.needs_more_input()) + bad_unicode(); } - naked_newline_ = false; + + // if the string meets the requirements of being 'bare' we can emit a bare string + // (bare strings are composed of letters and numbers; no whitespace, control chars, quotes, etc) + if (!(traits & formatted_string_traits::non_bare) + && (!(traits & formatted_string_traits::non_ascii) || unicode_allowed)) + { + print_unformatted(str); + return; + } + + // determine if this should be a multi-line string (triple-quotes) + const auto multi_line = allow_multi_line // + && multi_line_strings_allowed() // + && !!(traits & formatted_string_traits::line_breaks); + + // determine if this should be a literal string (single-quotes with no escaping) + const auto literal = literal_strings_allowed() // + && !(traits & formatted_string_traits::control_chars) // + && (!(traits & formatted_string_traits::single_quotes) || multi_line) // + && (!(traits & formatted_string_traits::tabs) || real_tabs_in_strings_allowed()) // + && (!(traits & formatted_string_traits::non_ascii) || unicode_allowed); + + // literal strings (single quotes, no escape codes) + if (literal) + { + const auto quot = multi_line ? R"(''')"sv : R"(')"sv; + print_unformatted(quot); + print_unformatted(str); + print_unformatted(quot); + return; + } + + // anything from here down is a non-literal string, so requires iteration and escaping. + print_unformatted(multi_line ? 
R"(""")"sv : R"(")"sv); + + const auto real_tabs_allowed = real_tabs_in_strings_allowed(); + + // ascii fast path + if (!(traits & formatted_string_traits::non_ascii)) + { + for (auto c : str) + { + switch (c) + { + case '"': print_to_stream(*stream_, R"(\")"sv); break; + case '\\': print_to_stream(*stream_, R"(\\)"sv); break; + case '\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break; + case '\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break; + case '\n': print_to_stream(*stream_, multi_line ? "\n"sv : R"(\n)"sv); break; + default: + { + // control characters from lookup table + if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F') + print_to_stream(*stream_, control_char_escapes[c]); + + // regular characters + else + print_to_stream(*stream_, c); + } + } + } + } + + // unicode slow path + else + { + utf8_decoder decoder; + const char* cp_start = str.data(); + const char* cp_end = cp_start; + for (auto c : str) + { + decoder(c); + cp_end++; + + // if the decoder encounters malformed unicode just emit raw bytes and + if (decoder.error()) + { + while (cp_start != cp_end) + { + print_to_stream(*stream_, R"(\u00)"sv); + print_to_stream(*stream_, + static_cast(*cp_start), + value_flags::format_as_hexadecimal, + 2); + cp_start++; + } + decoder.reset(); + continue; + } + + if (!decoder.has_code_point()) + continue; + + switch (decoder.codepoint) + { + case U'"': print_to_stream(*stream_, R"(\")"sv); break; + case U'\\': print_to_stream(*stream_, R"(\\)"sv); break; + case U'\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break; + case U'\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break; + case U'\n': print_to_stream(*stream_, multi_line ? 
"\n"sv : R"(\n)"sv); break; + default: + { + // control characters from lookup table + if TOML_UNLIKELY(decoder.codepoint <= U'\x1F') + print_to_stream(*stream_, + control_char_escapes[static_cast(decoder.codepoint)]); + + // escaped unicode characters + else if (decoder.codepoint > U'\x7F' + && (!unicode_allowed || is_non_ascii_vertical_whitespace(decoder.codepoint))) + { + if (static_cast(decoder.codepoint) > 0xFFFFu) + { + print_to_stream(*stream_, R"(\U)"sv); + print_to_stream(*stream_, + static_cast(decoder.codepoint), + value_flags::format_as_hexadecimal, + 8); + } + else + { + print_to_stream(*stream_, R"(\u)"sv); + print_to_stream(*stream_, + static_cast(decoder.codepoint), + value_flags::format_as_hexadecimal, + 4); + } + } + + // regular characters + else + print_to_stream(*stream_, cp_start, static_cast(cp_end - cp_start)); + } + } + + cp_start = cp_end; + } + } + + print_unformatted(multi_line ? R"(""")"sv : R"(")"sv); } TOML_EXTERNAL_LINKAGE