fixed incorrect unicode scalar sequence transformations (#125)

also:
- fixed extended-precision fractional times causing parse error instead of truncating per the spec (closes #127)
- fixed some non-spec vertical whitespace being accepted as line breaks (closes #128)
- added `format_flags::allow_unicode_strings`
2024-07-19 19:37:29 +00:00 · 2022-01-04 16:23:45 +02:00 · 2022-01-04 16:23:45 +02:00 · b41e12f736
commit b41e12f736
parent f3bd22bff4
37 changed files with 1015 additions and 458 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -35,6 +35,9 @@ code changes at callsites or in build systems are indicated with ⚠&#xFE0F;.
 - fixed missing `TOML_API` on interfaces
 - fixed parser not correctly round-tripping the format of binary and octal integers in some cases
 - fixed strong exception guarantee edge-cases in `toml::table` and `toml::array`
+- fixed some incorrect unicode scalar sequence transformations (#125) (@moorereason)
+- fixed extended-precision fractional times causing parse error instead of truncating per the spec (#127) (@moorereason)
+- fixed some non-spec vertical whitespace being accepted as line breaks (#128) (@moorereason)

 #### Additions:
 - added `operator->` to `toml::value` for class types
@ -48,6 +51,7 @@ code changes at callsites or in build systems are indicated with ⚠&#xFE0F;.
 - added `toml::format_flags::allow_hexadecimal_integers`
 - added `toml::format_flags::allow_octal_integers`
 - added `toml::format_flags::allow_real_tabs_in_strings`
+- added `toml::format_flags::allow_unicode_strings`
 - added `toml::format_flags::indent_array_elements`
 - added `toml::format_flags::indent_sub_tables`
 - added `toml::format_flags::quote_infinities_and_nans`
--- a/examples/error_printer.cpp
+++ b/examples/error_printer.cpp
@ -6,7 +6,7 @@
 // This example shows the error messages the library produces by forcing a set of specific parsing
 // failures and printing their results.

-#include "examples.hpp"
+#include "examples.h"

 #define TOML_EXCEPTIONS					0
 #define TOML_ENABLE_UNRELEASED_FEATURES 0
@ -17,10 +17,11 @@ using namespace std::string_view_literals;
 namespace
 {
 	inline constexpr auto invalid_parses = std::array{
-		"########## comments"sv,
+		"########## comments and whitespace"sv,
 		"# bar\rkek"sv,
 		"# bar\bkek"sv,
 		"# \xf1\x63"sv,
+		"# val1 = 1\fval2 = 2"sv,

 		"########## inline tables"sv,
 		"val = {,}"sv,
--- a/examples/error_printer.vcxproj
+++ b/examples/error_printer.vcxproj
@ -55,7 +55,7 @@
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="examples.hpp" />
+    <ClInclude Include="examples.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/examples/examples.hpp
+++ b/examples/examples.hpp
--- a/examples/parse_benchmark.cpp
+++ b/examples/parse_benchmark.cpp
@ -5,7 +5,7 @@

 // This example is just a short-n-shiny benchmark.

-#include "examples.hpp"
+#include "examples.h"
 #include <toml++/toml.h>

 using namespace std::string_view_literals;
--- a/examples/parse_benchmark.vcxproj
+++ b/examples/parse_benchmark.vcxproj
@ -56,7 +56,7 @@
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="examples.hpp" />
+    <ClInclude Include="examples.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/examples/simple_parser.cpp
+++ b/examples/simple_parser.cpp
@ -5,7 +5,7 @@

 // This example demonstrates how to parse TOML from a file or stdin and re-serialize it (print it out) to stdout.

-#include "examples.hpp"
+#include "examples.h"

 #define TOML_ENABLE_UNRELEASED_FEATURES 1
 #include <toml++/toml.h>
--- a/examples/simple_parser.vcxproj
+++ b/examples/simple_parser.vcxproj
@ -56,7 +56,7 @@
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="examples.hpp" />
+    <ClInclude Include="examples.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/examples/toml_generator.cpp
+++ b/examples/toml_generator.cpp
@ -5,7 +5,7 @@

 // This example demonstrates the use of some more advanced features to generate a tree of random TOML data.

-#include "examples.hpp"
+#include "examples.h"

 #define TOML_ENABLE_PARSER 0
 #include <toml++/toml.h>
--- a/examples/toml_generator.vcxproj
+++ b/examples/toml_generator.vcxproj
@ -56,7 +56,7 @@
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="examples.hpp" />
+    <ClInclude Include="examples.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/examples/toml_to_json_transcoder.cpp
+++ b/examples/toml_to_json_transcoder.cpp
@ -5,7 +5,7 @@

 // This example demonstrates how to use the toml::json_formatter to re-serialize TOML data as JSON.

-#include "examples.hpp"
+#include "examples.h"

 #define TOML_ENABLE_UNRELEASED_FEATURES 1
 #include <toml++/toml.h>
--- a/examples/toml_to_json_transcoder.vcxproj
+++ b/examples/toml_to_json_transcoder.vcxproj
@ -56,7 +56,7 @@
    <Text Include="CMakeLists.txt" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="examples.hpp" />
+    <ClInclude Include="examples.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/include/toml++/impl/formatter.h
+++ b/include/toml++/impl/formatter.h
@ -106,6 +106,24 @@ TOML_IMPL_NAMESPACE_START
 			return !!(config_.flags & format_flags::allow_literal_strings);
 		}

+		TOML_PURE_INLINE_GETTER
+		bool multi_line_strings_allowed() const noexcept
+		{
+			return !!(config_.flags & format_flags::allow_multi_line_strings);
+		}
+
+		TOML_PURE_INLINE_GETTER
+		bool real_tabs_in_strings_allowed() const noexcept
+		{
+			return !!(config_.flags & format_flags::allow_real_tabs_in_strings);
+		}
+
+		TOML_PURE_INLINE_GETTER
+		bool unicode_strings_allowed() const noexcept
+		{
+			return !!(config_.flags & format_flags::allow_unicode_strings);
+		}
+
 		TOML_API
 		void attach(std::ostream& stream) noexcept;

--- a/include/toml++/impl/formatter.inl
+++ b/include/toml++/impl/formatter.inl
@ -14,15 +14,29 @@

 #include "formatter.h"
 #include "print_to_stream.h"
-#include "utf8.h"
 #include "value.h"
 #include "table.h"
 #include "array.h"
+#include "unicode.h"
 #include "parse_result.h"
 #include "header_start.h"

 TOML_IMPL_NAMESPACE_START
 {
+	enum class formatted_string_traits : unsigned
+	{
+		none,
+		line_breaks	  = 1u << 0, // \n
+		tabs		  = 1u << 1, // \t
+		control_chars = 1u << 2, // also includes non-ascii vertical whitespace
+		single_quotes = 1u << 3,
+		non_bare	  = 1u << 4, // anything not satisfying "is bare key character"
+		non_ascii	  = 1u << 5, // any codepoint >= 128
+
+		all = (non_ascii << 1u) - 1u
+	};
+	TOML_MAKE_FLAGS(formatted_string_traits);
+
 	TOML_EXTERNAL_LINKAGE
 	formatter::formatter(const node* source_node,
 						 const parse_result* source_pr,
@ -101,70 +115,236 @@ TOML_IMPL_NAMESPACE_START
 	TOML_EXTERNAL_LINKAGE
 	void formatter::print_string(std::string_view str, bool allow_multi_line, bool allow_bare)
 	{
-		auto literal = literal_strings_allowed();
 		if (str.empty())
 		{
-			print_to_stream(*stream_, literal ? "''"sv : "\"\""sv);
-			naked_newline_ = false;
+			print_unformatted(literal_strings_allowed() ? "''"sv : "\"\""sv);
 			return;
 		}

-		bool multi_line = allow_multi_line && !!(config_.flags & format_flags::allow_multi_line_strings);
-		const bool treat_raw_tab_as_control_char = !(config_.flags & format_flags::allow_real_tabs_in_strings);
-		if (multi_line || literal || treat_raw_tab_as_control_char || allow_bare)
+		// pre-scan the string to determine how we should output it
+		formatted_string_traits traits = {};
+
+		if (!allow_bare)
+			traits |= formatted_string_traits::non_bare;
+		bool unicode_allowed = unicode_strings_allowed();
+
+		// ascii fast path
+		if (is_ascii(str.data(), str.length()))
 		{
-			utf8_decoder decoder;
-			bool has_line_breaks   = false;
-			bool has_control_chars = false;
-			bool has_single_quotes = false;
-			for (size_t i = 0; i < str.length(); i++)
+			for (auto c : str)
 			{
-				decoder(static_cast<uint8_t>(str[i]));
-				if (decoder.error())
+				switch (c)
 				{
-					has_line_breaks	  = false;
-					has_control_chars = true; // force ""
-					has_single_quotes = true;
-					allow_bare		  = false;
-					break;
-				}
-				else if (decoder.has_code_point())
-				{
-					if (decoder.codepoint == U'\n')
+					case '\n': traits |= formatted_string_traits::line_breaks; break;
+					case '\t': traits |= formatted_string_traits::tabs; break;
+					case '\'': traits |= formatted_string_traits::single_quotes; break;
+					default:
 					{
-						has_line_breaks = true;
-						if (!multi_line)
-							has_control_chars = true;
+						if (is_control_character(c))
+							traits |= formatted_string_traits::control_chars;
+
+						if (!is_ascii_bare_key_character(static_cast<char32_t>(c)))
+							traits |= formatted_string_traits::non_bare;
+						break;
 					}
-					else if (is_nontab_control_character(decoder.codepoint)
-							 || (treat_raw_tab_as_control_char && decoder.codepoint == U'\t')
-							 || is_vertical_whitespace(decoder.codepoint))
-						has_control_chars = true;
-					else if (decoder.codepoint == U'\'')
-						has_single_quotes = true;
-					if (allow_bare)
-						allow_bare = is_bare_key_character(decoder.codepoint);
 				}

-				if (has_line_breaks && has_control_chars && has_single_quotes && !allow_bare)
+				static constexpr auto all_ascii_traits =
+					formatted_string_traits::all & ~formatted_string_traits::non_ascii;
+				if (traits == all_ascii_traits)
 					break;
 			}
-			multi_line = multi_line && has_line_breaks;
-			literal	   = literal && !has_control_chars && !(!multi_line && has_single_quotes);
 		}

-		if (allow_bare)
-			print_to_stream(*stream_, str);
-		else if (literal)
-			print_to_stream_bookended(*stream_, str, multi_line ? "'''"sv : "'"sv);
+		// unicode slow path
 		else
 		{
-			const auto quot = multi_line ? R"(""")"sv : R"(")"sv;
-			print_to_stream(*stream_, quot);
-			print_to_stream_with_escapes(*stream_, str);
-			print_to_stream(*stream_, quot);
+			traits |= formatted_string_traits::non_ascii;
+			utf8_decoder decoder;
+
+			// if the unicode is malformed just treat the string as a single-line non-literal and
+			// escape all non-ascii characters (to ensure round-tripping and help with diagnostics)
+			const auto bad_unicode = [&]() noexcept
+			{
+				traits &= ~formatted_string_traits::line_breaks;
+				traits |= formatted_string_traits::control_chars | formatted_string_traits::non_bare;
+				unicode_allowed = false;
+			};
+
+			for (auto c : str)
+			{
+				decoder(c);
+
+				if TOML_UNLIKELY(decoder.error())
+				{
+					bad_unicode();
+					break;
+				}
+
+				if (!decoder.has_code_point())
+					continue;
+
+				switch (decoder.codepoint)
+				{
+					case U'\n': traits |= formatted_string_traits::line_breaks; break;
+					case U'\t': traits |= formatted_string_traits::tabs; break;
+					case U'\'': traits |= formatted_string_traits::single_quotes; break;
+					default:
+					{
+						if (is_control_character(decoder.codepoint)
+							|| is_non_ascii_vertical_whitespace(decoder.codepoint))
+							traits |= formatted_string_traits::control_chars;
+
+						if (!is_bare_key_character(decoder.codepoint))
+							traits |= formatted_string_traits::non_bare;
+						break;
+					}
+				}
+			}
+
+			if (decoder.needs_more_input())
+				bad_unicode();
 		}
-		naked_newline_ = false;
+
+		// if the string meets the requirements of being 'bare' we can emit a bare string
+		// (bare strings are composed of letters and numbers; no whitespace, control chars, quotes, etc)
+		if (!(traits & formatted_string_traits::non_bare)
+			&& (!(traits & formatted_string_traits::non_ascii) || unicode_allowed))
+		{
+			print_unformatted(str);
+			return;
+		}
+
+		// determine if this should be a multi-line string (triple-quotes)
+		const auto multi_line = allow_multi_line			 //
+							 && multi_line_strings_allowed() //
+							 && !!(traits & formatted_string_traits::line_breaks);
+
+		// determine if this should be a literal string (single-quotes with no escaping)
+		const auto literal = literal_strings_allowed()													   //
+						  && !(traits & formatted_string_traits::control_chars)							   //
+						  && (!(traits & formatted_string_traits::single_quotes) || multi_line)			   //
+						  && (!(traits & formatted_string_traits::tabs) || real_tabs_in_strings_allowed()) //
+						  && (!(traits & formatted_string_traits::non_ascii) || unicode_allowed);
+
+		// literal strings (single quotes, no escape codes)
+		if (literal)
+		{
+			const auto quot = multi_line ? R"(''')"sv : R"(')"sv;
+			print_unformatted(quot);
+			print_unformatted(str);
+			print_unformatted(quot);
+			return;
+		}
+
+		// anything from here down is a non-literal string, so requires iteration and escaping.
+		print_unformatted(multi_line ? R"(""")"sv : R"(")"sv);
+
+		const auto real_tabs_allowed = real_tabs_in_strings_allowed();
+
+		// ascii fast path
+		if (!(traits & formatted_string_traits::non_ascii))
+		{
+			for (auto c : str)
+			{
+				switch (c)
+				{
+					case '"': print_to_stream(*stream_, R"(\")"sv); break;
+					case '\\': print_to_stream(*stream_, R"(\\)"sv); break;
+					case '\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break;
+					case '\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break;
+					case '\n': print_to_stream(*stream_, multi_line ? "\n"sv : R"(\n)"sv); break;
+					default:
+					{
+						// control characters from lookup table
+						if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F')
+							print_to_stream(*stream_, control_char_escapes[c]);
+
+						// regular characters
+						else
+							print_to_stream(*stream_, c);
+					}
+				}
+			}
+		}
+
+		// unicode slow path
+		else
+		{
+			utf8_decoder decoder;
+			const char* cp_start = str.data();
+			const char* cp_end	 = cp_start;
+			for (auto c : str)
+			{
+				decoder(c);
+				cp_end++;
+
+				// if the decoder encounters malformed unicode just emit the raw bytes as \u00XX escapes and resume decoding
+				if (decoder.error())
+				{
+					while (cp_start != cp_end)
+					{
+						print_to_stream(*stream_, R"(\u00)"sv);
+						print_to_stream(*stream_,
+										static_cast<uint8_t>(*cp_start),
+										value_flags::format_as_hexadecimal,
+										2);
+						cp_start++;
+					}
+					decoder.reset();
+					continue;
+				}
+
+				if (!decoder.has_code_point())
+					continue;
+
+				switch (decoder.codepoint)
+				{
+					case U'"': print_to_stream(*stream_, R"(\")"sv); break;
+					case U'\\': print_to_stream(*stream_, R"(\\)"sv); break;
+					case U'\x7F': print_to_stream(*stream_, R"(\u007F)"sv); break;
+					case U'\t': print_to_stream(*stream_, real_tabs_allowed ? "\t"sv : R"(\t)"sv); break;
+					case U'\n': print_to_stream(*stream_, multi_line ? "\n"sv : R"(\n)"sv); break;
+					default:
+					{
+						// control characters from lookup table
+						if TOML_UNLIKELY(decoder.codepoint <= U'\x1F')
+							print_to_stream(*stream_,
+											control_char_escapes[static_cast<uint_least32_t>(decoder.codepoint)]);
+
+						// escaped unicode characters
+						else if (decoder.codepoint > U'\x7F'
+								 && (!unicode_allowed || is_non_ascii_vertical_whitespace(decoder.codepoint)))
+						{
+							if (static_cast<uint_least32_t>(decoder.codepoint) > 0xFFFFu)
+							{
+								print_to_stream(*stream_, R"(\U)"sv);
+								print_to_stream(*stream_,
+												static_cast<uint_least32_t>(decoder.codepoint),
+												value_flags::format_as_hexadecimal,
+												8);
+							}
+							else
+							{
+								print_to_stream(*stream_, R"(\u)"sv);
+								print_to_stream(*stream_,
+												static_cast<uint_least32_t>(decoder.codepoint),
+												value_flags::format_as_hexadecimal,
+												4);
+							}
+						}
+
+						// regular characters
+						else
+							print_to_stream(*stream_, cp_start, static_cast<size_t>(cp_end - cp_start));
+					}
+				}
+
+				cp_start = cp_end;
+			}
+		}
+
+		print_unformatted(multi_line ? R"(""")"sv : R"(")"sv);
 	}

 	TOML_EXTERNAL_LINKAGE
--- a/include/toml++/impl/forward_declarations.h
+++ b/include/toml++/impl/forward_declarations.h
@ -119,7 +119,7 @@ TOML_IMPL_NAMESPACE_START

 	// clang-format off

-	inline constexpr std::string_view low_character_escape_table[] =
+	inline constexpr std::string_view control_char_escapes[] =
 	{
 		"\\u0000"sv,
 		"\\u0001"sv,
@ -309,20 +309,23 @@ TOML_NAMESPACE_START // abi namespace
 		/// \brief Allow real tab characters in string literals (as opposed to the escaped form `\t`).
 		allow_real_tabs_in_strings = (1ull << 4),

+		/// \brief Allow non-ASCII characters in strings (as opposed to their escaped form, e.g. `\u00DA`).
+		allow_unicode_strings = (1ull << 5),
+
 		/// \brief Allow integers with #value_flags::format_as_binary to be emitted as binary.
-		allow_binary_integers = (1ull << 5),
+		allow_binary_integers = (1ull << 6),

 		/// \brief Allow integers with #value_flags::format_as_octal to be emitted as octal.
-		allow_octal_integers = (1ull << 6),
+		allow_octal_integers = (1ull << 7),

 		/// \brief Allow integers with #value_flags::format_as_hexadecimal to be emitted as hexadecimal.
-		allow_hexadecimal_integers = (1ull << 7),
+		allow_hexadecimal_integers = (1ull << 8),

 		/// \brief Apply indentation to tables nested within other tables/arrays.
-		indent_sub_tables = (1ull << 8),
+		indent_sub_tables = (1ull << 9),

 		/// \brief Apply indentation to array elements when the array is forced to wrap over multiple lines.
-		indent_array_elements = (1ull << 9),
+		indent_array_elements = (1ull << 10),

 		/// \brief Combination mask of all indentation-enabling flags.
 		indentation = indent_sub_tables | indent_array_elements,
--- a/include/toml++/impl/json_formatter.h
+++ b/include/toml++/impl/json_formatter.h
@ -75,6 +75,7 @@ TOML_NAMESPACE_START
 		/// \brief	The default flags for a json_formatter.
 		static constexpr format_flags default_flags = constants.mandatory_flags				  //
 													| format_flags::quote_infinities_and_nans //
+													| format_flags::allow_unicode_strings	  //
 													| format_flags::indentation;

 		/// \brief	Constructs a JSON formatter and binds it to a TOML object.
--- a/include/toml++/impl/key.h
+++ b/include/toml++/impl/key.h
@ -35,6 +35,12 @@ TOML_NAMESPACE_START
 		source_region source_;

 	  public:
+		/// A const iterator for iterating over the characters in the key.
+		using const_iterator = const char*;
+
+		/// An iterator for iterating over the characters in the key (alias of const_iterator).
+		using iterator = const_iterator;
+
 		/// \brief	Default constructor.
 		TOML_NODISCARD_CTOR
 		key() noexcept = default;
@ -287,6 +293,40 @@ TOML_NAMESPACE_START

 		/// @}

+		/// \name Iterators
+		/// @{
+
+		TOML_PURE_INLINE_GETTER
+		const_iterator begin() const noexcept
+		{
+			return key_.data();
+		}
+
+		TOML_PURE_INLINE_GETTER
+		const_iterator end() const noexcept
+		{
+			return key_.data() + key_.length();
+		}
+
+		/// @}
+
+		/// \name Iterators (ADL)
+		/// @{
+
+		TOML_PURE_INLINE_GETTER
+		friend const_iterator begin(const key& k) noexcept
+		{
+			return k.begin();
+		}
+
+		TOML_PURE_INLINE_GETTER
+		friend const_iterator end(const key& k) noexcept
+		{
+			return k.end();
+		}
+
+		/// @}
+
 		/// \brief	Prints the key's underlying string out to the stream.
 		friend std::ostream& operator<<(std::ostream& lhs, const key& rhs)
 		{
--- a/include/toml++/impl/parser.inl
+++ b/include/toml++/impl/parser.inl
@ -12,15 +12,15 @@
 //# }}
 #if TOML_ENABLE_PARSER

-#include "std_optional.h"
 #include "parser.h"
+#include "std_optional.h"
 #include "source_region.h"
 #include "parse_error.h"
-#include "utf8.h"
 #include "date_time.h"
 #include "value.h"
 #include "array.h"
 #include "table.h"
+#include "unicode.h"
 TOML_DISABLE_WARNINGS;
 #include <istream>
 #include <fstream>
@ -34,7 +34,6 @@ TOML_DISABLE_WARNINGS;
 #include <iomanip>
 #endif
 TOML_ENABLE_WARNINGS;
-#include "simd.h"
 #include "header_start.h"

 //#---------------------------------------------------------------------------------------------------------------------
@ -43,48 +42,6 @@ TOML_ENABLE_WARNINGS;

 TOML_ANON_NAMESPACE_START
 {
-	template <typename T>
-	TOML_PURE_GETTER
-	TOML_ATTR(nonnull)
-	TOML_INTERNAL_LINKAGE
-	bool is_ascii(const T* str, size_t size) noexcept
-	{
-		static_assert(sizeof(T) == 1);
-
-		const T* const end = str + size;
-
-#if TOML_HAS_SSE2 && (128 % CHAR_BIT) == 0
-		{
-			constexpr size_t chars_per_vector = 128 / CHAR_BIT;
-
-			if (const size_t simdable = size - (size % chars_per_vector))
-			{
-				__m128i mask = _mm_setzero_si128();
-				for (const T* const e = str + simdable; str < e; str += chars_per_vector)
-				{
-					const __m128i current_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(str));
-					mask						= _mm_or_si128(mask, current_bytes);
-				}
-				const __m128i has_error = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
-
-#if TOML_HAS_SSE4_1
-				if (!_mm_testz_si128(has_error, has_error))
-					return false;
-#else
-				if (_mm_movemask_epi8(_mm_cmpeq_epi8(has_error, _mm_setzero_si128())) != 0xFFFF)
-					return false;
-#endif
-			}
-		}
-#endif
-
-		for (; str < end; str++)
-			if (static_cast<unsigned char>(*str) > 127u)
-				return false;
-
-		return true;
-	}
-
 	template <typename T>
 	class utf8_byte_stream;

@ -363,7 +320,7 @@ TOML_ANON_NAMESPACE_START
 					auto& cp	= codepoints_.buffer[i];
 					cp.position = next_pos_;

-					if (impl::is_vertical_whitespace_excl_cr(cp))
+					if (cp == U'\n')
 					{
 						next_pos_.line++;
 						next_pos_.column = source_index{ 1 };
@ -374,7 +331,7 @@ TOML_ANON_NAMESPACE_START
 			};

 			// decide whether we need to use the UTF-8 decoder or if we can treat this block as plain ASCII
-			const auto ascii_fast_path = !decoder_.needs_more_input() && is_ascii(raw_bytes, raw_bytes_read);
+			const auto ascii_fast_path = !decoder_.needs_more_input() && impl::is_ascii(raw_bytes, raw_bytes_read);

 			// ASCII fast-path
 			if (ascii_fast_path)
@ -726,7 +683,7 @@ TOML_ANON_NAMESPACE_START
 	std::string_view to_sv(const utf8_codepoint& cp) noexcept
 	{
 		if TOML_UNLIKELY(cp.value <= U'\x1F')
-			return impl::low_character_escape_table[cp.value];
+			return impl::control_char_escapes[cp.value];
 		else if TOML_UNLIKELY(cp.value == U'\x7F')
 			return "\\u007F"sv;
 		else
@ -1183,8 +1140,9 @@ TOML_IMPL_NAMESPACE_START
 		{
 			return_if_error_or_eof({});

-			if (!is_vertical_whitespace(*cp))
-				return false;
+			if TOML_UNLIKELY(is_match(*cp, U'\v', U'\f'))
+				set_error_and_return_default(
+					R"(vertical tabs '\v' and form-feeds '\f' are not legal whitespace in TOML.)"sv);

 			if (*cp == U'\r')
 			{
@ -1192,10 +1150,14 @@ TOML_IMPL_NAMESPACE_START

 				if (is_eof())
 					return true; // eof after \r is 'fine'
-				else if (*cp != U'\n')
+
+				if (*cp != U'\n')
 					set_error_and_return_default("expected \\n, saw '"sv, to_sv(*cp), "'"sv);
 			}
-			advance_and_return_if_error({}); // skip \n (or other single-character line ending)
+			else if (*cp != U'\n')
+				return false;
+
+			advance_and_return_if_error({}); // skip \n
 			return true;
 		}

@ -1205,7 +1167,7 @@ TOML_IMPL_NAMESPACE_START

 			do
 			{
-				if (is_vertical_whitespace(*cp))
+				if (is_ascii_vertical_whitespace(*cp))
 					return consume_line_break();
 				else
 					advance();
@ -1349,7 +1311,7 @@ TOML_IMPL_NAMESPACE_START
 						continue;
 					}

-					bool skipped_escaped_codepoint = false;
+					bool skip_escaped_codepoint = true;
 					assert_not_eof();
 					switch (const auto escaped_codepoint = *cp)
 					{
@ -1373,9 +1335,9 @@ TOML_IMPL_NAMESPACE_START
 						case U'u': [[fallthrough]];
 						case U'U':
 						{
-							push_parse_scope("unicode scalar escape sequence"sv);
+							push_parse_scope("unicode scalar sequence"sv);
 							advance_and_return_if_error_or_eof({});
-							skipped_escaped_codepoint = true;
+							skip_escaped_codepoint = false;

 							uint32_t place_value =
 								escaped_codepoint == U'U' ? 0x10000000u : (escaped_codepoint == U'u' ? 0x1000u : 0x10u);
@ -1395,25 +1357,28 @@ TOML_IMPL_NAMESPACE_START
 									"unicode surrogates (U+D800 - U+DFFF) are explicitly prohibited"sv);
 							else if (sequence_value > 0x10FFFFu)
 								set_error_and_return_default("values greater than U+10FFFF are invalid"sv);
-							else if (sequence_value <= 0x7Fu) // ascii
-								str += static_cast<char>(sequence_value & 0x7Fu);
-							else if (sequence_value <= 0x7FFu)
+
+							if (sequence_value < 0x80)
 							{
-								str += static_cast<char>(0xC0u | ((sequence_value >> 6) & 0x1Fu));
-								str += static_cast<char>(0x80u | (sequence_value & 0x3Fu));
+								str += static_cast<char>(sequence_value);
 							}
-							else if (sequence_value <= 0xFFFFu)
+							else if (sequence_value < 0x800u)
 							{
-								str += static_cast<char>(0xE0u | ((sequence_value >> 12) & 0x0Fu));
-								str += static_cast<char>(0x80u | ((sequence_value >> 6) & 0x1Fu));
-								str += static_cast<char>(0x80u | (sequence_value & 0x3Fu));
+								str += static_cast<char>((sequence_value >> 6) | 0xC0u);
+								str += static_cast<char>((sequence_value & 0x3Fu) | 0x80u);
 							}
-							else
+							else if (sequence_value < 0x10000u)
 							{
-								str += static_cast<char>(0xF0u | ((sequence_value >> 18) & 0x07u));
-								str += static_cast<char>(0x80u | ((sequence_value >> 12) & 0x3Fu));
-								str += static_cast<char>(0x80u | ((sequence_value >> 6) & 0x3Fu));
-								str += static_cast<char>(0x80u | (sequence_value & 0x3Fu));
+								str += static_cast<char>((sequence_value >> 12) | 0xE0u);
+								str += static_cast<char>(((sequence_value >> 6) & 0x3Fu) | 0x80u);
+								str += static_cast<char>((sequence_value & 0x3Fu) | 0x80u);
+							}
+							else if (sequence_value < 0x110000u)
+							{
+								str += static_cast<char>((sequence_value >> 18) | 0xF0u);
+								str += static_cast<char>(((sequence_value >> 12) & 0x3Fu) | 0x80u);
+								str += static_cast<char>(((sequence_value >> 6) & 0x3Fu) | 0x80u);
+								str += static_cast<char>((sequence_value & 0x3Fu) | 0x80u);
 							}
 							break;
 						}
@ -1422,8 +1387,7 @@ TOML_IMPL_NAMESPACE_START
 						default: set_error_and_return_default("unknown escape sequence '\\"sv, to_sv(*cp), "'"sv);
 					}

-					// skip the escaped character
-					if (!skipped_escaped_codepoint)
+					if (skip_escaped_codepoint)
 						advance_and_return_if_error_or_eof({});
 				}
 				else
@ -1492,7 +1456,7 @@ TOML_IMPL_NAMESPACE_START
 					}

 					// handle line endings in multi-line mode
-					if (multi_line && is_vertical_whitespace(*cp))
+					if (multi_line && is_ascii_vertical_whitespace(*cp))
 					{
 						consume_line_break();
 						return_if_error({});
@ -1608,7 +1572,7 @@ TOML_IMPL_NAMESPACE_START
 				}

 				// handle line endings in multi-line mode
-				if (multi_line && is_vertical_whitespace(*cp))
+				if (multi_line && is_ascii_vertical_whitespace(*cp))
 				{
 					consume_line_break();
 					return_if_error({});
@ -2275,7 +2239,8 @@ TOML_IMPL_NAMESPACE_START
 			TOML_ASSERT_ASSUME(is_decimal_digit(*cp));
 			push_parse_scope("time"sv);

-			static constexpr size_t max_digits = 9;
+			static constexpr size_t max_digits = 64; // far more than necessary but needed to allow fractional
+													 // millisecond truncation per the spec
 			uint32_t digits[max_digits];

 			// "HH"
@ -2344,15 +2309,14 @@ TOML_IMPL_NAMESPACE_START
 				else if (!part_of_datetime && !is_value_terminator(*cp))
 					set_error_and_return_default("expected value-terminator, saw '"sv, to_sv(*cp), "'"sv);
 			}
-
 			uint32_t value = 0u;
 			uint32_t place = 1u;
-			for (auto i = digit_count; i-- > 0u;)
+			for (auto i = impl::min<size_t>(digit_count, 9u); i-- > 0u;)
 			{
 				value += digits[i] * place;
 				place *= 10u;
 			}
-			for (auto i = digit_count; i < max_digits; i++) // implicit zeros
+			for (auto i = digit_count; i < 9u; i++) // implicit zeros
 				value *= 10u;
 			time.nanosecond = value;
 			return time;
--- a/include/toml++/impl/print_to_stream.h
+++ b/include/toml++/impl/print_to_stream.h
@ -30,28 +30,28 @@ TOML_IMPL_NAMESPACE_START
 	void print_to_stream(std::ostream&, char);

 	TOML_API
-	void print_to_stream(std::ostream&, int8_t, value_flags = {});
+	void print_to_stream(std::ostream&, int8_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, int16_t, value_flags = {});
+	void print_to_stream(std::ostream&, int16_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, int32_t, value_flags = {});
+	void print_to_stream(std::ostream&, int32_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, int64_t, value_flags = {});
+	void print_to_stream(std::ostream&, int64_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, uint8_t, value_flags = {});
+	void print_to_stream(std::ostream&, uint8_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, uint16_t, value_flags = {});
+	void print_to_stream(std::ostream&, uint16_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, uint32_t, value_flags = {});
+	void print_to_stream(std::ostream&, uint32_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
-	void print_to_stream(std::ostream&, uint64_t, value_flags = {});
+	void print_to_stream(std::ostream&, uint64_t, value_flags = {}, size_t min_digits = 0);

 	TOML_API
 	void print_to_stream(std::ostream&, float, value_flags = {});
@ -111,24 +111,6 @@ TOML_IMPL_NAMESPACE_START

 #endif

-	template <typename T>
-	inline void print_to_stream_with_escapes(std::ostream & stream, const T& str)
-	{
-		for (auto c : str)
-		{
-			if TOML_UNLIKELY(c >= '\x00' && c <= '\x1F')
-				print_to_stream(stream, low_character_escape_table[c]);
-			else if TOML_UNLIKELY(c == '\x7F')
-				print_to_stream(stream, "\\u007F"sv);
-			else if TOML_UNLIKELY(c == '"')
-				print_to_stream(stream, "\\\""sv);
-			else if TOML_UNLIKELY(c == '\\')
-				print_to_stream(stream, "\\\\"sv);
-			else
-				print_to_stream(stream, c);
-		}
-	}
-
 	template <typename T, typename U>
 	inline void print_to_stream_bookended(std::ostream & stream, const T& val, const U& bookend)
 	{
--- a/include/toml++/impl/print_to_stream.inl
+++ b/include/toml++/impl/print_to_stream.inl
@ -69,11 +69,16 @@ TOML_ANON_NAMESPACE_START

 	template <typename T>
 	TOML_INTERNAL_LINKAGE
-	void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {})
+	void print_integer_to_stream(std::ostream & stream, T val, value_flags format = {}, size_t min_digits = 0)
 	{
 		if (!val)
 		{
-			stream.put('0');
+			if (!min_digits)
+				min_digits = 1;
+
+			for (size_t i = 0; i < min_digits; i++)
+				stream.put('0');
+
 			return;
 		}

@ -82,7 +87,7 @@ TOML_ANON_NAMESPACE_START
 		format &= value_flags_mask;

 		int base = 10;
-		if (format != value_flags::none && val >= T{})
+		if (format != value_flags::none && val > T{})
 		{
 			switch (format)
 			{
@ -98,6 +103,8 @@ TOML_ANON_NAMESPACE_START
 		char buf[(sizeof(T) * CHAR_BIT)];
 		const auto res = std::to_chars(buf, buf + sizeof(buf), val, base);
 		const auto len = static_cast<size_t>(res.ptr - buf);
+		for (size_t i = len; i < min_digits; i++)
+			stream.put('0');
 		if (base == 16)
 		{
 			for (size_t i = 0; i < len; i++)
@ -111,12 +118,16 @@ TOML_ANON_NAMESPACE_START
 		using unsigned_type = std::conditional_t<(sizeof(T) > sizeof(unsigned)), std::make_unsigned_t<T>, unsigned>;
 		using cast_type		= std::conditional_t<std::is_signed_v<T>, std::make_signed_t<unsigned_type>, unsigned_type>;

-		if TOML_UNLIKELY(format == value_flags::format_as_binary)
+		if (base == 2)
 		{
+			const auto len = sizeof(T) * CHAR_BIT;
+			for (size_t i = len; i < min_digits; i++)
+				stream.put('0');
+
 			bool found_one	   = false;
 			const auto v	   = static_cast<unsigned_type>(val);
-			unsigned_type mask = unsigned_type{ 1 } << (sizeof(unsigned_type) * CHAR_BIT - 1u);
-			for (unsigned i = 0; i < sizeof(unsigned_type) * CHAR_BIT; i++)
+			unsigned_type mask = unsigned_type{ 1 } << (len - 1u);
+			for (size_t i = 0; i < len; i++)
 			{
 				if ((v & mask))
 				{
@ -133,6 +144,8 @@ TOML_ANON_NAMESPACE_START
 			std::ostringstream ss;
 			ss.imbue(std::locale::classic());
 			ss << std::uppercase << std::setbase(base);
+			if (min_digits)
+				ss << std::setfill('0') << std::setw(static_cast<int>(min_digits));
 			ss << static_cast<cast_type>(val);
 			const auto str = std::move(ss).str();
 			impl::print_to_stream(stream, str);
@ -194,31 +207,6 @@ TOML_ANON_NAMESPACE_START
 			default: TOML_UNREACHABLE;
 		}
 	}
-
-	template <typename T>
-	TOML_INTERNAL_LINKAGE
-	void print_integer_leftpad_zeros(std::ostream & stream, T val, size_t min_digits)
-	{
-#if TOML_INT_CHARCONV
-
-		char buf[charconv_buffer_length<T>];
-		const auto res = std::to_chars(buf, buf + sizeof(buf), val);
-		const auto len = static_cast<size_t>(res.ptr - buf);
-		for (size_t i = len; i < min_digits; i++)
-			stream.put('0');
-		impl::print_to_stream(stream, buf, static_cast<size_t>(res.ptr - buf));
-
-#else
-
-		std::ostringstream ss;
-		ss.imbue(std::locale::classic());
-		using cast_type = std::conditional_t<std::is_signed_v<T>, int64_t, uint64_t>;
-		ss << std::setfill('0') << std::setw(static_cast<int>(min_digits)) << static_cast<cast_type>(val);
-		const auto str = std::move(ss).str();
-		impl::print_to_stream(stream, str);
-
-#endif
-	}
 }
 TOML_ANON_NAMESPACE_END;

@ -250,51 +238,51 @@ TOML_IMPL_NAMESPACE_START
 	}

+	// Fixed-width integer overloads (int8_t through uint64_t). Each one forwards to the file-local
+	// print_integer_to_stream() template, passing the format flags and min_digits through unchanged.
 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, int8_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, int8_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, int16_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, int16_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, int32_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, int32_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, int64_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, int64_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, uint8_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, uint8_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, uint16_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, uint16_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, uint32_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, uint32_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
-	void print_to_stream(std::ostream & stream, uint64_t val, value_flags format)
+	void print_to_stream(std::ostream & stream, uint64_t val, value_flags format, size_t min_digits)
 	{
-		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format);
+		TOML_ANON_NAMESPACE::print_integer_to_stream(stream, val, format, min_digits);
 	}

 	TOML_EXTERNAL_LINKAGE
@ -318,21 +306,21 @@ TOML_IMPL_NAMESPACE_START
+	// Writes a date as year, month and day separated by '-', requesting a minimum of
+	// 4 digits for the year and 2 digits each for the month and day.
 	TOML_EXTERNAL_LINKAGE
 	void print_to_stream(std::ostream & stream, const toml::date& val)
 	{
-		print_integer_leftpad_zeros(stream, val.year, 4u);
+		print_to_stream(stream, val.year, {}, 4);
 		stream.put('-');
-		print_integer_leftpad_zeros(stream, val.month, 2u);
+		print_to_stream(stream, val.month, {}, 2);
 		stream.put('-');
-		print_integer_leftpad_zeros(stream, val.day, 2u);
+		print_to_stream(stream, val.day, {}, 2);
 	}

 	TOML_EXTERNAL_LINKAGE
 	void print_to_stream(std::ostream & stream, const toml::time& val)
 	{
-		print_integer_leftpad_zeros(stream, val.hour, 2u);
+		print_to_stream(stream, val.hour, {}, 2);
 		stream.put(':');
-		print_integer_leftpad_zeros(stream, val.minute, 2u);
+		print_to_stream(stream, val.minute, {}, 2);
 		stream.put(':');
-		print_integer_leftpad_zeros(stream, val.second, 2u);
+		print_to_stream(stream, val.second, {}, 2);
 		if (val.nanosecond && val.nanosecond <= 999999999u)
 		{
 			stream.put('.');
@ -343,7 +331,7 @@ TOML_IMPL_NAMESPACE_START
 				ns /= 10u;
 				digits--;
 			}
-			print_integer_leftpad_zeros(stream, ns, digits);
+			print_to_stream(stream, ns, {}, digits);
 		}
 	}

@ -367,13 +355,13 @@ TOML_IMPL_NAMESPACE_START
 		const auto hours = mins / 60;
 		if (hours)
 		{
-			print_integer_leftpad_zeros(stream, static_cast<unsigned int>(hours), 2u);
+			print_to_stream(stream, static_cast<unsigned int>(hours), {}, 2);
 			mins -= hours * 60;
 		}
 		else
 			print_to_stream(stream, "00"sv);
 		stream.put(':');
-		print_integer_leftpad_zeros(stream, static_cast<unsigned int>(mins), 2u);
+		print_to_stream(stream, static_cast<unsigned int>(mins), {}, 2);
 	}

 	TOML_EXTERNAL_LINKAGE
--- a/include/toml++/impl/toml_formatter.h
+++ b/include/toml++/impl/toml_formatter.h
@ -68,8 +68,8 @@ TOML_NAMESPACE_START
 		TOML_API
 		void print();

-		static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory flags
-																 format_flags::none, // ignored flags
+		static constexpr impl::formatter_constants constants = { format_flags::none, // mandatory
+																 format_flags::none, // ignored
 																 "inf"sv,
 																 "-inf"sv,
 																 "nan"sv,
@ -83,6 +83,7 @@ TOML_NAMESPACE_START
 		static constexpr format_flags default_flags = constants.mandatory_flags				   //
 													| format_flags::allow_literal_strings	   //
 													| format_flags::allow_multi_line_strings   //
+													| format_flags::allow_unicode_strings	   //
 													| format_flags::allow_real_tabs_in_strings //
 													| format_flags::allow_binary_integers	   //
 													| format_flags::allow_octal_integers	   //
--- a/include/toml++/impl/toml_formatter.inl
+++ b/include/toml++/impl/toml_formatter.inl
@ -14,10 +14,10 @@

 #include "toml_formatter.h"
 #include "print_to_stream.h"
-#include "utf8.h"
 #include "value.h"
 #include "table.h"
 #include "array.h"
+#include "unicode.h"
 #include "header_start.h"
 TOML_DISABLE_ARITHMETIC_WARNINGS;

--- a/include/toml++/impl/unicode.h
+++ b/include/toml++/impl/unicode.h
@ -102,15 +102,21 @@ TOML_IMPL_NAMESPACE_START
 	}

+	// True for code points in the inclusive range U+000A to U+000D (line feed, vertical tab, form feed, carriage return).
 	TOML_CONST_GETTER
-	constexpr bool is_vertical_whitespace(char32_t c) noexcept
+	constexpr bool is_ascii_vertical_whitespace(char32_t c) noexcept
 	{
-		return (U'\n' <= c && c <= U'\r') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85';
+		return U'\n' <= c && c <= U'\r';
 	}

+	// True for U+2028, U+2029 and U+0085, the vertical whitespace code points that lie outside the ASCII range.
 	TOML_CONST_GETTER
-	constexpr bool is_vertical_whitespace_excl_cr(char32_t c) noexcept
+	constexpr bool is_non_ascii_vertical_whitespace(char32_t c) noexcept
 	{
-		return (U'\n' <= c && c <= U'\f') || (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85';
+		return (U'\u2028' <= c && c <= U'\u2029') || c == U'\x85';
+	}
+
+	// True when the code point satisfies either is_ascii_vertical_whitespace() or is_non_ascii_vertical_whitespace().
+	TOML_CONST_GETTER
+	constexpr bool is_vertical_whitespace(char32_t c) noexcept
+	{
+		return is_ascii_vertical_whitespace(c) || is_non_ascii_vertical_whitespace(c);
 	}

 	TOML_CONST_GETTER
@ -122,6 +128,11 @@ TOML_IMPL_NAMESPACE_START
 	TOML_CONST_GETTER
 	constexpr bool is_ascii_bare_key_character(char32_t c) noexcept
 	{
+#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys)
+		if (c == U'+')
+			return true;
+#endif
+
 		if (c < U'-' || c > U'z')
 			return false;

@ -861,8 +872,7 @@ TOML_IMPL_NAMESPACE_START
 	constexpr bool is_bare_key_character(char32_t c) noexcept
 	{
 		return is_ascii_bare_key_character(c)
-#if TOML_LANG_UNRELEASED // toml/issues/644 ('+' in bare keys) & toml/issues/687 (unicode bare keys)
-			|| c == U'+' //
+#if TOML_LANG_UNRELEASED // toml/issues/687 (unicode bare keys)
 			|| is_non_ascii_bare_key_character(c)
 #endif
 			;
@ -874,6 +884,12 @@ TOML_IMPL_NAMESPACE_START
 		return is_whitespace(c) || c == U']' || c == U'}' || c == U',' || c == U'#';
 	}

+	// Checks whether a single byte is an ASCII control character (0x00-0x1F or 0x7F).
+	//
+	// The byte is compared as an unsigned value: on platforms where `char` is signed, the code units
+	// 0x80-0xFF of a multi-byte UTF-8 sequence are negative and would otherwise satisfy `c <= 0x1F`,
+	// being misreported as control characters (and disagreeing with the char32_t overload).
+	TOML_CONST_GETTER
+	constexpr bool is_control_character(char c) noexcept
+	{
+		return static_cast<unsigned char>(c) <= 0x1Fu || static_cast<unsigned char>(c) == 0x7Fu;
+	}
+
 	TOML_CONST_GETTER
 	constexpr bool is_control_character(char32_t c) noexcept
 	{
@ -949,12 +965,22 @@ TOML_IMPL_NAMESPACE_START
 			state = state_table[state + uint_least32_t{ 256u } + type];
 		}

+		// Convenience overload for plain chars: casts the char to uint8_t and forwards it to operator().
+		TOML_ALWAYS_INLINE
+		constexpr void operator()(char c) noexcept
+		{
+			operator()(static_cast<uint8_t>(c));
+		}
+
 		TOML_ALWAYS_INLINE
 		constexpr void reset() noexcept
 		{
 			state = {};
 		}
 	};
+
+	// Returns true when none of the `len` chars starting at `str` has an unsigned value above 127.
+	// Declared here; the definition lives in unicode.inl.
+	TOML_PURE_GETTER
+	TOML_ATTR(nonnull)
+	bool is_ascii(const char* str, size_t len) noexcept;
 }
 TOML_IMPL_NAMESPACE_END;

--- a/include/toml++/impl/unicode.inl
+++ b/include/toml++/impl/unicode.inl
@ -0,0 +1,59 @@
+//# This file is a part of toml++ and is subject to the terms of the MIT license.
+//# Copyright (c) Mark Gillard <mark.gillard@outlook.com.au>
+//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
+// SPDX-License-Identifier: MIT
+#pragma once
+
+//# {{
+#include "preprocessor.h"
+#if !TOML_IMPLEMENTATION
+#error This is an implementation-only header.
+#endif
+//# }}
+
+#include "unicode.h"
+#include "simd.h"
+#include "header_start.h"
+
+TOML_IMPL_NAMESPACE_START
+{
+	// Returns true when every one of the `len` chars starting at `str` has an unsigned value of 127 or less.
+	TOML_EXTERNAL_LINKAGE
+	bool is_ascii(const char* str, size_t len) noexcept
+	{
+		const char* const end = str + len;
+
+#if TOML_HAS_SSE2 && (128 % CHAR_BIT) == 0
+		{
+			// number of chars that fit into one 128-bit vector
+			constexpr size_t chars_per_vector = 128u / CHAR_BIT;
+
+			// `simdable` is len rounded down to a whole number of vectors; zero skips the vector path
+			if (const size_t simdable = len - (len % chars_per_vector))
+			{
+				// OR every vector into `mask`, so a high bit set in any input byte survives in it
+				__m128i mask = _mm_setzero_si128();
+				for (const char* const e = str + simdable; str < e; str += chars_per_vector)
+				{
+					const __m128i current_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(str));
+					mask						= _mm_or_si128(mask, current_bytes);
+				}
+				// signed compare 0 > byte: lanes of `mask` with the high bit set become all-ones
+				const __m128i has_error = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
+
+#if TOML_HAS_SSE4_1
+				// testz yields 1 only when has_error is entirely zero
+				if (!_mm_testz_si128(has_error, has_error))
+					return false;
+#else
+				// SSE2-only form: all 16 lanes must compare equal to zero (movemask 0xFFFF)
+				if (_mm_movemask_epi8(_mm_cmpeq_epi8(has_error, _mm_setzero_si128())) != 0xFFFF)
+					return false;
+#endif
+			}
+		}
+#endif
+
+		// scalar pass over whatever the vector loop did not consume (`str` was advanced by it);
+		// this is the whole range when the SSE2 path is compiled out
+		for (; str < end; str++)
+			if (static_cast<unsigned char>(*str) > 127u)
+				return false;
+
+		return true;
+	}
+}
+TOML_IMPL_NAMESPACE_END;
+
+#include "header_end.h"
--- a/include/toml++/impl/yaml_formatter.h
+++ b/include/toml++/impl/yaml_formatter.h
@ -73,6 +73,7 @@ TOML_NAMESPACE_START
 		/// \brief	The default flags for a yaml_formatter.
 		static constexpr format_flags default_flags = constants.mandatory_flags			  //
 													| format_flags::allow_literal_strings //
+													| format_flags::allow_unicode_strings //
 													| format_flags::allow_octal_integers  //
 													| format_flags::allow_hexadecimal_integers;

--- a/include/toml++/toml.h
+++ b/include/toml++/toml.h
@ -39,7 +39,7 @@ TOML_DISABLE_SUGGEST_ATTR_WARNINGS;
 #include "impl/array.h"
 #include "impl/key.h"
 #include "impl/table.h"
-#include "impl/utf8.h"
+#include "impl/unicode.h"
 #include "impl/parse_error.h"
 #include "impl/parse_result.h"
 #include "impl/parser.h"
@ -57,6 +57,7 @@ TOML_DISABLE_SUGGEST_ATTR_WARNINGS;
 #include "impl/value.inl"
 #include "impl/array.inl"
 #include "impl/table.inl"
+#include "impl/unicode.inl"
 #include "impl/parser.inl"
 #include "impl/formatter.inl"
 #include "impl/toml_formatter.inl"
--- a/tests/formatters.cpp
+++ b/tests/formatters.cpp
@ -24,7 +24,7 @@ namespace
 		friend std::ostream& operator<<(std::ostream& os, const char32_printer& p)
 		{
 			if (p.value <= U'\x1F')
-				return os << '\'' << impl::low_character_escape_table[static_cast<size_t>(p.value)] << '\'';
+				return os << '\'' << impl::control_char_escapes[static_cast<size_t>(p.value)] << '\'';
 			else if (p.value == U'\x7F')
 				return os << "'\\u007F'"sv;
 			else if (p.value < 127u)
@ -52,6 +52,7 @@ namespace
 	{
 		string_difference diff{ { 1u, 1u } };
 		impl::utf8_decoder a, b;
+
 		for (size_t i = 0, e = std::min(str_a.length(), str_b.length()); i < e; i++, diff.index++)
 		{
 			a(static_cast<uint8_t>(str_a[i]));
@ -76,7 +77,7 @@ namespace
 				return diff;
 			}

-			if (impl::is_vertical_whitespace_excl_cr(a.codepoint))
+			if (a.codepoint == U'\n')
 			{
 				diff.position.line++;
 				diff.position.column = 1u;
--- a/tests/tests.h
+++ b/tests/tests.h
@ -198,9 +198,9 @@ inline bool parse_expected_value(std::string_view test_file,
 			if (!decoder.has_code_point())
 				continue;

-			if (impl::is_vertical_whitespace_excl_cr(decoder.codepoint))
+			if (impl::is_ascii_vertical_whitespace(decoder.codepoint))
 			{
-				if (decoder.codepoint != U'\r')
+				if (decoder.codepoint == U'\n')
 				{
 					pos.line++;
 					pos.column = source_index{ 1 };
--- a/tests/user_feedback.cpp
+++ b/tests/user_feedback.cpp
@ -192,4 +192,42 @@ b = []
 		)",
 							4);
 	}
+
+	SECTION("github/issues/125") // https://github.com/marzer/tomlplusplus/issues/125
+	{
+		parse_expected_value(FILE_LINE_ARGS, R"("\u0800")"sv, "\xE0\xA0\x80"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u7840")"sv, "\xE7\xA1\x80"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\uAA23")"sv, "\xEA\xA8\xA3"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\uA928")"sv, "\xEA\xA4\xA8"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u9CBF")"sv, "\xE9\xB2\xBF"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u2247")"sv, "\xE2\x89\x87"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u13D9")"sv, "\xE1\x8F\x99"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u69FC")"sv, "\xE6\xA7\xBC"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u8DE5")"sv, "\xE8\xB7\xA5"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u699C")"sv, "\xE6\xA6\x9C"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u8CD4")"sv, "\xE8\xB3\x94"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u4ED4")"sv, "\xE4\xBB\x94"sv);
+		parse_expected_value(FILE_LINE_ARGS, R"("\u2597")"sv, "\xE2\x96\x97"sv);
+	}
+
+	SECTION("github/issues/127") // https://github.com/marzer/tomlplusplus/issues/127
+	{
+		parse_expected_value(FILE_LINE_ARGS,
+							 "12:34:56.11122233345678"sv,
+							 toml::time{
+								 12,
+								 34,
+								 56,
+								 111222333u // should truncate the .45678 part
+							 });
+	}
+
+	SECTION("github/issues/128") // https://github.com/marzer/tomlplusplus/issues/128
+	{
+		parsing_should_fail(FILE_LINE_ARGS, "\f"sv);
+		parsing_should_fail(FILE_LINE_ARGS, "\v"sv);
+		parsing_should_succeed(FILE_LINE_ARGS, " "sv);
+		parsing_should_succeed(FILE_LINE_ARGS, "\t"sv);
+		parsing_should_succeed(FILE_LINE_ARGS, "\n"sv);
+	}
 }
--- a/toml++.vcxproj
+++ b/toml++.vcxproj
@ -72,7 +72,7 @@
    <ClInclude Include="include\toml++\impl\std_vector.h" />
    <ClInclude Include="include\toml++\impl\table.h" />
    <ClInclude Include="include\toml++\impl\table.inl" />
-    <ClInclude Include="include\toml++\impl\utf8.h" />
+    <ClInclude Include="include\toml++\impl\unicode.h" />
    <ClInclude Include="include\toml++\impl\value.h" />
    <ClInclude Include="include\toml++\impl\value.inl" />
    <ClInclude Include="include\toml++\impl\value_extern.inl" />
@ -95,6 +95,7 @@
    <None Include="CHANGELOG.md" />
    <None Include="CODE_OF_CONDUCT.md" />
    <None Include="CONTRIBUTING.md" />
+    <None Include="include\toml++\impl\unicode.inl" />
    <None Include="include\toml++\impl\node_view_extern.inl" />
    <None Include="include\toml++\impl\yaml_formatter.inl" />
    <None Include="LICENSE" />
--- a/toml++.vcxproj.filters
+++ b/toml++.vcxproj.filters
@ -61,7 +61,7 @@
    <ClInclude Include="include\toml++\impl\table.inl">
      <Filter>include\impl</Filter>
    </ClInclude>
-    <ClInclude Include="include\toml++\impl\utf8.h">
+    <ClInclude Include="include\toml++\impl\unicode.h">
      <Filter>include\impl</Filter>
    </ClInclude>
    <ClInclude Include="include\toml++\impl\value.h">
@ -214,6 +214,9 @@
    <None Include="tools\generate_single_header.bat">
      <Filter>tools</Filter>
    </None>
+    <None Include="include\toml++\impl\unicode.inl">
+      <Filter>include\impl</Filter>
+    </None>
  </ItemGroup>
  <ItemGroup>
    <Filter Include=".circleci">
--- a/toml-test/tt.hpp
+++ b/toml-test/tt.hpp
--- a/toml-test/tt_decoder.cpp
+++ b/toml-test/tt_decoder.cpp
@ -3,7 +3,7 @@
 //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
 // SPDX-License-Identifier: MIT

-#include "tt.hpp"
+#include "tt.h"

 using nlohmann::json;
 using namespace std::string_view_literals;
@ -85,30 +85,29 @@ TOML_NAMESPACE_END;

+// Reads all of stdin as a TOML document, parses it, and writes the resulting nlohmann::json value to stdout.
+// Any failure is reported on stderr (preceded by two newlines) and the process returns 1.
 int main()
 {
-	json j;
 	try
 	{
 		const std::string str(std::istreambuf_iterator<char>{ std::cin }, std::istreambuf_iterator<char>{});

-		j = toml::parse(str, "stdin"sv);
+		json j = toml::parse(str, "stdin"sv);
+
+		std::cout << j << "\n";
 	}
 	catch (const toml::parse_error& err)
 	{
-		std::cerr << err << "\n";
+		std::cerr << "\n\n" << err << "\n";
 		return 1;
 	}
 	catch (const std::exception& exc)
 	{
-		std::cerr << exc.what() << "\n";
+		std::cerr << "\n\n" << exc.what() << "\n";
 		return 1;
 	}
 	catch (...)
 	{
-		std::cerr << "An unspecified error occurred.\n";
+		std::cerr << "\n\nAn unspecified error occurred.\n";
 		return 1;
 	}

-	std::cout << j << "\n";
-
 	return 0;
 }
--- a/toml-test/tt_decoder.vcxproj
+++ b/toml-test/tt_decoder.vcxproj
@ -53,7 +53,7 @@
    <None Include="README.md" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="tt.hpp" />
+    <ClInclude Include="tt.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/toml-test/tt_encoder.cpp
+++ b/toml-test/tt_encoder.cpp
@ -3,7 +3,7 @@
 //# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
 // SPDX-License-Identifier: MIT

-#include "tt.hpp"
+#include "tt.h"

 using nlohmann::json;
 using namespace std::string_view_literals;
@ -191,12 +191,13 @@ TOML_NAMESPACE_END;

 int main()
 {
-	toml::table tbl;
 	try
 	{
 		const std::string str(std::istreambuf_iterator<char>{ std::cin }, std::istreambuf_iterator<char>{});

-		tbl = json::parse(str);
+		toml::table tbl = json::parse(str);
+
+		std::cout << tbl << "\n";
 	}
 	catch (const std::exception& exc)
 	{
@ -209,7 +210,5 @@ int main()
 		return 1;
 	}

-	std::cout << tbl << "\n";
-
 	return 0;
 }
--- a/toml-test/tt_encoder.vcxproj
+++ b/toml-test/tt_encoder.vcxproj
@ -53,7 +53,7 @@
    <None Include="README.md" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="tt.hpp" />
+    <ClInclude Include="tt.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/toml.hpp
+++ b/toml.hpp