tomlplusplus/include/toml++/toml_utf8_streams.h

//# This file is a part of toml++ and is subject to the terms of the MIT license.
//# Copyright (c) 2019-2020 Mark Gillard <mark.gillard@outlook.com.au>
//# See https://github.com/marzer/tomlplusplus/blob/master/LICENSE for the full license text.
// SPDX-License-Identifier: MIT

#pragma once
//# {{
#include "toml_preprocessor.h"
#if !TOML_PARSER
	#error This header cannot be included when TOML_PARSER is disabled.
#endif
//# }}
#include "toml_utf8.h"
#include "toml_parse_error.h"

TOML_PUSH_WARNINGS
TOML_DISABLE_PADDING_WARNINGS
TOML_DISABLE_MISC_WARNINGS

TOML_IMPL_NAMESPACE_START
{
	// Primary template of the byte-stream adapter. It is only declared here; the usable
	// definitions are the partial specializations for std::basic_string_view and
	// std::basic_istream that follow in this file.
	template <typename T>
	class utf8_byte_stream;

	// The three-byte UTF-8 byte order mark (EF BB BF). The stream specializations compare the
	// first three bytes of their input against this and skip them when they match.
	inline constexpr auto utf8_byte_order_mark = "\xEF\xBB\xBF"sv;

	// Byte stream over an in-memory string view of single-byte characters.
	//
	// On construction any trailing '\0' characters are dropped from the view and a leading
	// UTF-8 byte order mark, if present, is skipped. Bytes are then handed out one at a time
	// via operator().
	template <typename Char>
	class TOML_API utf8_byte_stream<std::basic_string_view<Char>> final
	{
		static_assert(sizeof(Char) == 1_sz);

		private:
			std::basic_string_view<Char> source; // the (trimmed) input
			size_t position = {};                // index of the next byte to hand out

		public:
			// Wraps `sv`. The view is not copied, so the viewed memory must outlive the stream.
			explicit constexpr utf8_byte_stream(std::basic_string_view<Char> sv) noexcept
				: source{ sv }
			{
				// trim trailing nulls; walking the length down to zero means a view made up
				// entirely of nulls is trimmed to empty as well, instead of being left untouched
				size_t actual_len = source.length();
				while (actual_len > 0_sz && source[actual_len - 1_sz] == Char{})
					--actual_len;
				if (actual_len != source.length())
					source = source.substr(0_sz, actual_len);

				// skip bom
				if (source.length() >= 3_sz && memcmp(utf8_byte_order_mark.data(), source.data(), 3_sz) == 0)
					position += 3_sz;
			}

			// True once every byte of the (trimmed) view has been consumed.
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			constexpr bool eof() const noexcept
			{
				return position >= source.length();
			}

			// True if the next read would hit the end of input; for an in-memory view this is
			// the same condition as eof().
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			constexpr bool peek_eof() const noexcept
			{
				return eof();
			}

			// Always false: this specialization has no failure state to report.
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			constexpr bool error() const noexcept
			{
				return false;
			}

			// Returns the next byte as a value in [0, 255] and advances, or 0xFFFFFFFF when the
			// end of the input has been reached.
			[[nodiscard]]
			constexpr unsigned int operator() () noexcept
			{
				if (position >= source.length())
					return 0xFFFFFFFFu;
				// go through uint8_t first so a signed Char does not sign-extend
				return static_cast<unsigned int>(static_cast<uint8_t>(source[position++]));
			}
	};

	// Byte stream over a std::basic_istream of single-byte characters.
	//
	// The stream is held by pointer and is not owned. On construction the first three bytes
	// are inspected: a UTF-8 byte order mark is consumed, anything else is put back by seeking
	// to the position the stream had before the probe.
	template <typename Char>
	class TOML_API utf8_byte_stream<std::basic_istream<Char>> final
	{
		static_assert(sizeof(Char) == 1_sz);

		private:
			std::basic_istream<Char>* source;

		public:
			explicit utf8_byte_stream(std::basic_istream<Char>& stream)
				: source{ &stream }
			{
				// a stream that is already at eof or in a fail/bad state is left untouched
				if (!source->good())
					return;

				// remember the starting position so a non-BOM prefix can be rewound
				const auto start_pos = source->tellg();
				Char prefix[3];
				source->read(prefix, 3);

				// unrecoverable read error: leave the stream as it is
				if (source->bad())
					return;

				// a complete BOM was read: it stays consumed
				const bool found_bom = source->gcount() == 3
					&& memcmp(utf8_byte_order_mark.data(), prefix, 3_sz) == 0;
				if (found_bom)
					return;

				// not a BOM (or fewer than three bytes available): reset the state flags the
				// short read may have set and go back to where we started
				source->clear();
				source->seekg(start_pos, std::ios::beg);
			}

			// True when the underlying stream's eof flag is set.
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			bool eof() const noexcept
			{
				return source->eof();
			}

			// True when the eof flag is already set, or when peeking at the next character
			// yields the traits' eof value.
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			bool peek_eof() const
			{
				if (eof())
					return true;

				using char_traits_type = typename std::basic_istream<Char>::traits_type;
				return source->peek() == char_traits_type::eof();
			}

			// True when the underlying stream reports failure (failbit or badbit set).
			[[nodiscard]]
			TOML_ALWAYS_INLINE
			bool error() const noexcept
			{
				return source->fail();
			}

			// Extracts one character and returns it converted to unsigned int, or 0xFFFFFFFF
			// when the stream yields its eof value instead.
			[[nodiscard]]
			unsigned int operator() ()
			{
				using char_traits_type = typename std::basic_istream<Char>::traits_type;

				const auto extracted = source->get();
				return extracted == char_traits_type::eof()
					? 0xFFFFFFFFu
					: static_cast<unsigned int>(extracted);
			}
	};

	TOML_ABI_NAMESPACE_BOOL(TOML_LARGE_FILES, lf, sf)

	// One decoded code point: its UTF-32 value, the raw bytes it was decoded from, and the
	// source position it was read at. Kept trivial and standard-layout (asserted below).
	struct utf8_codepoint final
	{
		char32_t value;
		char bytes[4];
		source_position position;

		// Returns a view over the raw bytes stored in `bytes`.
		[[nodiscard]]
		std::string_view as_view() const noexcept
		{
			// all four slots in use: there is no terminating null, so pass the length explicitly
			if (bytes[3])
				return std::string_view{ bytes, 4_sz };

			// otherwise bytes[3] is '\0' and the array can be read as a null-terminated string
			return std::string_view{ bytes };
		}

		// Implicit access to the decoded value (mutable).
		[[nodiscard]]
		TOML_ATTR(pure)
		constexpr operator char32_t& () noexcept
		{
			return value;
		}

		// Implicit access to the decoded value (read-only).
		[[nodiscard]]
		TOML_ATTR(pure)
		constexpr operator const char32_t& () const noexcept
		{
			return value;
		}

		// Dereference-style access to the decoded value.
		[[nodiscard]]
		TOML_ATTR(pure)
		constexpr const char32_t& operator* () const noexcept
		{
			return value;
		}
	};
	static_assert(std::is_trivial_v<utf8_codepoint>);
	static_assert(std::is_standard_layout_v<utf8_codepoint>);

	TOML_ABI_NAMESPACE_END // TOML_LARGE_FILES

	TOML_ABI_NAMESPACE_BOOL(TOML_EXCEPTIONS, ex, noex)

	// Error-reporting helpers used by the readers below, selected by TOML_EXCEPTIONS:
	//
	//   exceptions enabled:  TOML_ERROR(args...) throws parse_error{args...};
	//                        TOML_ERROR_CHECK is a no-op.
	//   exceptions disabled: TOML_ERROR(args...) constructs the error in place in a variable
	//                        named `err` (which must be in scope at the point of use);
	//                        TOML_ERROR_CHECK returns nullptr from the enclosing function when
	//                        `err` holds a value.
	//
	// NOTE: in the no-exceptions form TOML_ERROR_CHECK expands to an unbraced `if`, so it
	// should be used as a statement of its own.
	#if TOML_EXCEPTIONS
		#define TOML_ERROR_CHECK	(void)0
		#define TOML_ERROR			throw parse_error
	#else
		#define TOML_ERROR_CHECK	if (err) return nullptr
		#define TOML_ERROR			err.emplace
	#endif

	// Abstract interface for a source of decoded UTF-8 code points. Implemented in this file by
	// utf8_reader<T> and utf8_buffered_reader.
	struct TOML_INTERFACE utf8_reader_interface
	{
		// Returns the source path pointer associated with the input.
		// (utf8_reader leaves it default-constructed when it was given an empty path.)
		[[nodiscard]]
		virtual const source_path_ptr& source_path() const noexcept = 0;

		// Reads the next code point and returns a pointer to it.
		// utf8_reader returns nullptr once the input is exhausted and, when exceptions are
		// disabled, also after an error has been recorded.
		[[nodiscard]]
		virtual const utf8_codepoint* read_next() = 0;

		// Reports whether the next read would reach the end of the input.
		[[nodiscard]]
		virtual bool peek_eof() const = 0;

		#if !TOML_EXCEPTIONS

		// Only present when exceptions are disabled: hands back the recorded parse error, if any,
		// as an rvalue reference to the optional holding it (utf8_reader returns its member via
		// std::move).
		[[nodiscard]]
		virtual optional<parse_error>&& error() noexcept = 0;

		#endif

		virtual ~utf8_reader_interface() noexcept = default;
	};

	// Reads bytes from a utf8_byte_stream<T>, runs them through utf8_decoder and hands out one
	// utf8_codepoint per read_next() call, stamping each with a line/column position.
	//
	// Errors (stream failure, invalid or truncated UTF-8) are reported through TOML_ERROR:
	// thrown as parse_error when exceptions are enabled, otherwise stored in `err` and signalled
	// by read_next() returning nullptr.
	template <typename T>
	class TOML_EMPTY_BASES TOML_API utf8_reader final
		: public utf8_reader_interface
	{
		private:
			utf8_byte_stream<T> stream;
			utf8_decoder decoder;
			// Two slots used alternately: codepoints[cp_idx % 2] is the one being filled
			// ('current'), the other ('prev') is the one that was filled before it.
			utf8_codepoint codepoints[2];
			size_t cp_idx = 1;
			// number of bytes written into the current slot's `bytes` so far
			uint8_t current_byte_count{};
			source_path_ptr source_path_;
			#if !TOML_EXCEPTIONS
			// holds the first error raised via TOML_ERROR when exceptions are disabled
			optional<parse_error> err;
			#endif

		public:

			// Constructs the byte stream from `source` and, if `source_path` is non-empty, stores
			// a shared copy of it. Both code point slots start zeroed at position { 1, 1 }.
			template <typename U, typename String = std::string_view>
			explicit utf8_reader(U && source, String&& source_path = {})
				noexcept(std::is_nothrow_constructible_v<utf8_byte_stream<T>, U&&>)
				: stream{ std::forward<U>(source) }
			{
				std::memset(codepoints, 0, sizeof(codepoints));
				codepoints[0].position = { 1, 1 };
				codepoints[1].position = { 1, 1 };

				if (!source_path.empty())
					source_path_ = std::make_shared<const std::string>(std::forward<String>(source_path));
			}

			// Returns the stored source path pointer (default-constructed if no path was given).
			[[nodiscard]]
			const source_path_ptr& source_path() const noexcept override
			{
				return source_path_;
			}

			// Decodes and returns the next code point.
			//
			// Returns a pointer into this reader's internal two-slot buffer, or nullptr when the
			// stream is at eof (or, with exceptions disabled, when an error is pending or was just
			// recorded). With exceptions enabled, errors are thrown as parse_error instead.
			[[nodiscard]]
			const utf8_codepoint* read_next() override
			{
				// no-exceptions builds: refuse to continue once an error has been recorded
				TOML_ERROR_CHECK;

				auto& prev = codepoints[(cp_idx - 1_sz) % 2_sz];

				// check the state left behind by earlier calls before reading anything
				if (stream.eof())
					return nullptr;
				else if (stream.error())
					TOML_ERROR("An error occurred while reading from the underlying stream", prev.position, source_path_ );
				else if (decoder.error())
					TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ );

				TOML_ERROR_CHECK;

				// feed bytes to the decoder until it produces a complete code point
				while (true)
				{
					uint8_t next_byte;
					{
						unsigned int next_byte_raw{ 0xFFFFFFFFu };
						// call the stream directly when it cannot throw (or exceptions are off);
						// otherwise translate anything it throws into a parse_error
						if constexpr (noexcept(stream()) || !TOML_EXCEPTIONS)
						{
							next_byte_raw = stream();
						}
						#if TOML_EXCEPTIONS
						else
						{
							try
							{
								next_byte_raw = stream();
							}
							catch (const std::exception& exc)
							{
								throw parse_error{ exc.what(), prev.position, source_path_ };
							}
							catch (...)
							{
								throw parse_error{ "An unspecified error occurred", prev.position, source_path_ };
							}
						}
						#endif

						// anything outside the byte range is the stream's "no byte" sentinel
						// (the stream specializations in this file return 0xFFFFFFFF)
						if (next_byte_raw >= 256u)
						{
							if (stream.eof())
							{
								// eof in the middle of a multi-byte sequence is an error;
								// eof on a code point boundary is a normal end of input
								if (decoder.needs_more_input())
									TOML_ERROR("Encountered EOF during incomplete utf-8 code point sequence",
										prev.position, source_path_);
								return nullptr;
							}
							else
								TOML_ERROR("An error occurred while reading from the underlying stream",
									prev.position, source_path_);
						}

						TOML_ERROR_CHECK;
						next_byte = static_cast<uint8_t>(next_byte_raw);
					}

					decoder(next_byte);
					if (decoder.error())
						TOML_ERROR( "Encountered invalid utf-8 sequence", prev.position, source_path_ );

					TOML_ERROR_CHECK;

					// record the raw byte in the slot currently being filled
					auto& current = codepoints[cp_idx % 2_sz];
					current.bytes[current_byte_count++] = static_cast<char>(next_byte);
					if (decoder.has_code_point())
					{
						//store codepoint
						current.value = decoder.codepoint;

						//reset prev (will be the next 'current')
						std::memset(prev.bytes, 0, sizeof(prev.bytes));
						current_byte_count = {};
						// pre-compute the position of the code point that will follow this one:
						// start of the next line after a line break, otherwise the next column
						if (is_line_break<false>(current.value))
							prev.position = { static_cast<source_index>(current.position.line + 1), 1 };
						else
							prev.position = { current.position.line, static_cast<source_index>(current.position.column + 1) };
						// swap the roles of the two slots for the next call
						cp_idx++;
						return &current;
					}
				}

				TOML_UNREACHABLE;
			}

			// Forwards to the underlying byte stream's peek_eof().
			[[nodiscard]]
			bool peek_eof() const override
			{
				return stream.peek_eof();
			}

			#if !TOML_EXCEPTIONS

			// Returns the stored error (if any) as an rvalue reference so the caller can move
			// it out.
			[[nodiscard]]
			optional<parse_error>&& error() noexcept override
			{
				return std::move(err);
			}

			#endif
	};

	// Deduction guides: let `utf8_reader{ source, path }` pick the utf8_reader specialization
	// from the source argument (a basic_string_view by value or a basic_istream by reference),
	// with the path given either as a std::string_view or as an rvalue std::string.
	template <typename Char>
	utf8_reader(std::basic_string_view<Char>, std::string_view) -> utf8_reader<std::basic_string_view<Char>>;
	template <typename Char>
	utf8_reader(std::basic_string_view<Char>, std::string&&) -> utf8_reader<std::basic_string_view<Char>>;
	template <typename Char>
	utf8_reader(std::basic_istream<Char>&, std::string_view) -> utf8_reader<std::basic_istream<Char>>;
	template <typename Char>
	utf8_reader(std::basic_istream<Char>&, std::string&&) -> utf8_reader<std::basic_istream<Char>>;

	// A utf8_reader_interface that wraps another reader (held by reference) and keeps a bounded
	// history of code points so that reading can be stepped back via step_back().
	// Only the declaration is visible here; the member functions are defined elsewhere.
	class TOML_EMPTY_BASES TOML_API utf8_buffered_reader final
		: public utf8_reader_interface
	{
		public:
			// upper bound on the history length, counting the 'head' code point
			static constexpr size_t max_history_length = 72;

		private:
			static constexpr size_t history_buffer_size = max_history_length - 1; //'head' is stored in the reader
			// the wrapped reader; not owned, so it must outlive this object
			utf8_reader_interface& reader;
			// storage for previously read code points
			// NOTE(review): `count`/`first` look like the fill level and start index of a
			// circular buffer - confirm against the implementation
			struct
			{

				utf8_codepoint buffer[history_buffer_size];
				size_t count, first;
			}
			history = {};
			// presumably the most recent code point obtained from `reader` - confirm
			const utf8_codepoint* head = {};
			// presumably how far the read position currently sits behind `head` after
			// step_back() - confirm
			size_t negative_offset = {};

		public:
			explicit utf8_buffered_reader(utf8_reader_interface& reader_) noexcept;
			const source_path_ptr& source_path() const noexcept override;
			const utf8_codepoint* read_next() override;
			// not part of utf8_reader_interface; presumably rewinds by `count` code points and
			// returns the code point at the new position - confirm against the implementation
			const utf8_codepoint* step_back(size_t count) noexcept;
			bool peek_eof() const override;
			#if !TOML_EXCEPTIONS
			optional<parse_error>&& error() noexcept override;
			#endif
	};

	TOML_ABI_NAMESPACE_END // TOML_EXCEPTIONS
}
TOML_IMPL_NAMESPACE_END

TOML_POP_WARNINGS // TOML_DISABLE_PADDING_WARNINGS, TOML_DISABLE_MISC_WARNINGS