diff --git a/include/fmt/format.h b/include/fmt/format.h index 6e80001f..e146476e 100644 --- a/include/fmt/format.h +++ b/include/fmt/format.h @@ -602,6 +602,7 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end, */ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e) -> const char* { + constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8}; constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; constexpr const int shiftc[] = {0, 18, 12, 6, 0}; @@ -628,6 +629,8 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e) *e |= uchar(s[3]) >> 6; *e ^= 0x2a; // top two bits of each tail byte correct? *e >>= shifte[len]; + *e |= ((uchar(s[0]) & prefix_masks[len]) != + uchar((prefix_masks[len] << 1) & 0xFF)); // first byte correct? return next; } @@ -643,8 +646,8 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) { auto error = 0; auto end = utf8_decode(buf_ptr, &cp, &error); bool result = f(error ? invalid_code_point : cp, - string_view(ptr, to_unsigned(end - buf_ptr))); - return result ? end : nullptr; + string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr))); + return result ? (error ? buf_ptr + 1 : end) : nullptr; }; auto p = s.data(); const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. diff --git a/test/ranges-test.cc b/test/ranges-test.cc index ef960dce..80931aa4 100644 --- a/test/ranges-test.cc +++ b/test/ranges-test.cc @@ -380,8 +380,15 @@ TEST(ranges_test, escape_string) { EXPECT_EQ(fmt::format("{}", vec{"\xcd\xb8"}), "[\"\\u0378\"]"); // Unassigned Unicode code points. EXPECT_EQ(fmt::format("{}", vec{"\xf0\xaa\x9b\x9e"}), "[\"\\U0002a6de\"]"); + // Broken utf-8. EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}), "[\"\\xf4\\x8f\\xbf\\xc0\"]"); + EXPECT_EQ(fmt::format("{}", vec{"\xf0\x28"}), "[\"\\xf0(\"]"); + EXPECT_EQ(fmt::format("{}", vec{"\xe1\x28"}), "[\"\\xe1(\"]"); + EXPECT_EQ(fmt::format("{}", vec{std::string("\xf0\x28\0\0anything", 12)}), + "[\"\\xf0(\\x00\\x00anything\"]"); + + // Correct utf-8. EXPECT_EQ(fmt::format("{}", vec{"понедельник"}), "[\"понедельник\"]"); } }