From 711224230b573a3b2df57391319e80e606bccddc Mon Sep 17 00:00:00 2001 From: Damien Neil Date: Thu, 15 Oct 2020 14:48:42 -0700 Subject: [PATCH] internal/encoding/text: escape Unicode control characters in strings Escape not only ASCII control characters, but Unicode as well. Change-Id: I5f5791ae51fc5624599f66ce012ecef364e7ea97 Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/262682 Trust: Damien Neil Run-TryBot: Damien Neil Reviewed-by: Joe Tsai Reviewed-by: Joe Tsai --- internal/encoding/text/encode.go | 2 +- internal/encoding/text/encode_test.go | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/internal/encoding/text/encode.go b/internal/encoding/text/encode.go index 6320fe26..aa66bdd0 100644 --- a/internal/encoding/text/encode.go +++ b/internal/encoding/text/encode.go @@ -141,7 +141,7 @@ func appendString(out []byte, in string, outputASCII bool) []byte { out = strconv.AppendUint(out, uint64(r), 16) } in = in[n:] - case outputASCII && r >= utf8.RuneSelf: + case r >= utf8.RuneSelf && (outputASCII || r <= 0x009f): out = append(out, '\\') if r <= math.MaxUint16 { out = append(out, 'u') diff --git a/internal/encoding/text/encode_test.go b/internal/encoding/text/encode_test.go index 6a658839..af74690f 100644 --- a/internal/encoding/text/encode_test.go +++ b/internal/encoding/text/encode_test.go @@ -394,17 +394,19 @@ func TestEncodeStrings(t *testing.T) { // String that has as few escaped characters as possible. in: func() string { var b []byte - for i := 0; i < utf8.RuneSelf; i++ { + for i := rune(0); i <= 0x00a0; i++ { switch i { case 0, '\\', '\n', '\'': // these must be escaped, so ignore them default: - b = append(b, byte(i)) + var r [utf8.UTFMax]byte + n := utf8.EncodeRune(r[:], i) + b = append(b, r[:n]...) } } return string(b) }(), - wantOut: `"\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_` + "`abcdefghijklmnopqrstuvwxyz{|}~\\x7f\"", - wantOutASCII: `"\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_` + "`abcdefghijklmnopqrstuvwxyz{|}~\\x7f\"", + wantOut: `"\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_` + "`" + `abcdefghijklmnopqrstuvwxyz{|}~\x7f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f` + "\u00a0" + `"`, + wantOutASCII: `"\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_` + "`" + `abcdefghijklmnopqrstuvwxyz{|}~\x7f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0"`, }, { // Valid UTF-8 wire encoding of the RuneError rune.