Herbie Ong 1e09691415 internal/encoding/{json,text}: improve string parsing
Previously, calls to indexNeedEscape that converted from []byte
to string incurred an allocation.

Make 2 different calls instead, one for string and one for bytes.

Converting a string to []byte does not incur an extra allocation;
however, the benchmark results still show it to be slower by ~3% for
textpb and 6+% for jsonpb, hence the decision to go with 2 separate calls
instead.

Results over current head:
name          old time/op    new time/op    delta
TextEncode-4    18.1ms ± 2%    18.3ms ± 2%     ~     (p=0.065 n=10+9)
TextDecode-4     233ms ± 3%     102ms ± 1%  -56.34%  (p=0.000 n=9+10)
JSONEncode-4    10.4ms ± 2%    10.5ms ± 0%   +0.56%  (p=0.019 n=9+9)
JSONDecode-4     870ms ± 2%     354ms ± 4%  -59.33%  (p=0.000 n=9+10)

name          old alloc/op   new alloc/op   delta
TextEncode-4    28.9MB ± 0%    28.9MB ± 0%   +0.00%  (p=0.000 n=10+9)
TextDecode-4    1.16GB ± 0%    0.03GB ± 0%  -97.44%  (p=0.000 n=9+10)
JSONEncode-4    3.94MB ± 0%    3.94MB ± 0%   +0.00%  (p=0.000 n=10+10)
JSONDecode-4    3.35GB ± 0%    0.01GB ± 0%  -99.83%  (p=0.000 n=10+10)

name          old allocs/op  new allocs/op  delta
TextEncode-4     73.5k ± 0%     73.5k ± 0%     ~     (all equal)
TextDecode-4      278k ± 0%      255k ± 0%   -8.26%  (p=0.000 n=9+10)
JSONEncode-4     63.8k ± 0%     63.8k ± 0%     ~     (all equal)
JSONDecode-4      247k ± 0%      210k ± 0%  -14.92%  (p=0.000 n=10+10)

Change-Id: Ibc64e9a7827ec1fffa213eb79f60497950203700
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/172239
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
2019-04-17 00:26:13 +00:00

158 lines
4.2 KiB
Go

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package json
import (
"io"
"math/bits"
"strconv"
"unicode"
"unicode/utf16"
"unicode/utf8"
"github.com/golang/protobuf/v2/internal/errors"
)
// appendString appends in to out as a double-quoted JSON string and returns
// the extended slice.
//
// The characters '"' and '\\' and every rune below U+0020 are written as
// escape sequences; all other runes are copied unchanged. A byte that does not
// decode as UTF-8 is copied through as-is and noted via AppendInvalidUTF8 on
// the errors.NonFatal whose E value is returned as the error.
func appendString(out []byte, in string) ([]byte, error) {
	var nerr errors.NonFatal
	out = append(out, '"')

	// Bulk-copy the leading run that can be emitted verbatim.
	safe := indexNeedEscapeInString(in)
	out = append(out, in[:safe]...)
	in = in[safe:]

	for len(in) > 0 {
		r, size := utf8.DecodeRuneInString(in)

		if r == utf8.RuneError && size == 1 {
			nerr.AppendInvalidUTF8("")
			out = append(out, in[0]) // preserve invalid byte
			in = in[1:]
			continue
		}

		if r >= ' ' && r != '"' && r != '\\' {
			// Nothing to escape here: copy this rune together with the
			// verbatim run that follows it.
			end := size + indexNeedEscapeInString(in[size:])
			out = append(out, in[:end]...)
			in = in[end:]
			continue
		}

		// r is a quote, a backslash, or a control character below U+0020.
		switch r {
		case '"', '\\':
			out = append(out, '\\', byte(r))
		case '\b':
			out = append(out, '\\', 'b')
		case '\f':
			out = append(out, '\\', 'f')
		case '\n':
			out = append(out, '\\', 'n')
		case '\r':
			out = append(out, '\\', 'r')
		case '\t':
			out = append(out, '\\', 't')
		default:
			// Remaining control characters become \u00XX. The slice of
			// "0000" supplies the leading zeros so that, together with the
			// hex digits appended next, exactly four digits follow the 'u'.
			out = append(out, '\\', 'u')
			out = append(out, "0000"[1+(bits.Len32(uint32(r))-1)/4:]...)
			out = strconv.AppendUint(out, uint64(r), 16)
		}
		in = in[size:]
	}

	return append(out, '"'), nerr.E
}
// parseString parses a double-quoted JSON string located at the start of in.
//
// On success it returns the decoded string and the number of bytes consumed
// from in, counting both the opening and the closing quotation mark. Escape
// sequences (including \uXXXX and UTF-16 surrogate pairs) are decoded. Bytes
// that do not decode as UTF-8 are kept as-is and noted via AppendInvalidUTF8
// on the errors.NonFatal whose E value is returned alongside the result.
//
// io.ErrUnexpectedEOF is returned when in ends before the string is complete;
// a syntax error is returned for a missing opening quote, an unescaped
// control character below U+0020, or a malformed escape sequence.
func (d *Decoder) parseString(in []byte) (string, int, error) {
	var nerr errors.NonFatal

	if len(in) == 0 {
		return "", 0, io.ErrUnexpectedEOF
	}
	if in[0] != '"' {
		return "", 0, d.newSyntaxError("invalid character %q at start of string", in[0])
	}

	// rest is the unread tail of the input; in itself is left untouched so
	// the consumed length can be computed at the end.
	rest := in[1:]
	safe := indexNeedEscapeInBytes(rest)
	out := rest[:safe:safe] // set cap to prevent mutations
	rest = rest[safe:]

	for len(rest) > 0 {
		r, size := utf8.DecodeRune(rest)
		switch {
		case r == utf8.RuneError && size == 1:
			nerr.AppendInvalidUTF8("")
			out = append(out, rest[0]) // preserve invalid byte
			rest = rest[1:]
			continue
		case r < ' ':
			return "", 0, d.newSyntaxError("invalid character %q in string", r)
		case r == '"':
			// Closing quote: report everything up to and including it.
			rest = rest[1:]
			return string(out), len(in) - len(rest), nerr.E
		case r != '\\':
			// Ordinary rune: copy it together with the verbatim run that
			// follows it.
			end := size + indexNeedEscapeInBytes(rest[size:])
			out = append(out, rest[:end]...)
			rest = rest[end:]
			continue
		}

		// From here on rest begins with a backslash.
		if len(rest) < 2 {
			return "", 0, io.ErrUnexpectedEOF
		}

		if esc := rest[1]; esc != 'u' {
			// Two-byte escape that maps to a single output byte.
			var decoded byte
			switch esc {
			case '"', '\\', '/':
				decoded = esc
			case 'b':
				decoded = '\b'
			case 'f':
				decoded = '\f'
			case 'n':
				decoded = '\n'
			case 'r':
				decoded = '\r'
			case 't':
				decoded = '\t'
			default:
				return "", 0, d.newSyntaxError("invalid escape code %q in string", rest[:2])
			}
			out = append(out, decoded)
			rest = rest[2:]
			continue
		}

		// \uXXXX escape: four hex digits giving a UTF-16 code unit.
		if len(rest) < 6 {
			return "", 0, io.ErrUnexpectedEOF
		}
		first, err := strconv.ParseUint(string(rest[2:6]), 16, 16)
		if err != nil {
			return "", 0, d.newSyntaxError("invalid escape code %q in string", rest[:6])
		}
		rest = rest[6:]

		cp := rune(first)
		if utf16.IsSurrogate(cp) {
			// A surrogate must be followed immediately by a second \uXXXX
			// escape that combines with it into one code point.
			if len(rest) < 6 {
				return "", 0, io.ErrUnexpectedEOF
			}
			second, err := strconv.ParseUint(string(rest[2:6]), 16, 16)
			cp = utf16.DecodeRune(cp, rune(second))
			wellFormed := rest[0] == '\\' && rest[1] == 'u' &&
				err == nil && cp != unicode.ReplacementChar
			if !wellFormed {
				return "", 0, d.newSyntaxError("invalid escape code %q in string", rest[:6])
			}
			rest = rest[6:]
		}
		out = append(out, string(cp)...)
	}

	// Input ran out before a closing quote was seen.
	return "", 0, io.ErrUnexpectedEOF
}
// indexNeedEscapeInString reports the byte offset of the first rune in s that
// cannot be copied verbatim: a control character below U+0020, a backslash, a
// double quote, or utf8.RuneError (which covers both invalid UTF-8 bytes and a
// literal U+FFFD). If there is no such rune, it returns len(s).
func indexNeedEscapeInString(s string) int {
	for pos, ch := range s {
		switch {
		case ch < ' ', ch == '"', ch == '\\', ch == utf8.RuneError:
			return pos
		}
	}
	return len(s)
}
// indexNeedEscapeInBytes reports the byte offset of the first rune in b that
// cannot be copied verbatim: a control character below U+0020, a backslash, a
// double quote, or utf8.RuneError (which covers both invalid UTF-8 bytes and a
// literal U+FFFD). If there is no such rune, it returns len(b).
// TODO: Remove this duplicate function when https://golang.org/issue/31506 gets
// resolved.
func indexNeedEscapeInBytes(b []byte) int {
	pos := 0
	for pos < len(b) {
		ch, width := utf8.DecodeRune(b[pos:])
		switch ch {
		case '"', '\\', utf8.RuneError:
			return pos
		}
		if ch < ' ' {
			return pos
		}
		pos += width
	}
	return len(b)
}