mirror of
https://github.com/protocolbuffers/protobuf-go.git
synced 2025-02-24 00:39:55 +00:00
Improve performance by replacing use of regular expressions with direct parsing code. Compared to latest version: name old time/op new time/op delta Text/Unmarshal/google_message1_proto2-4 21.8µs ± 5% 14.0µs ± 9% -35.69% (p=0.000 n=10+9) Text/Unmarshal/google_message1_proto3-4 19.6µs ± 4% 13.8µs ±10% -29.47% (p=0.000 n=10+10) Text/Unmarshal/google_message2-4 13.4ms ± 4% 4.9ms ± 4% -63.44% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 13.8µs ± 2% 14.1µs ± 4% +2.42% (p=0.011 n=9+10) Text/Marshal/google_message1_proto3-4 11.6µs ± 2% 11.8µs ± 8% ~ (p=0.573 n=8+10) Text/Marshal/google_message2-4 8.01ms ±48% 5.97ms ± 5% -25.44% (p=0.000 n=10+10) name old alloc/op new alloc/op delta Text/Unmarshal/google_message1_proto2-4 13.0kB ± 0% 12.6kB ± 0% -3.40% (p=0.000 n=10+10) Text/Unmarshal/google_message1_proto3-4 13.0kB ± 0% 12.5kB ± 0% -3.50% (p=0.000 n=10+10) Text/Unmarshal/google_message2-4 5.67MB ± 0% 5.50MB ± 0% -3.13% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 12.0kB ± 0% 12.1kB ± 0% +0.02% (p=0.000 n=10+10) Text/Marshal/google_message1_proto3-4 11.7kB ± 0% 11.7kB ± 0% +0.01% (p=0.000 n=10+10) Text/Marshal/google_message2-4 5.68MB ± 0% 5.68MB ± 0% +0.01% (p=0.000 n=10+10) name old allocs/op new allocs/op delta Text/Unmarshal/google_message1_proto2-4 142 ± 0% 142 ± 0% ~ (all equal) Text/Unmarshal/google_message1_proto3-4 156 ± 0% 156 ± 0% ~ (all equal) Text/Unmarshal/google_message2-4 70.1k ± 0% 65.4k ± 0% -6.76% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 91.0 ± 0% 91.0 ± 0% ~ (all equal) Text/Marshal/google_message1_proto3-4 80.0 ± 0% 80.0 ± 0% ~ (all equal) Text/Marshal/google_message2-4 36.4k ± 0% 36.4k ± 0% ~ (all equal) Change-Id: Ia5d3c16e9e33961aae03bac0d53fcfc5b1943d2a Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/173360 Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
315 lines
7.6 KiB
Go
315 lines
7.6 KiB
Go
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package text
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"regexp"
|
|
"strconv"
|
|
"unicode/utf8"
|
|
|
|
"google.golang.org/protobuf/internal/errors"
|
|
"google.golang.org/protobuf/reflect/protoreflect"
|
|
)
|
|
|
|
// syntaxError wraps an error to mark it as a text-format syntax error.
// Unmarshal type-asserts for this wrapper so that it can prefix the message
// with the line and column at which parsing stopped.
type syntaxError struct{ error }
|
|
|
|
func newSyntaxError(f string, x ...interface{}) error {
|
|
return syntaxError{errors.New(f, x...)}
|
|
}
|
|
|
|
// Unmarshal parses b as the proto text format.
|
|
// It returns a Value, which is always of the Message type.
|
|
func Unmarshal(b []byte) (Value, error) {
|
|
p := decoder{in: b}
|
|
p.consume(0) // trim leading spaces or comments
|
|
v, err := p.unmarshalMessage(false)
|
|
if err != nil {
|
|
if e, ok := err.(syntaxError); ok {
|
|
b = b[:len(b)-len(p.in)] // consumed input
|
|
line := bytes.Count(b, []byte("\n")) + 1
|
|
if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
|
|
b = b[i+1:]
|
|
}
|
|
column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
|
|
err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
|
|
}
|
|
return Value{}, err
|
|
}
|
|
if len(p.in) > 0 {
|
|
return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
|
|
}
|
|
return v, nil
|
|
}
|
|
|
|
// decoder holds the state of a text-format parse.
type decoder struct {
	// in is the input that has not been consumed yet. The parsing methods
	// shrink it from the front as they make progress.
	in []byte
}
|
|
|
|
func (p *decoder) unmarshalList() (Value, error) {
|
|
b := p.in
|
|
var elems []Value
|
|
if err := p.consumeChar('[', "at start of list"); err != nil {
|
|
return Value{}, err
|
|
}
|
|
if len(p.in) > 0 && p.in[0] != ']' {
|
|
for len(p.in) > 0 {
|
|
v, err := p.unmarshalValue()
|
|
if err != nil {
|
|
return Value{}, err
|
|
}
|
|
elems = append(elems, v)
|
|
if !p.tryConsumeChar(',') {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if err := p.consumeChar(']', "at end of list"); err != nil {
|
|
return Value{}, err
|
|
}
|
|
b = b[:len(b)-len(p.in)]
|
|
return rawValueOf(elems, b[:len(b):len(b)]), nil
|
|
}
|
|
|
|
func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
|
|
b := p.in
|
|
var items [][2]Value
|
|
delims := [2]byte{'{', '}'}
|
|
if len(p.in) > 0 && p.in[0] == '<' {
|
|
delims = [2]byte{'<', '>'}
|
|
}
|
|
if checkDelims {
|
|
if err := p.consumeChar(delims[0], "at start of message"); err != nil {
|
|
return Value{}, err
|
|
}
|
|
}
|
|
for len(p.in) > 0 {
|
|
if p.in[0] == '}' || p.in[0] == '>' {
|
|
break
|
|
}
|
|
k, err := p.unmarshalKey()
|
|
if err != nil {
|
|
return Value{}, err
|
|
}
|
|
if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
|
|
return Value{}, newSyntaxError("expected ':' after message key")
|
|
}
|
|
v, err := p.unmarshalValue()
|
|
if err != nil {
|
|
return Value{}, err
|
|
}
|
|
if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
|
|
// always optional
|
|
}
|
|
items = append(items, [2]Value{k, v})
|
|
}
|
|
if checkDelims {
|
|
if err := p.consumeChar(delims[1], "at end of message"); err != nil {
|
|
return Value{}, err
|
|
}
|
|
}
|
|
b = b[:len(b)-len(p.in)]
|
|
return rawValueOf(items, b[:len(b):len(b)]), nil
|
|
}
|
|
|
|
// unmarshalKey parses the key, which may be a Name, String, or Uint.
|
|
func (p *decoder) unmarshalKey() (v Value, err error) {
|
|
if p.tryConsumeChar('[') {
|
|
if len(p.in) == 0 {
|
|
return Value{}, io.ErrUnexpectedEOF
|
|
}
|
|
if p.in[0] == '\'' || p.in[0] == '"' {
|
|
// Historically, Go's parser allowed a string for the Any type URL.
|
|
// This is specific to Go and contrary to the C++ implementation,
|
|
// which does not support strings for the Any type URL.
|
|
v, err = p.unmarshalString()
|
|
if err != nil {
|
|
return Value{}, err
|
|
}
|
|
} else {
|
|
v, err = p.unmarshalURL()
|
|
if err != nil {
|
|
return Value{}, err
|
|
}
|
|
}
|
|
if err := p.consumeChar(']', "at end of extension name"); err != nil {
|
|
return Value{}, err
|
|
}
|
|
return v, nil
|
|
}
|
|
v, err = p.unmarshalName()
|
|
if err == nil {
|
|
return v, nil
|
|
}
|
|
v, err = p.unmarshalNumberKey()
|
|
if err == nil {
|
|
return v, nil
|
|
}
|
|
return Value{}, err
|
|
}
|
|
|
|
// unmarshalURL parses an Any type URL string. The C++ parser does not handle
|
|
// many legal URL strings. This implementation is more liberal and allows for
|
|
// the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
|
|
func (p *decoder) unmarshalURL() (Value, error) {
|
|
s := p.in
|
|
var size int
|
|
for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
|
|
('0' <= s[0] && s[0] <= '9') ||
|
|
('a' <= s[0] && s[0] <= 'z') ||
|
|
('A' <= s[0] && s[0] <= 'Z')) {
|
|
s = s[1:]
|
|
size++
|
|
if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
|
|
s = s[1:]
|
|
size++
|
|
}
|
|
}
|
|
|
|
// Last character cannot be '.' or '/'.
|
|
// Next byte should either be a delimiter or it is at the end.
|
|
if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
|
|
(len(s) > 0 && !isDelim(s[0])) {
|
|
return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
|
|
}
|
|
v := rawValueOf(string(p.in[:size]), p.in[:size:size])
|
|
p.consume(size)
|
|
return v, nil
|
|
}
|
|
|
|
// unmarshalNumberKey parses field number as key. Field numbers are non-negative
|
|
// integers.
|
|
func (p *decoder) unmarshalNumberKey() (Value, error) {
|
|
num, ok := parseNumber(p.in)
|
|
if !ok || num.neg || num.typ == numFloat {
|
|
return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
|
|
}
|
|
v, err := strconv.ParseUint(string(num.value), 0, 64)
|
|
if err != nil {
|
|
return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
|
|
}
|
|
p.consume(num.size)
|
|
return rawValueOf(v, num.value), nil
|
|
}
|
|
|
|
func (p *decoder) unmarshalValue() (Value, error) {
|
|
if len(p.in) == 0 {
|
|
return Value{}, io.ErrUnexpectedEOF
|
|
}
|
|
switch p.in[0] {
|
|
case '"', '\'':
|
|
return p.unmarshalStrings()
|
|
case '[':
|
|
return p.unmarshalList()
|
|
case '{', '<':
|
|
return p.unmarshalMessage(true)
|
|
default:
|
|
n, ok := consumeName(p.in)
|
|
if ok && literals[string(p.in[:n])] == nil {
|
|
v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
|
|
p.consume(n)
|
|
return v, nil
|
|
}
|
|
return p.unmarshalNumber()
|
|
}
|
|
}
|
|
|
|
// unmarshalName unmarshals an unquoted proto identifier.
|
|
// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
|
|
//
|
|
// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
|
|
func (p *decoder) unmarshalName() (Value, error) {
|
|
n, ok := consumeName(p.in)
|
|
if !ok {
|
|
return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
|
|
}
|
|
|
|
v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
|
|
p.consume(n)
|
|
return v, nil
|
|
}
|
|
|
|
func consumeName(input []byte) (int, bool) {
|
|
var n int
|
|
|
|
s := input
|
|
if len(s) == 0 {
|
|
return 0, false
|
|
}
|
|
|
|
switch {
|
|
case s[0] == '_',
|
|
'a' <= s[0] && s[0] <= 'z',
|
|
'A' <= s[0] && s[0] <= 'Z':
|
|
s = s[1:]
|
|
n++
|
|
default:
|
|
return 0, false
|
|
}
|
|
|
|
for len(s) > 0 && (s[0] == '_' ||
|
|
'a' <= s[0] && s[0] <= 'z' ||
|
|
'A' <= s[0] && s[0] <= 'Z' ||
|
|
'0' <= s[0] && s[0] <= '9') {
|
|
s = s[1:]
|
|
n++
|
|
}
|
|
|
|
if len(s) > 0 && !isDelim(s[0]) {
|
|
return 0, false
|
|
}
|
|
|
|
return n, true
|
|
}
|
|
|
|
func (p *decoder) consumeChar(c byte, msg string) error {
|
|
if p.tryConsumeChar(c) {
|
|
return nil
|
|
}
|
|
if len(p.in) == 0 {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
|
|
}
|
|
|
|
func (p *decoder) tryConsumeChar(c byte) bool {
|
|
if len(p.in) > 0 && p.in[0] == c {
|
|
p.consume(1)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// consume consumes n bytes of input and any subsequent whitespace or comments.
|
|
func (p *decoder) consume(n int) {
|
|
p.in = p.in[n:]
|
|
for len(p.in) > 0 {
|
|
switch p.in[0] {
|
|
case ' ', '\n', '\r', '\t':
|
|
p.in = p.in[1:]
|
|
case '#':
|
|
if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
|
|
p.in = p.in[i+len("\n"):]
|
|
} else {
|
|
p.in = nil
|
|
}
|
|
default:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// errRegexp extracts the offending token for error messages. It matches the
// longest leading run of non-delimiter characters (plus '/'), or failing that
// any single character other than a newline.
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
|
|
|
|
// isDelim reports whether c is a delimiter character, that is, any byte
// other than '-', '+', '.', '_', an ASCII letter, or an ASCII digit.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return false
	}
	return true
}
|