Damien Neil 8c86fc5e7d all: remove non-fatal UTF-8 validation errors (and non-fatal in general)
Immediately abort (un)marshal operations when encountering invalid UTF-8
data in proto3 strings. No other proto implementation supports non-UTF-8
data in proto3 strings (and many reject it in proto2 strings as well).
Producing invalid output is an interoperability threat (other
implementations won't be able to read it).

The case where existing string data is found to contain non-UTF8 data is
better handled by changing the field to the `bytes` type, which (aside
from UTF-8 validation) is wire-compatible with `string`.

Remove the errors.NonFatal type, since there are no remaining cases
where it is needed. "Non-fatal" errors which produce results and a
non-nil error are problematic because they compose poorly; the better
approach is to take an option like AllowPartial indicating which
conditions to check for.

Change-Id: I9d189ec6ffda7b5d96d094aa1b290af2e3f23736
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/183098
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
2019-06-20 20:55:13 +00:00

468 lines
12 KiB
Go

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package json
import (
"bytes"
"fmt"
"io"
"regexp"
"strconv"
"unicode/utf8"
"google.golang.org/protobuf/internal/errors"
)
// call identifies which Decoder method was invoked most recently. Read checks
// it so that a Read following a Peek returns the value Peek already parsed.
type call uint8
const (
// readCall is the zero value, so a freshly constructed Decoder starts here.
readCall call = iota
// peekCall records that the most recent call was Peek.
peekCall
)
// Decoder is a token-based JSON decoder. Values are pulled one at a time with
// Read, and Peek reports the type of the next value without consuming it.
type Decoder struct {
// lastCall is last method called, either readCall or peekCall.
// Initial value is readCall.
lastCall call
// value contains the last read value. After a Peek it holds the peeked
// value, which the following Read returns.
value Value
// err contains the error from the Read performed by the last Peek; the
// following Read returns it together with value.
err error
// startStack is a stack containing StartObject and StartArray types. The
// top of stack represents the object or the array the current value is
// directly located in. It is empty at the top level.
startStack []Type
// orig is the complete input; it is used in reporting line and column and
// is shared with every Value produced by this Decoder.
orig []byte
// in contains the unconsumed input, a suffix of orig.
in []byte
}
// NewDecoder creates a Decoder that reads JSON values from b. The slice is
// retained (not copied) both as the unconsumed input and as the reference for
// line and column reporting.
func NewDecoder(b []byte) *Decoder {
	d := new(Decoder)
	d.orig = b
	d.in = b
	return d
}
// Peek reports the type of the next JSON value without consuming it from the
// caller's point of view. The value (and any error) is parsed once and cached;
// the next Read returns the cached result. Consecutive Peek calls return the
// same type without parsing again.
func (d *Decoder) Peek() Type {
	// Record the call on the way out, after Read has reset lastCall itself.
	defer func() { d.lastCall = peekCall }()

	switch d.lastCall {
	case readCall:
		next, err := d.Read()
		d.value, d.err = next, err
	}
	return d.value.typ
}
// Read returns the next JSON value and advances the decoder past it. If the
// previous call was Peek, the value and error cached by that Peek are returned
// instead of parsing again.
//
// Commas are validated and skipped; they are never returned to the caller. A
// string that is followed by ':' inside an object is returned with type Name.
//
// Read returns an error, together with the zero Value, if the input is not a
// valid JSON token, if the token is not allowed at this point in the
// sequence, or if the input ends while an object or array is still open or
// right after an object name (io.ErrUnexpectedEOF).
func (d *Decoder) Read() (Value, error) {
	defer func() { d.lastCall = readCall }()
	if d.lastCall == peekCall {
		// Peek already parsed this value; hand back its cached result.
		return d.value, d.err
	}

	value, err := d.parseNext()
	if err != nil {
		return Value{}, err
	}
	// n is the number of bytes to drop from d.in once value is accepted.
	n := value.size

	switch value.typ {
	case EOF:
		// EOF is an error only while an object or array is still open. Empty
		// input and repeated reads at the end of input yield an EOF value.
		if len(d.startStack) != 0 {
			return Value{}, io.ErrUnexpectedEOF
		}

	case Null:
		if !d.isValueNext() {
			return Value{}, d.newSyntaxError("unexpected value null")
		}

	case Bool, Number:
		if !d.isValueNext() {
			return Value{}, d.newSyntaxError("unexpected value %v", value.Raw())
		}

	case String:
		if d.isValueNext() {
			break
		}
		// Check if this is for an object name.
		if d.value.typ&(StartObject|comma) == 0 {
			return Value{}, d.newSyntaxError("unexpected value %v", value.Raw())
		}
		// Step over the string and any whitespace so that d.in starts at the
		// expected ':' (this also makes error positions point at that byte).
		d.in = d.in[n:]
		d.consume(0)
		if len(d.in) == 0 {
			// The input ended right after the name, e.g. `{"a"`.
			return Value{}, io.ErrUnexpectedEOF
		}
		if c := d.in[0]; c != ':' {
			return Value{}, d.newSyntaxError(`unexpected character %v, missing ":" after object name`, string(c))
		}
		// Only the ':' remains to be consumed below.
		n = 1
		value.typ = Name

	case StartObject, StartArray:
		if !d.isValueNext() {
			return Value{}, d.newSyntaxError("unexpected character %v", value.Raw())
		}
		d.startStack = append(d.startStack, value.typ)

	case EndObject:
		// Reject a stray '}', a trailing comma, and a '}' closing an array.
		if len(d.startStack) == 0 ||
			d.value.typ == comma ||
			d.startStack[len(d.startStack)-1] != StartObject {
			return Value{}, d.newSyntaxError("unexpected character }")
		}
		d.startStack = d.startStack[:len(d.startStack)-1]

	case EndArray:
		// Reject a stray ']', a trailing comma, and a ']' closing an object.
		if len(d.startStack) == 0 ||
			d.value.typ == comma ||
			d.startStack[len(d.startStack)-1] != StartArray {
			return Value{}, d.newSyntaxError("unexpected character ]")
		}
		d.startStack = d.startStack[:len(d.startStack)-1]

	case comma:
		// A comma is only valid inside an object or array, directly after a
		// completed value.
		if len(d.startStack) == 0 ||
			d.value.typ&(Null|Bool|Number|String|EndObject|EndArray) == 0 {
			return Value{}, d.newSyntaxError("unexpected character ,")
		}
	}

	// Update d.value only after validating value to be in the right sequence.
	d.value = value
	d.in = d.in[n:]

	if d.value.typ == comma {
		// Commas are not surfaced; return whatever follows.
		return d.Read()
	}
	return value, nil
}
// errRegexp picks out the text quoted in syntax error messages: anchored at the
// start of the remaining input, it matches a run of 1 to 32 non-delimiter
// characters (letters, digits, '-', '+', '.', '_') or, failing that, a single
// character matched by `.`.
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`)
// parseNext skips leading whitespace and parses the next JSON token from the
// unconsumed input without consuming it. It produces a Value of every type
// except Name, and an EOF Value when no input remains. It only recognizes
// tokens; whether the token is legal at this point in the sequence is decided
// by the caller.
func (d *Decoder) parseNext() (value Value, err error) {
	// Trim leading spaces.
	d.consume(0)

	buf := d.in
	if len(buf) == 0 {
		return d.newValue(EOF, nil, 0), nil
	}

	switch buf[0] {
	// Literals: a failed match falls out of the switch to the shared
	// "invalid value" error below.
	case 'n':
		if size := matchWithDelim("null", buf); size != 0 {
			return d.newValue(Null, buf, size), nil
		}
	case 't':
		if size := matchWithDelim("true", buf); size != 0 {
			return d.newBoolValue(buf, size, true), nil
		}
	case 'f':
		if size := matchWithDelim("false", buf); size != 0 {
			return d.newBoolValue(buf, size, false), nil
		}

	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		size, ok := consumeNumber(buf)
		if !ok {
			return Value{}, d.newSyntaxError("invalid number %s", errRegexp.Find(buf))
		}
		return d.newValue(Number, buf, size), nil

	case '"':
		str, size, perr := d.parseString(buf)
		if perr != nil {
			return Value{}, perr
		}
		return d.newStringValue(buf, size, str), nil

	// Single-byte structural tokens.
	case '{':
		return d.newValue(StartObject, buf, 1), nil
	case '}':
		return d.newValue(EndObject, buf, 1), nil
	case '[':
		return d.newValue(StartArray, buf, 1), nil
	case ']':
		return d.newValue(EndArray, buf, 1), nil
	case ',':
		return d.newValue(comma, buf, 1), nil
	}

	return Value{}, d.newSyntaxError("invalid value %s", errRegexp.Find(buf))
}
// position converts the byte offset idx within orig into a 1-based line and
// column. Lines are separated by '\n'. The column counts runes, not bytes,
// from the start of the line, so a character composed of several runes
// advances the column by more than one.
func position(orig []byte, idx int) (int, int) {
	prefix := orig[:idx]
	line := 1 + bytes.Count(prefix, []byte("\n"))

	// Keep only the part of the prefix that lies on the final line.
	lastLine := prefix
	if nl := bytes.LastIndexByte(prefix, '\n'); nl != -1 {
		lastLine = prefix[nl+1:]
	}
	return line, 1 + utf8.RuneCount(lastLine)
}
// newSyntaxError formats f and x into an error and prefixes it with the line
// and column of the decoder's current read position, i.e. the first byte of
// the unconsumed input.
func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
	offset := len(d.orig) - len(d.in)
	line, column := position(d.orig, offset)
	return errors.New("syntax error (line %d:%d): %v", line, column, errors.New(f, x...))
}
// matchWithDelim reports whether b begins with the literal s and that literal
// is terminated, either by the end of b or by a byte for which isNotDelim is
// false. It returns len(s) on a match and 0 otherwise.
func matchWithDelim(s string, b []byte) int {
	if bytes.HasPrefix(b, []byte(s)) {
		// A match at the very end of b counts: EOF acts as a delimiter.
		if end := len(s); end >= len(b) || !isNotDelim(b[end]) {
			return end
		}
	}
	return 0
}
// isNotDelim reports whether c can continue a literal or number token rather
// than terminate it: an ASCII letter or digit, or one of '-', '+', '.', '_'.
func isNotDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return true
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return true
	}
	return false
}
// consume drops the first n bytes of the unconsumed input, followed by any run
// of JSON whitespace (space, newline, carriage return, tab).
func (d *Decoder) consume(n int) {
	rest := d.in[n:]
	skip := 0
	for skip < len(rest) {
		c := rest[skip]
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		skip++
	}
	d.in = rest[skip:]
}
// isValueNext reports whether a JSON value (a scalar, or the start of an
// object or array) is allowed as the next token:
//   - at the top level, only if nothing has been read yet;
//   - inside an object, only directly after a Name;
//   - inside an array, only directly after the opening '[' or a comma.
func (d *Decoder) isValueNext() bool {
	depth := len(d.startStack)
	if depth == 0 {
		return d.value.typ == 0
	}

	switch top := d.startStack[depth-1]; top {
	case StartObject:
		return d.value.typ&Name != 0
	case StartArray:
		return d.value.typ&(StartArray|comma) != 0
	default:
		// Read only ever pushes StartObject or StartArray.
		panic(fmt.Sprintf(
			"unreachable logic in Decoder.isValueNext, lastType: %v, startStack: %v",
			d.value.typ, top))
	}
}
// newValue builds a Value of the given type that spans size bytes. input is
// the unconsumed input at the point where the value begins; its length is used
// to derive the value's offset within the decoder's original input.
func (d *Decoder) newValue(typ Type, input []byte, size int) Value {
	var v Value
	v.typ = typ
	v.input = d.orig
	v.start = len(d.orig) - len(input)
	v.size = size
	return v
}
// newBoolValue builds a Bool Value spanning size bytes at the start of input
// and records the already-decoded boolean b.
func (d *Decoder) newBoolValue(input []byte, size int, b bool) Value {
	v := d.newValue(Bool, input, size)
	v.boo = b
	return v
}
// newStringValue constructs a Value for a JSON string.
func (d *Decoder) newStringValue(input []byte, size int, s string) Value {
	// size covers the raw token in the input; s is its already-decoded content.
	v := d.newValue(String, input, size)
	v.str = s
	return v
}
// Clone returns an independent copy of the Decoder, so that a caller can read
// ahead on the copy without disturbing the original. The start stack is
// duplicated; the input slices are shared with the original.
func (d *Decoder) Clone() *Decoder {
	dup := new(Decoder)
	*dup = *d
	// Give the copy its own stack so pushes and pops do not alias d's.
	dup.startStack = append([]Type(nil), d.startStack...)
	return dup
}
// Value provides a parsed JSON type and value.
//
// The original input slice is stored in this struct in order to compute for
// position as needed. The raw JSON value is derived from the original input
// slice given start and size.
//
// For JSON boolean and string, it holds the converted value in boo and str
// fields respectively. For JSON number, the raw JSON value holds a valid number
// which is converted only in Int or Float. Other JSON types do not require any
// additional data.
type Value struct {
// typ is the JSON type of this value.
typ Type
// input is the decoder's complete original input, not just this value.
input []byte
// start is the byte offset of this value within input.
start int
// size is the length in bytes of the raw value starting at start.
size int
// boo holds the decoded boolean when typ is Bool.
boo bool
// str holds the decoded string when typ is String; it is kept when Read
// turns a String into a Name.
str string
}
// newError formats f and x into an error and prefixes it with the line and
// column at which this value starts.
func (v Value) newError(f string, x ...interface{}) error {
	cause := errors.New(f, x...)
	line, col := position(v.input, v.start)
	return errors.New("error (line %d:%d): %v", line, col, cause)
}
// Type returns the JSON type of the value, as assigned by the Decoder that
// produced it.
func (v Value) Type() Type {
return v.typ
}
// Position returns the 1-based line and column, within the original input, at
// which the value starts.
func (v Value) Position() (int, int) {
	line, column := position(v.input, v.start)
	return line, column
}
// Bool returns the decoded boolean of a Bool value. For any other type it
// returns false and an error that quotes the raw value.
func (v Value) Bool() (bool, error) {
	if v.typ == Bool {
		return v.boo, nil
	}
	return false, v.newError("%s is not a bool", v.Raw())
}
// String returns the decoded contents of a String value. For every other type
// it returns the raw text of the value as it appears in the input.
func (v Value) String() string {
	if v.typ == String {
		return v.str
	}
	return v.Raw()
}
// Name returns the decoded object name of a Name value. For any other type it
// returns the empty string and an error that quotes the raw value.
func (v Value) Name() (string, error) {
	if v.typ == Name {
		return v.str, nil
	}
	return "", v.newError("%s is not an object name", v.Raw())
}
// Raw returns, as a string, the bytes of the original input that make up this
// value: size bytes beginning at offset start.
func (v Value) Raw() string {
	end := v.start + v.size
	return string(v.input[v.start:end])
}
// Float parses a Number value as a floating-point number. It returns an error
// if the value is not a Number or if strconv.ParseFloat rejects the raw text,
// which includes values out of range for the requested size.
//
// bitSize selects the precision: 32 for float32 or 64 for float64. With
// bitSize=32 the result is still typed float64 but is convertible to float32
// without changing its value.
func (v Value) Float(bitSize int) (float64, error) {
	if v.typ != Number {
		return 0, v.newError("%s is not a number", v.Raw())
	}

	parsed, perr := strconv.ParseFloat(v.Raw(), bitSize)
	if perr == nil {
		return parsed, nil
	}
	return 0, v.newError("%v", perr)
}
// Int parses a Number value as a signed integer.
//
// bitSize is the size of the integer type the result must fit into. An error
// is returned if the value is not a Number, if it cannot be expressed as an
// integer (as decided by getIntStr), or if it does not fit in bitSize bits.
func (v Value) Int(bitSize int) (int64, error) {
	digits, err := v.getIntStr()
	if err != nil {
		return 0, err
	}

	parsed, perr := strconv.ParseInt(digits, 10, bitSize)
	if perr == nil {
		return parsed, nil
	}
	return 0, v.newError("%v", perr)
}
// Uint parses a Number value as an unsigned integer.
//
// bitSize is the size of the unsigned integer type the result must fit into.
// An error is returned if the value is not a Number, if it cannot be expressed
// as an unsigned integer, or if it does not fit in bitSize bits.
func (v Value) Uint(bitSize int) (uint64, error) {
	digits, err := v.getIntStr()
	if err != nil {
		return 0, err
	}

	parsed, perr := strconv.ParseUint(digits, 10, bitSize)
	if perr == nil {
		return parsed, nil
	}
	return 0, v.newError("%v", perr)
}
// getIntStr returns the value as a plain integer string suitable for
// strconv.ParseInt or strconv.ParseUint. It returns an error if the value is
// not a Number, if its raw text does not parse as a number, or if the number
// cannot be normalized to an integer.
//
// Error messages quote only this value's raw text (v.Raw()), not v.input,
// which is the decoder's entire input.
func (v Value) getIntStr() (string, error) {
	if v.typ != Number {
		return "", v.newError("%s is not a number", v.Raw())
	}
	parts, ok := parseNumber(v.input[v.start : v.start+v.size])
	if !ok {
		return "", v.newError("%s is not a number", v.Raw())
	}
	num, ok := normalizeToIntString(parts)
	if !ok {
		return "", v.newError("cannot convert %s to integer", v.Raw())
	}
	return num, nil
}