Indicate float conversion due to overflows (#31)

Add a flag to the tape that indicates that an integer was converted to float due to int64/uint64 limits. Fixes #25
minio · Jan 25, 2021 · c66cb85 · c66cb85
1 parent 3d975b7
commit c66cb85
Show file tree

Hide file tree

Showing 10 changed files with 204 additions and 79 deletions.
diff --git a/README.md b/README.md
@@ -133,6 +133,28 @@ method to get an iterator.
 There are methods that allow you to retrieve all elements as a single type, 
 []int64, []uint64, float64 and strings.  
 
+## Number parsing
+
+Numbers in JSON are untyped and are returned by the following rules in order:
+
+* If there is any floating point notation, such as an exponent or a decimal point, it is always returned as float.
+* If the number is a pure integer and fits within an int64, it is returned as such.
+* If the number is a pure positive integer and fits within a uint64, it is returned as such.
+* Otherwise, if the number is a valid number, it is returned as float64.
+
+If the number was converted from integer notation to a float because it did not fit inside int64/uint64,
+the `FloatOverflowedInteger` flag is set, which can be retrieved using the `(Iter).FloatFlags()` method.  
+
+JSON numbers follow JavaScript’s double-precision floating-point format.
+
+* Represented in base 10 with no superfluous leading zeros (e.g. 67, 1, 100).
+* Include digits between 0 and 9.
+* Can be a negative number (e.g. -10).
+* Can be a fraction (e.g. 0.5); a digit is required before the decimal point.
+* Can also have an exponent of 10, prefixed by e or E, with an optional plus or minus sign to indicate positive or negative exponentiation.
+* Octal and hexadecimal formats are not supported.
+* Can not have a value of NaN (Not A Number) or Infinity.
+
 ## Parsing NDJSON stream
 
 Newline delimited json is sent as packets with each line being a root element.

diff --git a/parse_json_amd64.go b/parse_json_amd64.go
@@ -42,10 +42,10 @@ func (pj *internalParsedJson) initialize(size int) {
 		pj.Strings = make([]byte, 0, stringsSize)
 	}
 	pj.Strings = pj.Strings[:0]
-	if cap(pj.containing_scope_offset) < maxdepth {
-		pj.containing_scope_offset = make([]uint64, 0, maxdepth)
+	if cap(pj.containingScopeOffset) < maxdepth {
+		pj.containingScopeOffset = make([]uint64, 0, maxdepth)
 	}
-	pj.containing_scope_offset = pj.containing_scope_offset[:0]
+	pj.containingScopeOffset = pj.containingScopeOffset[:0]
 }
 
 func (pj *internalParsedJson) parseMessage(msg []byte) error {
@@ -75,8 +75,8 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
 	// Make the capacity of the channel smaller than the number of slots.
 	// This way the sender will automatically block until the consumer
 	// has finished the slot it is working on.
-	pj.index_chan = make(chan indexChan, indexSlots-2)
-	pj.buffers_offset = ^uint64(0)
+	pj.indexChans = make(chan indexChan, indexSlots-2)
+	pj.buffersOffset = ^uint64(0)
 
 	var errStage1 error
 	go func() {
@@ -89,7 +89,7 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
 		if !unifiedMachine(pj.Message, pj) {
 			err = errors.New("Bad parsing while executing stage 2")
 			// drain the channel until empty
-			for range pj.index_chan {
+			for range pj.indexChans {
 			}
 		}
 		wg.Done()

diff --git a/parse_json_amd64_test.go b/parse_json_amd64_test.go
@@ -96,7 +96,7 @@ func BenchmarkNdjsonStage1(b *testing.B) {
 
 	for i := 0; i < b.N; i++ {
 		// Create new channel (large enough so we won't block)
-		pj.index_chan = make(chan indexChan, 128*10240)
+		pj.indexChans = make(chan indexChan, 128*10240)
 		findStructuralIndices([]byte(ndjson), &pj)
 	}
 }
@@ -210,24 +210,30 @@ func TestParseNumber(t *testing.T) {
 		expectedD float64
 		expectedI int64
 		expectedU uint64
+		flags     FloatFlags
 	}{
-		{"1", TagInteger, 0.0, 1, 0},
-		{"-1", TagInteger, 0.0, -1, 0},
-		{"10000000000000000000", TagUint, 0.0, 0, 10000000000000000000},
-		{"10000000000000000001", TagUint, 0.0, 0, 10000000000000000001},
-		{"-10000000000000000000", TagFloat, -10000000000000000000, 0, 0},
-		{"1.0", TagFloat, 1.0, 0, 0},
-		{"1234567890", TagInteger, 0.0, 1234567890, 0},
-		{"9876.543210", TagFloat, 9876.543210, 0, 0},
-		{"0.123456789e-12", TagFloat, 1.23456789e-13, 0, 0},
-		{"1.234567890E+34", TagFloat, 1.234567890e+34, 0, 0},
-		{"23456789012E66", TagFloat, 23456789012e66, 0, 0},
-		{"-9876.543210", TagFloat, -9876.543210, 0, 0},
-		{"-65.619720000000029", TagFloat, -65.61972000000003, 0, 0},
+		{input: "1", wantTag: TagInteger, expectedI: 1},
+		{input: "-1", wantTag: TagInteger, expectedI: -1},
+		{input: "10000000000000000000", wantTag: TagUint, expectedU: 10000000000000000000},
+		{input: "10000000000000000001", wantTag: TagUint, expectedU: 10000000000000000001},
+		// math.MinInt64 - 1
+		{input: "-9223372036854775809", wantTag: TagFloat, expectedD: -9.223372036854776e+18, flags: FloatOverflowedInteger.Flags()},
+		{input: "-10000000000000000000", wantTag: TagFloat, expectedD: -10000000000000000000, flags: FloatOverflowedInteger.Flags()},
+		{input: "100000000000000000000", wantTag: TagFloat, expectedD: 100000000000000000000, flags: FloatOverflowedInteger.Flags()},
+		// math.MaxUint64 +1
+		{input: "18446744073709551616", wantTag: TagFloat, expectedD: 1.8446744073709552e+19, flags: FloatOverflowedInteger.Flags()},
+		{input: "1.0", wantTag: TagFloat, expectedD: 1.0},
+		{input: "1234567890", wantTag: TagInteger, expectedI: 1234567890},
+		{input: "9876.543210", wantTag: TagFloat, expectedD: 9876.543210},
+		{input: "0.123456789e-12", wantTag: TagFloat, expectedD: 1.23456789e-13},
+		{input: "1.234567890E+34", wantTag: TagFloat, expectedD: 1.234567890e+34},
+		{input: "23456789012E66", wantTag: TagFloat, expectedD: 23456789012e66},
+		{input: "-9876.543210", wantTag: TagFloat, expectedD: -9876.543210},
+		{input: "-65.619720000000029", wantTag: TagFloat, expectedD: -65.61972000000003},
 	}
 
 	for _, tc := range testCases {
-		tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
+		tag, val, flags := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
 		if tag != tc.wantTag {
 			t.Errorf("TestParseNumber: got: %v want: %v", tag, tc.wantTag)
 		}
@@ -246,6 +252,9 @@ func TestParseNumber(t *testing.T) {
 				t.Errorf("TestParseNumber: got: %d want: %d", val, tc.expectedU)
 			}
 		}
+		if flags != uint64(tc.flags) {
+			t.Errorf("TestParseNumber flags; got: %d want: %d", flags, tc.flags)
+		}
 	}
 }
 
@@ -295,7 +304,7 @@ func TestParseInt64(t *testing.T) {
 		test := &parseInt64Tests[i]
 		t.Run(test.in, func(t *testing.T) {
 
-			tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
+			tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
 			if tag != test.tag {
 				// Ignore intentionally bad syntactical errors
 				t.Errorf("TestParseInt64: got: %v want: %v", tag, test.tag)
@@ -478,7 +487,7 @@ func TestParseFloat64(t *testing.T) {
 	for i := 0; i < len(atoftests); i++ {
 		test := &atoftests[i]
 		t.Run(test.in, func(t *testing.T) {
-			tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
+			tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
 			switch tag {
 			case TagEnd:
 				if test.err == nil {

diff --git a/parse_number_amd64.go b/parse_number_amd64.go
@@ -21,6 +21,7 @@
 package simdjson
 
 import (
+	"errors"
 	"math"
 	"strconv"
 )
@@ -63,29 +64,29 @@ var isNumberRune = [256]uint8{
 // parseNumber will parse the number starting in the buffer.
 // Any non-number characters at the end will be ignored.
 // Returns TagEnd if no valid value could be found.
-func parseNumber(buf []byte) (tag Tag, val uint64) {
+func parseNumber(buf []byte) (tag Tag, val, flags uint64) {
 	pos := 0
 	found := uint8(0)
 	for i, v := range buf {
 		t := isNumberRune[v]
 		if t == 0 {
 			//fmt.Println("aborting on", string(v), "in", string(buf[:i]))
-			return TagEnd, 0
+			return TagEnd, 0, 0
 		}
 		if t == isEOVFlag {
 			break
 		}
 		if t&isMustHaveDigitNext > 0 {
 			// A period and minus must be followed by a digit
 			if len(buf) < i+2 || isNumberRune[buf[i+1]]&isDigitFlag == 0 {
-				return TagEnd, 0
+				return TagEnd, 0, 0
 			}
 		}
 		found |= t
 		pos = i + 1
 	}
 	if pos == 0 {
-		return TagEnd, 0
+		return TagEnd, 0, 0
 	}
 	const maxIntLen = 20
 
@@ -94,33 +95,42 @@ func parseNumber(buf []byte) (tag Tag, val uint64) {
 		if found&isMinusFlag == 0 {
 			if pos > 1 && buf[0] == '0' {
 				// Integers cannot have a leading zero.
-				return TagEnd, 0
+				return TagEnd, 0, 0
 			}
 		} else {
 			if pos > 2 && buf[1] == '0' {
 				// Integers cannot have a leading zero after minus.
-				return TagEnd, 0
+				return TagEnd, 0, 0
 			}
 		}
 		i64, err := strconv.ParseInt(string(buf[:pos]), 10, 64)
 		if err == nil {
-			return TagInteger, uint64(i64)
+			return TagInteger, uint64(i64), 0
 		}
+		if errors.Is(err, strconv.ErrRange) {
+			flags |= uint64(FloatOverflowedInteger)
+		}
+
 		if found&isMinusFlag == 0 {
 			u64, err := strconv.ParseUint(string(buf[:pos]), 10, 64)
 			if err == nil {
-				return TagUint, u64
+				return TagUint, u64, 0
+			}
+			if errors.Is(err, strconv.ErrRange) {
+				flags |= uint64(FloatOverflowedInteger)
 			}
 		}
+	} else if found&isFloatOnlyFlag == 0 {
+		flags |= uint64(FloatOverflowedInteger)
 	}
 
 	if pos > 1 && buf[0] == '0' && isNumberRune[buf[1]]&isFloatOnlyFlag == 0 {
 		// Float can only have a leading 0 when followed by a period.
-		return TagEnd, 0
+		return TagEnd, 0, 0
 	}
 	f64, err := strconv.ParseFloat(string(buf[:pos]), 64)
 	if err == nil {
-		return TagFloat, math.Float64bits(f64)
+		return TagFloat, math.Float64bits(f64), flags
 	}
-	return TagEnd, 0
+	return TagEnd, 0, 0
 }
diff --git a/parse_number_test.go b/parse_number_test.go
@@ -31,7 +31,7 @@ func TestNumberIsValid(t *testing.T) {
 	// From: https://stackoverflow.com/a/13340826
 	var jsonNumberRegexp = regexp.MustCompile(`^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$`)
 	isValidNumber := func(s string) bool {
-		tag, _ := parseNumber([]byte(s))
+		tag, _, _ := parseNumber([]byte(s))
 		return tag != TagEnd
 	}
 	validTests := []string{

diff --git a/parsed_json.go b/parsed_json.go
@@ -42,6 +42,32 @@ const STRINGBUFMASK = 0x7fffffffffffff
 
 const maxdepth = 128
 
+// FloatFlags is a set of FloatFlag bits recorded while parsing a number as float.
+type FloatFlags uint64
+
+// FloatFlag is a single flag bit that can be recorded while parsing a float.
+type FloatFlag uint64
+
+const (
+	// FloatOverflowedInteger is set when the number in JSON was in integer notation,
+	// but under/overflowed both int64 and uint64 and was therefore parsed as float.
+	FloatOverflowedInteger FloatFlag = 1 << iota
+)
+
+// Contains reports whether every bit of flag is set in f.
+func (f FloatFlags) Contains(flag FloatFlag) bool {
+	return flag == flag&FloatFlag(f)
+}
+
+// Flags returns f as a FloatFlags set, OR-ed with any additional flags in more.
+func (f FloatFlag) Flags(more ...FloatFlag) FloatFlags {
+	merged := FloatFlags(f)
+	for i := range more {
+		merged |= FloatFlags(more[i])
+	}
+	return merged
+}
+
 type ParsedJson struct {
 	Message []byte
 	Tape    []uint64
@@ -63,13 +89,13 @@ type indexChan struct {
 
 type internalParsedJson struct {
 	ParsedJson
-	containing_scope_offset []uint64
-	isvalid                 bool
-	index_chan              chan indexChan
-	indexesChan             indexChan
-	buffers                 [indexSlots][indexSize]uint32
-	buffers_offset          uint64
-	ndjson                  uint64
+	containingScopeOffset []uint64
+	isvalid               bool
+	indexChans            chan indexChan
+	indexesChan           indexChan
+	buffers               [indexSlots][indexSize]uint32
+	buffersOffset         uint64
+	ndjson                uint64
 }
 
 // Iter returns a new Iter.
@@ -479,6 +505,34 @@ func (i *Iter) Float() (float64, error) {
 	}
 }
 
+// FloatFlags returns the next element as a float64 together with the flags
+// recorded for it during parsing, such as FloatOverflowedInteger.
+// Integers are automatically converted to float.
+// An error is returned if the tape has no value at the current offset or if
+// the element is not a number.
+func (i *Iter) FloatFlags() (float64, FloatFlags, error) {
+	switch i.t {
+	case TagFloat:
+		if i.off >= len(i.tape.Tape) {
+			return 0, 0, errors.New("corrupt input: expected float, but no more values on tape")
+		}
+		v := math.Float64frombits(i.tape.Tape[i.off])
+		// Parse flags are only ever produced for float values, so they must be
+		// reported here. NOTE(review): assumes i.cur carries the flag bits stored
+		// next to the tag, as the TagUint case below already does - confirm.
+		return v, FloatFlags(i.cur), nil
+	case TagInteger:
+		if i.off >= len(i.tape.Tape) {
+			return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
+		}
+		v := int64(i.tape.Tape[i.off])
+		return float64(v), 0, nil
+	case TagUint:
+		if i.off >= len(i.tape.Tape) {
+			return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
+		}
+		v := i.tape.Tape[i.off]
+		return float64(v), FloatFlags(i.cur), nil
+	default:
+		return 0, 0, fmt.Errorf("unable to convert type %v to float", i.t)
+	}
+}
+
 // Int returns the integer value of the next element.
 // Integers and floats within range are automatically converted.
 func (i *Iter) Int() (int64, error) {
@@ -771,6 +825,10 @@ func (pj *ParsedJson) writeTapeTagVal(tag Tag, val uint64) {
 	pj.Tape = append(pj.Tape, uint64(tag)<<56, val)
 }
 
+// writeTapeTagValFlags appends two entries to the tape: the tag shifted left
+// by 56 bits and OR-ed with flags, followed by val.
+func (pj *ParsedJson) writeTapeTagValFlags(tag Tag, val, flags uint64) {
+	header := uint64(tag)<<56 | flags
+	pj.Tape = append(pj.Tape, header, val)
+}
+
 // write_tape_s64 writes val to the tape as a TagInteger entry, storing the
 // signed value's bits as a uint64.
 func (pj *ParsedJson) write_tape_s64(val int64) {
 	bits := uint64(val)
 	pj.writeTapeTagVal(TagInteger, bits)
 }