Skip to content

Commit

Permalink
Indicate float conversion due to overflows (#31)
Browse files Browse the repository at this point in the history
Add a flag to the tape that indicates that an integer was converted to float because it exceeded the int64/uint64 limits.

Fixes #25
  • Loading branch information
klauspost authored Jan 25, 2021
1 parent 3d975b7 commit c66cb85
Show file tree
Hide file tree
Showing 10 changed files with 204 additions and 79 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,28 @@ method to get an iterator.
There are methods that allow you to retrieve all elements as a single type,
[]int64, []uint64, float64 and strings.

## Number parsing

Numbers in JSON are untyped and are returned by the following rules in order:

* If there is any floating point notation, such as an exponent or a decimal point, it is always returned as float.
* If the number is a pure integer and it fits within an int64 it is returned as such.
* If the number is a pure positive integer and fits within a uint64 it is returned as such.
* If the number is otherwise a valid number it is returned as float64.

If the number was converted from integer notation to a float due to not fitting inside int64/uint64,
the `FloatOverflowedInteger` flag is set, which can be retrieved using the `(Iter).FloatFlags()` method.

JSON numbers follow JavaScript’s double-precision floating-point format.

* Represented in base 10 with no superfluous leading zeros (e.g. 67, 1, 100).
* Include digits between 0 and 9.
* Can be a negative number (e.g. -10).
* Can be a fraction (e.g. 0.5); the decimal point must be preceded and followed by a digit.
* Can also have a base-10 exponent, prefixed by e or E with an optional plus or minus sign to indicate positive or negative exponentiation.
* Octal and hexadecimal formats are not supported.
* Can not have a value of NaN (Not A Number) or Infinity.

## Parsing NDJSON stream

Newline delimited json is sent as packets with each line being a root element.
Expand Down
12 changes: 6 additions & 6 deletions parse_json_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ func (pj *internalParsedJson) initialize(size int) {
pj.Strings = make([]byte, 0, stringsSize)
}
pj.Strings = pj.Strings[:0]
if cap(pj.containing_scope_offset) < maxdepth {
pj.containing_scope_offset = make([]uint64, 0, maxdepth)
if cap(pj.containingScopeOffset) < maxdepth {
pj.containingScopeOffset = make([]uint64, 0, maxdepth)
}
pj.containing_scope_offset = pj.containing_scope_offset[:0]
pj.containingScopeOffset = pj.containingScopeOffset[:0]
}

func (pj *internalParsedJson) parseMessage(msg []byte) error {
Expand Down Expand Up @@ -75,8 +75,8 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
// Make the capacity of the channel smaller than the number of slots.
// This way the sender will automatically block until the consumer
// has finished the slot it is working on.
pj.index_chan = make(chan indexChan, indexSlots-2)
pj.buffers_offset = ^uint64(0)
pj.indexChans = make(chan indexChan, indexSlots-2)
pj.buffersOffset = ^uint64(0)

var errStage1 error
go func() {
Expand All @@ -89,7 +89,7 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
if !unifiedMachine(pj.Message, pj) {
err = errors.New("Bad parsing while executing stage 2")
// drain the channel until empty
for range pj.index_chan {
for range pj.indexChans {
}
}
wg.Done()
Expand Down
43 changes: 26 additions & 17 deletions parse_json_amd64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func BenchmarkNdjsonStage1(b *testing.B) {

for i := 0; i < b.N; i++ {
// Create new channel (large enough so we won't block)
pj.index_chan = make(chan indexChan, 128*10240)
pj.indexChans = make(chan indexChan, 128*10240)
findStructuralIndices([]byte(ndjson), &pj)
}
}
Expand Down Expand Up @@ -210,24 +210,30 @@ func TestParseNumber(t *testing.T) {
expectedD float64
expectedI int64
expectedU uint64
flags FloatFlags
}{
{"1", TagInteger, 0.0, 1, 0},
{"-1", TagInteger, 0.0, -1, 0},
{"10000000000000000000", TagUint, 0.0, 0, 10000000000000000000},
{"10000000000000000001", TagUint, 0.0, 0, 10000000000000000001},
{"-10000000000000000000", TagFloat, -10000000000000000000, 0, 0},
{"1.0", TagFloat, 1.0, 0, 0},
{"1234567890", TagInteger, 0.0, 1234567890, 0},
{"9876.543210", TagFloat, 9876.543210, 0, 0},
{"0.123456789e-12", TagFloat, 1.23456789e-13, 0, 0},
{"1.234567890E+34", TagFloat, 1.234567890e+34, 0, 0},
{"23456789012E66", TagFloat, 23456789012e66, 0, 0},
{"-9876.543210", TagFloat, -9876.543210, 0, 0},
{"-65.619720000000029", TagFloat, -65.61972000000003, 0, 0},
{input: "1", wantTag: TagInteger, expectedI: 1},
{input: "-1", wantTag: TagInteger, expectedI: -1},
{input: "10000000000000000000", wantTag: TagUint, expectedU: 10000000000000000000},
{input: "10000000000000000001", wantTag: TagUint, expectedU: 10000000000000000001},
// math.MinInt64 - 1
{input: "-9223372036854775809", wantTag: TagFloat, expectedD: -9.223372036854776e+18, flags: FloatOverflowedInteger.Flags()},
{input: "-10000000000000000000", wantTag: TagFloat, expectedD: -10000000000000000000, flags: FloatOverflowedInteger.Flags()},
{input: "100000000000000000000", wantTag: TagFloat, expectedD: 100000000000000000000, flags: FloatOverflowedInteger.Flags()},
// math.MaxUint64 +1
{input: "18446744073709551616", wantTag: TagFloat, expectedD: 1.8446744073709552e+19, flags: FloatOverflowedInteger.Flags()},
{input: "1.0", wantTag: TagFloat, expectedD: 1.0},
{input: "1234567890", wantTag: TagInteger, expectedI: 1234567890},
{input: "9876.543210", wantTag: TagFloat, expectedD: 9876.543210},
{input: "0.123456789e-12", wantTag: TagFloat, expectedD: 1.23456789e-13},
{input: "1.234567890E+34", wantTag: TagFloat, expectedD: 1.234567890e+34},
{input: "23456789012E66", wantTag: TagFloat, expectedD: 23456789012e66},
{input: "-9876.543210", wantTag: TagFloat, expectedD: -9876.543210},
{input: "-65.619720000000029", wantTag: TagFloat, expectedD: -65.61972000000003},
}

for _, tc := range testCases {
tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
tag, val, flags := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
if tag != tc.wantTag {
t.Errorf("TestParseNumber: got: %v want: %v", tag, tc.wantTag)
}
Expand All @@ -246,6 +252,9 @@ func TestParseNumber(t *testing.T) {
t.Errorf("TestParseNumber: got: %d want: %d", val, tc.expectedU)
}
}
if flags != uint64(tc.flags) {
t.Errorf("TestParseNumber flags; got: %d want: %d", flags, tc.flags)
}
}
}

Expand Down Expand Up @@ -295,7 +304,7 @@ func TestParseInt64(t *testing.T) {
test := &parseInt64Tests[i]
t.Run(test.in, func(t *testing.T) {

tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
if tag != test.tag {
// Ignore intentionally bad syntactical errors
t.Errorf("TestParseInt64: got: %v want: %v", tag, test.tag)
Expand Down Expand Up @@ -478,7 +487,7 @@ func TestParseFloat64(t *testing.T) {
for i := 0; i < len(atoftests); i++ {
test := &atoftests[i]
t.Run(test.in, func(t *testing.T) {
tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
switch tag {
case TagEnd:
if test.err == nil {
Expand Down
32 changes: 21 additions & 11 deletions parse_number_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
package simdjson

import (
"errors"
"math"
"strconv"
)
Expand Down Expand Up @@ -63,29 +64,29 @@ var isNumberRune = [256]uint8{
// parseNumber will parse the number starting in the buffer.
// Any non-number characters at the end will be ignored.
// Returns TagEnd if no valid value could be found.
func parseNumber(buf []byte) (tag Tag, val uint64) {
func parseNumber(buf []byte) (tag Tag, val, flags uint64) {
pos := 0
found := uint8(0)
for i, v := range buf {
t := isNumberRune[v]
if t == 0 {
//fmt.Println("aborting on", string(v), "in", string(buf[:i]))
return TagEnd, 0
return TagEnd, 0, 0
}
if t == isEOVFlag {
break
}
if t&isMustHaveDigitNext > 0 {
// A period and minus must be followed by a digit
if len(buf) < i+2 || isNumberRune[buf[i+1]]&isDigitFlag == 0 {
return TagEnd, 0
return TagEnd, 0, 0
}
}
found |= t
pos = i + 1
}
if pos == 0 {
return TagEnd, 0
return TagEnd, 0, 0
}
const maxIntLen = 20

Expand All @@ -94,33 +95,42 @@ func parseNumber(buf []byte) (tag Tag, val uint64) {
if found&isMinusFlag == 0 {
if pos > 1 && buf[0] == '0' {
// Integers cannot have a leading zero.
return TagEnd, 0
return TagEnd, 0, 0
}
} else {
if pos > 2 && buf[1] == '0' {
// Integers cannot have a leading zero after minus.
return TagEnd, 0
return TagEnd, 0, 0
}
}
i64, err := strconv.ParseInt(string(buf[:pos]), 10, 64)
if err == nil {
return TagInteger, uint64(i64)
return TagInteger, uint64(i64), 0
}
if errors.Is(err, strconv.ErrRange) {
flags |= uint64(FloatOverflowedInteger)
}

if found&isMinusFlag == 0 {
u64, err := strconv.ParseUint(string(buf[:pos]), 10, 64)
if err == nil {
return TagUint, u64
return TagUint, u64, 0
}
if errors.Is(err, strconv.ErrRange) {
flags |= uint64(FloatOverflowedInteger)
}
}
} else if found&isFloatOnlyFlag == 0 {
flags |= uint64(FloatOverflowedInteger)
}

if pos > 1 && buf[0] == '0' && isNumberRune[buf[1]]&isFloatOnlyFlag == 0 {
// Float can only have a leading 0 when followed by a period.
return TagEnd, 0
return TagEnd, 0, 0
}
f64, err := strconv.ParseFloat(string(buf[:pos]), 64)
if err == nil {
return TagFloat, math.Float64bits(f64)
return TagFloat, math.Float64bits(f64), flags
}
return TagEnd, 0
return TagEnd, 0, 0
}
2 changes: 1 addition & 1 deletion parse_number_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func TestNumberIsValid(t *testing.T) {
// From: https://stackoverflow.com/a/13340826
var jsonNumberRegexp = regexp.MustCompile(`^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$`)
isValidNumber := func(s string) bool {
tag, _ := parseNumber([]byte(s))
tag, _, _ := parseNumber([]byte(s))
return tag != TagEnd
}
validTests := []string{
Expand Down
72 changes: 65 additions & 7 deletions parsed_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,32 @@ const STRINGBUFMASK = 0x7fffffffffffff

const maxdepth = 128

// FloatFlags is a set of FloatFlag bits recorded when converting floats.
type FloatFlags uint64

// FloatFlag is a single flag bit recorded when parsing floats.
type FloatFlag uint64

const (
	// FloatOverflowedInteger is set when the number in the JSON input used
	// integer notation but under/overflowed both int64 and uint64,
	// and was therefore parsed as a float.
	FloatOverflowedInteger FloatFlag = 1 << iota
)

// Contains reports whether every bit of flag is set in f.
func (f FloatFlags) Contains(flag FloatFlag) bool {
	// Any bit of flag that is absent from f survives the AND NOT.
	missing := flag &^ FloatFlag(f)
	return missing == 0
}

// Flags converts the flag to FloatFlags, merging in any additional flags given.
func (f FloatFlag) Flags(more ...FloatFlag) FloatFlags {
	merged := FloatFlags(f)
	for idx := range more {
		merged |= FloatFlags(more[idx])
	}
	return merged
}

type ParsedJson struct {
Message []byte
Tape []uint64
Expand All @@ -63,13 +89,13 @@ type indexChan struct {

type internalParsedJson struct {
ParsedJson
containing_scope_offset []uint64
isvalid bool
index_chan chan indexChan
indexesChan indexChan
buffers [indexSlots][indexSize]uint32
buffers_offset uint64
ndjson uint64
containingScopeOffset []uint64
isvalid bool
indexChans chan indexChan
indexesChan indexChan
buffers [indexSlots][indexSize]uint32
buffersOffset uint64
ndjson uint64
}

// Iter returns a new Iter.
Expand Down Expand Up @@ -479,6 +505,34 @@ func (i *Iter) Float() (float64, error) {
}
}

// FloatFlags returns the float value of the next element,
// together with the flags recorded while the number was parsed
// (for example FloatOverflowedInteger).
// Integers are automatically converted to float.
//
// An error is returned when the tape has no value at the current offset,
// or when the element is not a float, integer or unsigned integer.
func (i *Iter) FloatFlags() (float64, FloatFlags, error) {
	switch i.t {
	case TagFloat:
		if i.off >= len(i.tape.Tape) {
			return 0, 0, errors.New("corrupt input: expected float, but no more values on tape")
		}
		v := math.Float64frombits(i.tape.Tape[i.off])
		// Report the parse flags for floats as well. Returning 0 here made
		// it impossible to ever observe a flag on a float element.
		// NOTE(review): assumes i.cur holds the non-tag bits of the tag word,
		// which the TagUint case below already relies on — confirm.
		return v, FloatFlags(i.cur), nil
	case TagInteger:
		if i.off >= len(i.tape.Tape) {
			return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
		}
		// Reinterpret the raw tape word as a signed integer before converting.
		v := int64(i.tape.Tape[i.off])
		return float64(v), 0, nil
	case TagUint:
		if i.off >= len(i.tape.Tape) {
			return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
		}
		v := i.tape.Tape[i.off]
		return float64(v), FloatFlags(i.cur), nil
	default:
		return 0, 0, fmt.Errorf("unable to convert type %v to float", i.t)
	}
}

// Int returns the integer value of the next element.
// Integers and floats within range are automatically converted.
func (i *Iter) Int() (int64, error) {
Expand Down Expand Up @@ -771,6 +825,10 @@ func (pj *ParsedJson) writeTapeTagVal(tag Tag, val uint64) {
pj.Tape = append(pj.Tape, uint64(tag)<<56, val)
}

// writeTapeTagValFlags appends a two-word entry to the tape: a header word
// with the tag in the top 8 bits OR'ed with flags, followed by val.
func (pj *ParsedJson) writeTapeTagValFlags(tag Tag, val, flags uint64) {
	header := (uint64(tag) << 56) | flags
	pj.Tape = append(pj.Tape, header, val)
}

func (pj *ParsedJson) write_tape_s64(val int64) {
pj.writeTapeTagVal(TagInteger, uint64(val))
}
Expand Down
Loading

0 comments on commit c66cb85

Please sign in to comment.