google · seizethedave · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024
diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
@@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
  l.tokenStartLoc = l.location()
 }
 
+// tokenKindPostprocessors maps a token kind to an optional function that
+// rewrites the lexed token text before the token is stored in the tokens list.
+var tokenKindPostprocessors = map[tokenKind]func(string) string{
+	tokenNumber: func(lexeme string) string {
+		// Underscores are digit separators only; drop them from the stored text.
+		return strings.ReplaceAll(lexeme, "_", "")
+	},
+}
+
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
+ // Run the postprocessor if the token kind has one defined.
+ if pp, ok := tokenKindPostprocessors[kind]; ok {
+ data = pp(data)
+ }
+
  l.tokens = append(l.tokens, token{
  kind: kind,
  fodder: l.fodder,
@@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
 // that the next rune to be served by the lexer will be a leading digit.
 func (l *lexer) lexNumber() error {
  // This function should be understood with reference to the linked image:
- // http://www.json.org/number.gif
+ // https://www.json.org/img/number.png
 
  // Note, we deviate from the json.org documentation as follows:
  // There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@@ -465,9 +479,11 @@ func (l *lexer) lexNumber() error {
  numAfterOneToNine
  numAfterDot
  numAfterDigit
+ numAfterUnderscore
  numAfterE
  numAfterExpSign
  numAfterExpDigit
+ numAfterExpUnderscore
  )
 
  state := numBegin
@@ -492,6 +508,9 @@ outerLoop:
  state = numAfterDot
  case 'e', 'E':
  state = numAfterE
+ case '_':
+ state = numAfterUnderscore
+
  default:
  break outerLoop
  }
@@ -503,6 +522,8 @@ outerLoop:
  state = numAfterE
  case r >= '0' && r <= '9':
  state = numAfterOneToNine
+ case r == '_':
+ state = numAfterUnderscore
  default:
  break outerLoop
  }
@@ -521,9 +542,22 @@ outerLoop:
  state = numAfterE
  case r >= '0' && r <= '9':
  state = numAfterDigit
+ case r == '_':
+ state = numAfterUnderscore
  default:
  break outerLoop
  }
+
+ case numAfterUnderscore:
+ // The only valid transition out of _ is to a digit.
+ switch {
+ case r >= '0' && r <= '9':
+ state = numAfterOneToNine
+ default:
+ return l.makeStaticErrorPoint(
+ fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+ l.location())
+ }
  case numAfterE:
  switch {
  case r == '+' || r == '-':
@@ -545,12 +579,27 @@ outerLoop:
  }
 
  case numAfterExpDigit:
- if r >= '0' && r <= '9' {
+ switch {
+ case r >= '0' && r <= '9':
  state = numAfterExpDigit
- } else {
+ case r == '_':
+ state = numAfterExpUnderscore
+ default:
  break outerLoop
  }
+
+ case numAfterExpUnderscore:
+ // The only valid transition out of _ is to a digit.
+ switch {
+ case r >= '0' && r <= '9':
+ state = numAfterExpDigit
+ default:
+ return l.makeStaticErrorPoint(
+ fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+ l.location())
+ }
  }
+
  l.next()
  }
 
@@ -965,7 +1014,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
  fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
  l.location())
  }
-
  }
  }
 

diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
- http://www.apache.org/licenses/LICENSE-2.0
+ http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -16,6 +16,7 @@ limitations under the License.
 package parser
 
 import (
+ "fmt"
  "testing"
 
  "github.com/google/go-jsonnet/ast"
@@ -314,6 +315,39 @@ func TestNumber1epExc(t *testing.T) {
  SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
 }
 
+// TestNumberSeparators runs the lexer over numbers written with '_' separators.
+func TestNumberSeparators(t *testing.T) {
+	cases := []struct {
+		input, err string
+		tokens     Tokens
+	}{
+		{"123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}},
+		{"1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}},
+		{"1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}},
+		{"3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}},
+		{"01_100", "", Tokens{{kind: tokenNumber, data: "0"}, {kind: tokenNumber, data: "1100"}}},
+		{"1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}},
+		{"0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}},
+		{"10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}},
+		{"2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}},
+		{"1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}},
+		{"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}},
+		{"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}},
+		{"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}},
+		{"123__456", "snippet:1:5 Couldn't lex number, junk after '_': '_'", Tokens{}},
+		{"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}},
+		{"1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}},
+		{"1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}},
+		{"1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}},
+		{"200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
+		{"200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
+	}
+	for _, tc := range cases {
+		run := func(t *testing.T) { SingleTest(t, tc.input, tc.err, tc.tokens) }
+		t.Run(fmt.Sprintf("number %s", tc.input), run)
+	}
+}
+
 func TestDoublestring1(t *testing.T) {
  SingleTest(t, "\"hi\"", "", Tokens{
  {kind: tokenStringDouble, data: "hi"},
@@ -491,6 +525,12 @@ func TestIdentifiers(t *testing.T) {
  })
 }
 
+// TestIdentifierUnderscore checks that "_123" lexes as an identifier token.
+func TestIdentifierUnderscore(t *testing.T) {
+	want := Tokens{{kind: tokenIdentifier, data: "_123"}}
+	SingleTest(t, "_123", "", want)
+}
+
 func TestCppComment(t *testing.T) {
  SingleTest(t, "// hi", "", Tokens{
  {kind: tokenEndOfFile, fodder: ast.Fodder{{Kind: ast.FodderParagraph, Comment: []string{"// hi"}}}},