Merge pull request #6 from jg-rp/update-cts

Update CTS and fix
jg-rp · Aug 5, 2024 · 3fc3918 · 3fc3918
2 parents af3c18a + 967e1f9
commit 3fc3918
Show file tree

Hide file tree

Showing 9 changed files with 165 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Python JSONPath RFC 9535 Change Log
 
+## Version 0.1.3 (unreleased)
+
+**Fixes**
+
+- Fixed decoding of escape sequences in quoted name selectors and string literals. We now raise a `JSONPathSyntaxError` for invalid code points.
+- Fixed parsing of number literals with an exponent. We now allow 'e' to be upper case.
+- Fixed handling of trailing commas in bracketed segments. We now raise a `JSONPathSyntaxError` in such cases.
+- Fixed handling of invalid number literals. We now raise a syntax error for invalid leading zeros and extra negative signs.
+
 ## Version 0.1.2
 
 **Fixes**

diff --git a/jsonpath_rfc9535/__about__.py b/jsonpath_rfc9535/__about__.py
@@ -1 +1 @@
-__version__ = "0.1.2"
+__version__ = "0.1.3"
diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
@@ -18,8 +18,8 @@
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
 RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"e[+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"e-[0-9]+")
+RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
+RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
 RE_AND = re.compile(r"&&")
 RE_OR = re.compile(r"\|\|")

diff --git a/jsonpath_rfc9535/parse.py b/jsonpath_rfc9535/parse.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import json
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Dict
@@ -312,6 +311,7 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
             if stream.peek.type_ != TokenType.RBRACKET:
                 stream.expect_peek(TokenType.COMMA)
                 stream.next_token()
+                stream.expect_peek_not(TokenType.RBRACKET, "unexpected trailing comma")
 
             stream.next_token()
 
@@ -362,11 +362,29 @@ def parse_string_literal(self, stream: TokenStream) -> Expression:
         )
 
     def parse_integer_literal(self, stream: TokenStream) -> Expression:
+        value = stream.current.value
+        if value.startswith("0") and len(value) > 1:
+            raise JSONPathSyntaxError("invalid integer literal", token=stream.current)
+
         # Convert to float first to handle scientific notation.
-        return IntegerLiteral(stream.current, value=int(float(stream.current.value)))
+        try:
+            return IntegerLiteral(stream.current, value=int(float(value)))
+        except ValueError as err:
+            raise JSONPathSyntaxError(
+                "invalid integer literal", token=stream.current
+            ) from err
 
     def parse_float_literal(self, stream: TokenStream) -> Expression:
-        return FloatLiteral(stream.current, value=float(stream.current.value))
+        value = stream.current.value
+        if value.startswith("0") and len(value.split(".")[0]) > 1:
+            raise JSONPathSyntaxError("invalid float literal", token=stream.current)
+
+        try:
+            return FloatLiteral(stream.current, value=float(stream.current.value))
+        except ValueError as err:
+            raise JSONPathSyntaxError(
+                "invalid float literal", token=stream.current
+            ) from err
 
     def parse_prefix_expression(self, stream: TokenStream) -> Expression:
         tok = stream.next_token()
@@ -514,12 +532,127 @@ def _decode_string_literal(self, token: Token) -> str:
             value = token.value.replace('"', '\\"').replace("\\'", "'")
         else:
             value = token.value
-        try:
-            rv = json.loads(f'"{value}"')
-            assert isinstance(rv, str)
-            return rv
-        except json.JSONDecodeError as err:
-            raise JSONPathSyntaxError(str(err).split(":")[1], token=token) from None
+
+        return self._unescape_string(value, token)
+
+    def _unescape_string(self, value: str, token: Token) -> str:
+        unescaped: List[str] = []
+        index = 0
+
+        while index < len(value):
+            ch = value[index]
+            if ch == "\\":
+                index += 1
+                _ch, index = self._decode_escape_sequence(value, index, token)
+                unescaped.append(_ch)
+            else:
+                self._string_from_codepoint(ord(ch), token)
+                unescaped.append(ch)
+            index += 1
+        return "".join(unescaped)
+
+    def _decode_escape_sequence(  # noqa: PLR0911
+        self, value: str, index: int, token: Token
+    ) -> Tuple[str, int]:
+        ch = value[index]
+        if ch == '"':
+            return '"', index
+        if ch == "\\":
+            return "\\", index
+        if ch == "/":
+            return "/", index
+        if ch == "b":
+            return "\x08", index
+        if ch == "f":
+            return "\x0c", index
+        if ch == "n":
+            return "\n", index
+        if ch == "r":
+            return "\r", index
+        if ch == "t":
+            return "\t", index
+        if ch == "u":
+            codepoint, index = self._decode_hex_char(value, index, token)
+            return self._string_from_codepoint(codepoint, token), index
+
+        raise JSONPathSyntaxError(
+            f"unknown escape sequence at index {token.index + index - 1}",
+            token=token,
+        )
+
+    def _decode_hex_char(self, value: str, index: int, token: Token) -> Tuple[int, int]:
+        length = len(value)
+
+        if index + 4 >= length:
+            raise JSONPathSyntaxError(
+                f"incomplete escape sequence at index {token.index + index - 1}",
+                token=token,
+            )
+
+        index += 1  # move past 'u'
+        codepoint = self._parse_hex_digits(value[index : index + 4], token)
+
+        if self._is_low_surrogate(codepoint):
+            raise JSONPathSyntaxError(
+                f"unexpected low surrogate at index {token.index + index - 1}",
+                token=token,
+            )
+
+        if self._is_high_surrogate(codepoint):
+            # expect a surrogate pair
+            if not (
+                index + 9 < length
+                and value[index + 4] == "\\"
+                and value[index + 5] == "u"
+            ):
+                raise JSONPathSyntaxError(
+                    f"incomplete escape sequence at index {token.index + index - 2}",
+                    token=token,
+                )
+
+            low_surrogate = self._parse_hex_digits(value[index + 6 : index + 10], token)
+
+            if not self._is_low_surrogate(low_surrogate):
+                raise JSONPathSyntaxError(
+                    f"unexpected codepoint at index {token.index + index + 4}",
+                    token=token,
+                )
+
+            codepoint = 0x10000 + (
+                ((codepoint & 0x03FF) << 10) | (low_surrogate & 0x03FF)
+            )
+
+            return (codepoint, index + 9)
+
+        return (codepoint, index + 3)
+
+    def _parse_hex_digits(self, digits: str, token: Token) -> int:
+        codepoint = 0
+        for digit in digits.encode():
+            codepoint <<= 4
+            if digit >= 48 and digit <= 57:
+                codepoint |= digit - 48
+            elif digit >= 65 and digit <= 70:
+                codepoint |= digit - 65 + 10
+            elif digit >= 97 and digit <= 102:
+                codepoint |= digit - 97 + 10
+            else:
+                raise JSONPathSyntaxError(
+                    "invalid \\uXXXX escape sequence",
+                    token=token,
+                )
+        return codepoint
+
+    def _string_from_codepoint(self, codepoint: int, token: Token) -> str:
+        if codepoint <= 0x1F:
+            raise JSONPathSyntaxError("invalid character", token=token)
+        return chr(codepoint)
+
+    def _is_high_surrogate(self, codepoint: int) -> bool:
+        return codepoint >= 0xD800 and codepoint <= 0xDBFF
+
+    def _is_low_surrogate(self, codepoint: int) -> bool:
+        return codepoint >= 0xDC00 and codepoint <= 0xDFFF
 
     def _raise_for_non_comparable_function(
         self, expr: Expression, token: Token

diff --git a/jsonpath_rfc9535/tokens.py b/jsonpath_rfc9535/tokens.py
@@ -193,3 +193,8 @@ def expect_peek(self, *typ: TokenType) -> None:
                 f"expected {_typ}, found {self.peek.type_.name!r}",
                 token=self.peek,
             )
+
+    def expect_peek_not(self, typ: TokenType, message: str) -> None:
+        """Raise an exception if the next token type is _typ_."""
+        if self.peek.type_ == typ:
+            raise JSONPathSyntaxError(message, token=self.peek)
diff --git a/pyproject.toml b/pyproject.toml
@@ -179,6 +179,7 @@ convention = "google"
 "scripts/__init__.py" = ["D104"]
 "tests/*" = ["D100", "D101", "D104", "D103"]
 "jsonpath_rfc9535/lex.py" = ["E741"]
+"jsonpath_rfc9535/parse.py" = ["PLR2004"]
 "jsonpath_rfc9535/utils/nondeterministic_descent.py" = [
   "D103",
   "D101",

diff --git a/tests/cts b/tests/cts
diff --git a/tests/test_compliance.py b/tests/test_compliance.py
@@ -7,6 +7,7 @@
 import json
 import operator
 from dataclasses import dataclass
+from dataclasses import field
 from typing import Any
 from typing import Dict
 from typing import List
@@ -26,6 +27,7 @@ class Case:
     result: Any = None
     results: Optional[List[Any]] = None
     invalid_selector: Optional[bool] = None
+    tags: List[str] = field(default_factory=list)
 
 
 SKIP: Dict[str, str] = {}

diff --git a/tests/test_cts_nondeterminism.py b/tests/test_cts_nondeterminism.py
@@ -7,6 +7,7 @@
 import json
 import operator
 from dataclasses import dataclass
+from dataclasses import field
 from typing import Any
 from typing import List
 from typing import Optional
@@ -26,6 +27,7 @@ class Case:
     result: Any = None
     results: Optional[List[Any]] = None
     invalid_selector: Optional[bool] = None
+    tags: List[str] = field(default_factory=list)
 
 
 def cases() -> List[Case]:
+39 −0		CONTRIBUTING.md
+73 −10		build.js
+2,448 −273		cts.json
+10 −0		cts.schema.json
+15 −2		tests/basic.json
+231 −36		tests/filter.json
+22 −11		tests/functions/count.json
+28 −14		tests/functions/length.json
+48 −24		tests/functions/match.json
+48 −24		tests/functions/search.json
+10 −5		tests/functions/value.json
+78 −11		tests/index_selector.json
+264 −83		tests/name_selector.json
+264 −39		tests/slice_selector.json
+33 −17		tests/whitespace/filter.json
+56 −28		tests/whitespace/functions.json
+144 −72		tests/whitespace/operators.json
+72 −36		tests/whitespace/selectors.json
+32 −16		tests/whitespace/slice.json