From b8409543eb4b3451a09e311f52fd58180bf81969 Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 07:34:21 +0000
Subject: [PATCH 1/3] Improve filter selector class and parse function names

---
 jsonpath_rfc9535/parse.py     | 23 +++++++++++------------
 jsonpath_rfc9535/query.py     |  4 ++--
 jsonpath_rfc9535/selectors.py |  4 ++--
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/jsonpath_rfc9535/parse.py b/jsonpath_rfc9535/parse.py
index e722bc1..15b3da7 100644
--- a/jsonpath_rfc9535/parse.py
+++ b/jsonpath_rfc9535/parse.py
@@ -34,7 +34,7 @@ from .segments import JSONPathChildSegment
 from .segments import JSONPathRecursiveDescentSegment
 from .segments import JSONPathSegment
-from .selectors import Filter
+from .selectors import FilterSelector
 from .selectors import IndexSelector
 from .selectors import JSONPathSelector
 from .selectors import NameSelector
@@ -113,9 +113,6 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
             TokenType.TRUE: self.parse_boolean,
         }
 
-        # TODO: can a function argument be a grouped expression?
-        # TODO: can a function argument contain a !?
-
         self.function_argument_map: Dict[
             TokenType, Callable[[TokenStream], Expression]
         ] = {
@@ -291,7 +288,7 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
                 )
             )
         elif stream.current.type_ == TokenType.FILTER:
-            selectors.append(self.parse_filter(stream))
+            selectors.append(self.parse_filter_selector(stream))
         elif stream.current.type_ == TokenType.EOF:
             raise JSONPathSyntaxError(
                 "unexpected end of query", token=stream.current
             )
@@ -320,9 +317,9 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
 
         return selectors
 
-    def parse_filter(self, stream: TokenStream) -> Filter:
+    def parse_filter_selector(self, stream: TokenStream) -> FilterSelector:
         tok = stream.next_token()
-        expr = self.parse_filter_selector(stream)
+        expr = self.parse_filter_expression(stream)
 
         if isinstance(expr, FunctionExtension):
             func = self.env.function_extensions.get(expr.name)
@@ -342,7 +339,7 @@
                     token=expr.token,
                 )
 
-        return Filter(
+        return FilterSelector(
             env=self.env,
             token=tok,
             expression=FilterExpression(token=expr.token, expression=expr),
         )
@@ -392,7 +389,9 @@ def parse_prefix_expression(self, stream: TokenStream) -> Expression:
         return PrefixExpression(
             tok,
             operator="!",
-            right=self.parse_filter_selector(stream, precedence=self.PRECEDENCE_PREFIX),
+            right=self.parse_filter_expression(
+                stream, precedence=self.PRECEDENCE_PREFIX
+            ),
         )
 
     def parse_infix_expression(
@@ -400,7 +399,7 @@
     ) -> Expression:
         tok = stream.next_token()
         precedence = self.PRECEDENCES.get(tok.type_, self.PRECEDENCE_LOWEST)
-        right = self.parse_filter_selector(stream, precedence)
+        right = self.parse_filter_expression(stream, precedence)
         operator = self.BINARY_OPERATORS[tok.type_]
 
         if operator in self.COMPARISON_OPERATORS:
@@ -425,7 +424,7 @@
 
     def parse_grouped_expression(self, stream: TokenStream) -> Expression:
         stream.next_token()
-        expr = self.parse_filter_selector(stream)
+        expr = self.parse_filter_expression(stream)
         stream.next_token()
 
         while stream.current.type_ != TokenType.RPAREN:
@@ -497,7 +496,7 @@ def parse_function_extension(self, stream: TokenStream) -> Expression:
             ),
         )
 
-    def parse_filter_selector(
+    def parse_filter_expression(
         self, stream: TokenStream, precedence: int = PRECEDENCE_LOWEST
     ) -> Expression:
         try:
diff --git a/jsonpath_rfc9535/query.py b/jsonpath_rfc9535/query.py
index 35c6ea2..38ac4ce 100644
--- a/jsonpath_rfc9535/query.py
+++ b/jsonpath_rfc9535/query.py
@@ -1,4 +1,4 @@
-"""A compiled JSONPath ready to be applied to a JSON-like value."""
+"""A compiled JSONPath expression ready to be applied to JSON-like data."""
 
 from __future__ import annotations
 
@@ -20,7 +20,7 @@
 
 
 class JSONPathQuery:
-    """A compiled JSONPath expression ready to be applied to a JSON-like value.
+    """A compiled JSONPath expression ready to be applied to JSON-like data.
 
     Arguments:
         env: The `JSONPathEnvironment` this query is bound to.
diff --git a/jsonpath_rfc9535/selectors.py b/jsonpath_rfc9535/selectors.py
index 184ef50..c365402 100644
--- a/jsonpath_rfc9535/selectors.py
+++ b/jsonpath_rfc9535/selectors.py
@@ -213,7 +213,7 @@ def resolve(self, node: JSONPathNode) -> Iterable[JSONPathNode]:
             yield node.new_child(element, i)
 
 
-class Filter(JSONPathSelector):
+class FilterSelector(JSONPathSelector):
     """Filter array/list items or dict/object values with a filter expression."""
 
     __slots__ = ("expression",)
@@ -233,7 +233,7 @@ def __str__(self) -> str:
 
     def __eq__(self, __value: object) -> bool:
         return (
-            isinstance(__value, Filter)
+            isinstance(__value, FilterSelector)
             and self.expression == __value.expression
             and self.token == __value.token
        )
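The renames in this first patch separate two ideas the old names conflated: `parse_filter_selector` now builds the `FilterSelector` AST node, while `parse_filter_expression` is the precedence-climbing routine it delegates to (and which prefix, infix, and grouped expressions recurse into). The following sketch is a simplified, self-contained illustration of that split, with hypothetical token shapes and operator tables — it is not the library's actual API:

```python
from dataclasses import dataclass
from typing import List, Tuple, Union

Expr = Union["Infix", str]


@dataclass
class Infix:
    left: Expr
    op: str
    right: Expr


PRECEDENCES = {"||": 1, "&&": 2, "==": 3}
LOWEST = 0


def parse_filter_expression(
    tokens: List[str], pos: int = 0, precedence: int = LOWEST
) -> Tuple[Expr, int]:
    # Precedence climbing: parse one operand, then fold in any infix
    # operators that bind more tightly than `precedence`.
    left: Expr = tokens[pos]
    pos += 1
    while pos < len(tokens) and PRECEDENCES.get(tokens[pos], LOWEST) > precedence:
        op = tokens[pos]
        right, pos = parse_filter_expression(tokens, pos + 1, PRECEDENCES[op])
        left = Infix(left, op, right)
    return left, pos


@dataclass
class FilterSelector:
    # The selector owns a parsed expression; it is not itself an expression.
    expression: Expr


expr, _ = parse_filter_expression(["@.a", "==", "1", "||", "@.b"])
print(FilterSelector(expression=expr))
```

Here `==` groups before `||` because the recursive call consumes every operator of higher precedence before control returns to the outer loop — the same shape the renamed `parse_filter_expression` has in `parse.py`.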
From d5bf0934b62c54535b55145abecca9df0d60a2c4 Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 07:55:12 +0000
Subject: [PATCH 2/3] Lexer todos

---
 jsonpath_rfc9535/lex.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
index ba081be..8166abb 100644
--- a/jsonpath_rfc9535/lex.py
+++ b/jsonpath_rfc9535/lex.py
@@ -77,6 +77,7 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
+        # TODO: benchmark try/except approach
         if self.pos >= len(self.query):
             return ""
 
@@ -100,18 +101,18 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
+        # TODO: benchmark try/except without self.next()
         c = self.next()
         if c:
             self.backup()
         return c
 
-    def accept(self, pattern: Pattern[str]) -> bool:
-        """Increment the pointer if the current character matches _pattern_."""
-        c = self.next()
-        if pattern.match(c):
+    def accept(self, s: str) -> bool:
+        """Increment the pointer if the current position starts with _s_."""
+        # TODO: benchmark using accept instead of accept_match for known words
+        if self.query.startswith(s, self.pos):
+            self.pos += len(s)
             return True
-        if c:
-            self.backup()
         return False
 
     def accept_match(self, pattern: Pattern[str]) -> bool:
@@ -140,6 +141,8 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
+        # TODO: move msg out of Token.value. We'll need the value too when implementing
+        # better error messages.
         self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
 
 
@@ -147,6 +150,7 @@ def error(self, msg: str) -> None:
 
 
 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
+    # TODO: benchmark peek/next instead of next/backup
     c = l.next()
 
     if c != "$":
@@ -392,6 +396,7 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
         l.backup()
 
         # numbers
+        # TODO: try accept_match(RE_FLOAT), including negative exponent
        if l.accept_match(RE_INT):
            if l.peek() == ".":
                # A float
@@ -474,6 +479,7 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             l.next()
             continue
 
+        # TODO: replace use of `head` with peek
         if c == "\\" and not RE_ESCAPE.match(head):
             l.error("invalid escape")
             return None
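The TODOs in this patch all circle one question: for single-character access, is EAFP (`try`/`except IndexError`) faster than an explicit bounds check? A quick microbenchmark sketch — standard library only, with illustrative function names that are not part of the library — is enough to test the hypothesis before committing to either style:

```python
import timeit

query = "$.store.book[?@.price < 10].title"


def next_with_check(pos: int) -> str:
    # LBYL: a len() call and a comparison on every read.
    if pos >= len(query):
        return ""
    return query[pos]


def next_with_try(pos: int) -> str:
    # EAFP: no overhead in the common in-range case.
    try:
        return query[pos]
    except IndexError:
        return ""


# Compare both strategies in-range (common) and past-the-end (rare).
check_in = timeit.timeit(lambda: next_with_check(5), number=1_000_000)
check_out = timeit.timeit(lambda: next_with_check(999), number=1_000_000)
try_in = timeit.timeit(lambda: next_with_try(5), number=1_000_000)
try_out = timeit.timeit(lambda: next_with_try(999), number=1_000_000)
print(f"check: {check_in:.3f}s / {check_out:.3f}s")
print(f"try:   {try_in:.3f}s / {try_out:.3f}s")
```

The usual outcome on CPython: try/except wins while no exception is raised and loses badly when one is — a good fit for a lexer, where a read past the end of the query happens at most once per run. Patch 3 below adopts exactly that trade in `next()` and `peek()`.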
From 5614806dd6cf28029ecd894250aafd317d958ded Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 13:44:50 +0000
Subject: [PATCH 3/3] Tidy lexer and fix error tokens

---
 jsonpath_rfc9535/lex.py    | 131 +++++++++++++------------------------
 jsonpath_rfc9535/tokens.py |   6 +-
 pyproject.toml             |   3 +-
 tests/test_lex.py          |  19 ++++--
 4 files changed, 66 insertions(+), 93 deletions(-)

diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
index 8166abb..04e508a 100644
--- a/jsonpath_rfc9535/lex.py
+++ b/jsonpath_rfc9535/lex.py
@@ -17,16 +17,11 @@
 RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
-RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
+RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
+# RE_FLOAT includes numbers with a negative exponent and no decimal point.
+RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
-RE_AND = re.compile(r"&&")
-RE_OR = re.compile(r"\|\|")
-RE_TRUE = re.compile(r"true")
-RE_FALSE = re.compile(r"false")
-RE_NULL = re.compile(r"null")
-RE_ESCAPE = re.compile(r"\\[bfnrtu/]")
+ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
 
 
 class Lexer:
@@ -77,14 +72,13 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
-        # TODO: benchmark try/except approach
-        if self.pos >= len(self.query):
+        try:
+            c = self.query[self.pos]
+            self.pos += 1
+            return c
+        except IndexError:
             return ""
 
-        c = self.query[self.pos]
-        self.pos += 1
-        return c
-
     def ignore(self) -> None:
         """Ignore characters up to the pointer."""
         self.start = self.pos
@@ -101,15 +95,13 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
-        # TODO: benchmark try/except without self.next()
-        c = self.next()
-        if c:
-            self.backup()
-        return c
+        try:
+            return self.query[self.pos]
+        except IndexError:
+            return ""
 
     def accept(self, s: str) -> bool:
         """Increment the pointer if the current position starts with _s_."""
-        # TODO: benchmark using accept instead of accept_match for known words
         if self.query.startswith(s, self.pos):
             self.pos += len(s)
             return True
@@ -141,20 +133,25 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
-        # TODO: move msg out of Token.value. We'll need the value too when implementing
-        # better error messages.
-        self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
+        self.tokens.append(
+            Token(
+                TokenType.ERROR,
+                self.query[self.start : self.pos],
+                self.start,
+                self.query,
+                msg,
+            )
+        )
 
 
 StateFn = Callable[[Lexer], Optional["StateFn"]]
 
 
 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
-    # TODO: benchmark peek/next instead of next/backup
     c = l.next()
 
     if c != "$":
-        l.backup()
         l.error(f"expected '$', found {c!r}")
         return None
 
@@ -184,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0911
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
-    l.backup()
     if l.filter_depth:
+        l.backup()
         return lex_inside_filter
 
     l.error(f"expected '.', '..' or a bracketed selection, found {c!r}")
@@ -208,13 +204,13 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
     l.backup()
 
     if l.accept_match(RE_PROPERTY):
         l.emit(TokenType.PROPERTY)
         return lex_segment
 
+    l.next()
     l.error(f"unexpected descendant selection token {c!r}")
     return None
 
@@ -222,7 +218,7 @@ def lex_shorthand_selector(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     l.ignore()  # ignore dot
 
-    if l.ignore_whitespace():
+    if l.accept_match(RE_WHITESPACE):
         l.error("unexpected whitespace after dot")
         return None
 
@@ -322,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             return lex_inside_bracketed_segment
 
         if c == "'":
-            # String literal
             return lex_single_quoted_string_inside_filter_expression
 
         if c == '"':
-            # String literal
             return lex_double_quoted_string_inside_filter_expression
 
         if c == "(":
@@ -392,62 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             l.emit(TokenType.GT)
             continue
 
-        # default
         l.backup()
 
-        # numbers
-        # TODO: try accept_match(RE_FLOAT), including negative exponent
-        if l.accept_match(RE_INT):
-            if l.peek() == ".":
-                # A float
-                l.next()
-                if not l.accept_match(RE_INT):
-                    l.error("a fractional digit is required after a decimal point")
-                    return None
-
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.FLOAT)
-                continue
-
-            # An int, or float if exponent is negative
-            if l.accept_match(RE_NEGATIVE_EXPONENT):
-                l.emit(TokenType.FLOAT)
-            else:
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.INT)
-            continue
-
-        if l.accept_match(RE_AND):
+        if l.accept("&&"):
             l.emit(TokenType.AND)
-            continue
-
-        if l.accept_match(RE_OR):
+        elif l.accept("||"):
             l.emit(TokenType.OR)
-            continue
-
-        if l.accept_match(RE_TRUE):
+        elif l.accept("true"):
             l.emit(TokenType.TRUE)
-            continue
-
-        if l.accept_match(RE_FALSE):
+        elif l.accept("false"):
             l.emit(TokenType.FALSE)
-            continue
-
-        if l.accept_match(RE_NULL):
+        elif l.accept("null"):
             l.emit(TokenType.NULL)
-            continue
-
-        # functions
-        if l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
+        elif l.accept_match(RE_FLOAT):
+            l.emit(TokenType.FLOAT)
+        elif l.accept_match(RE_INT):
+            l.emit(TokenType.INT)
+        elif l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
             # Keep track of parentheses for this function call.
             l.paren_stack.append(1)
             l.emit(TokenType.FUNCTION)
             l.next()
             l.ignore()  # ignore LPAREN
-            continue
-
-        l.error(f"unexpected filter selector token {c!r}")
-        return None
+        else:
+            l.error(f"unexpected filter selector token {c!r}")
+            return None
 
 
 def lex_string_factory(quote: str, state: StateFn) -> StateFn:
@@ -472,17 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             return state
 
         while True:
-            head = l.query[l.pos : l.pos + 2]
             c = l.next()
 
-            if head in ("\\\\", f"\\{quote}"):
-                l.next()
-                continue
-
-            # TODO: replace use of `head` with peek
-            if c == "\\" and not RE_ESCAPE.match(head):
-                l.error("invalid escape")
-                return None
+            if c == "\\":
+                peeked = l.peek()
+                if peeked in ESCAPES or peeked == quote:
+                    l.next()
+                else:
+                    l.error("invalid escape")
+                    return None
 
             if not c:
                 l.error(f"unclosed string starting at index {l.start}")
@@ -528,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
     lexer.run()
 
     if tokens and tokens[-1].type_ == TokenType.ERROR:
-        raise JSONPathSyntaxError(tokens[-1].value, token=tokens[-1])
+        raise JSONPathSyntaxError(tokens[-1].message, token=tokens[-1])
 
     return tokens
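Two details of the rewritten `lex_inside_filter` are easy to miss. Keyword matching now uses the `startswith`-based `accept` instead of single-purpose regexes, and number matching collapses into two `elif` arms where `RE_FLOAT` must be tried before `RE_INT` — otherwise `RE_INT` would consume the integer prefix of `1.5`, or all of `2e-3` up to its negative exponent, and leave the rest unlexed. A small standalone check, using the regexes from the patch (with the non-capturing group spelled `(?:`), demonstrates the ordering:

```python
import re

# Regexes as defined at the top of the patched lex.py.
RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

for literal in ["1.5", "2e-3", "42", "-1e2"]:
    # Mirror the elif chain: try FLOAT first, then fall back to INT.
    if RE_FLOAT.match(literal):
        kind = "FLOAT"
    elif RE_INT.match(literal):
        kind = "INT"
    else:
        kind = "?"
    print(f"{literal!r} -> {kind}")
# '1.5' -> FLOAT, '2e-3' -> FLOAT (negative exponent), '42' -> INT, '-1e2' -> INT
```

Note how `2e-3` is a FLOAT per the comment in the patch: RFC 9535 treats a negative exponent as producing a non-integer value even without a decimal point, while `-1e2` stays an INT.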
diff --git a/jsonpath_rfc9535/tokens.py b/jsonpath_rfc9535/tokens.py
index 93d6674..47c7b20 100644
--- a/jsonpath_rfc9535/tokens.py
+++ b/jsonpath_rfc9535/tokens.py
@@ -67,7 +67,7 @@ class Token:
         token derives.
     """
 
-    __slots__ = ("type_", "value", "index", "query")
+    __slots__ = ("type_", "value", "index", "query", "message")
 
     def __init__(
         self,
@@ -75,16 +75,18 @@ def __init__(
         value: str,
         index: int,
         query: str,
+        message: str | None = None,
     ) -> None:
         self.type_ = type_
         self.value = value
         self.index = index
         self.query = query
+        self.message = message
 
     def __repr__(self) -> str:  # pragma: no cover
         return (
             f"Token(type={self.type_.name!r}, value={self.value!r}, "
-            f"index={self.index}, query={self.query!r})"
+            f"index={self.index}, query={self.query!r}, message={self.message!r})"
         )
 
     def __eq__(self, other: object) -> bool:
diff --git a/pyproject.toml b/pyproject.toml
index d6eb406..8bfbbac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,9 +64,10 @@ no-cov = "cov --no-cov {args}"
 test = "pytest {args}"
 lint = "ruff check ."
 typing = "mypy"
+benchmark = "python scripts/benchmark.py"
 
 [[tool.hatch.envs.all.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.10"]
 
 [tool.coverage.run]
 branch = true
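With the new `message` slot, an error token keeps the offending lexeme in `value` and the human-readable diagnostic in `message`, so an error reporter can point at the exact source position. A sketch of the kind of rendering this enables — `render_error` is a hypothetical helper, and the import assumes `Token` and `TokenType` both live in `jsonpath_rfc9535.tokens`, as the diff suggests:

```python
from jsonpath_rfc9535.tokens import Token, TokenType


def render_error(token: Token) -> str:
    # Hypothetical: caret points at token.index within the original query.
    pointer = " " * token.index + "^"
    return f"{token.message}\n{token.query}\n{pointer}"


tok = Token(TokenType.ERROR, "f", 0, "foo.bar", "expected '$', found 'f'")
print(render_error(tok))
# expected '$', found 'f'
# foo.bar
# ^
```

The updated test cases below assert exactly this value/index/message split.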
foo.bar", + "unexpected whitespace after dot", + ), ], ), Case( @@ -129,9 +136,10 @@ class Case: Token(TokenType.DOUBLE_DOT, "..", 1, "$...foo"), Token( TokenType.ERROR, - "unexpected descendant selection token '.'", + ".", 3, "$...foo", + "unexpected descendant selection token '.'", ), ], ), @@ -143,9 +151,10 @@ class Case: Token(TokenType.DOUBLE_DOT, "..", 1, "$....foo"), Token( TokenType.ERROR, - "unexpected descendant selection token '.'", + ".", 3, "$....foo", + "unexpected descendant selection token '.'", ), ], ),