From b8409543eb4b3451a09e311f52fd58180bf81969 Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 07:34:21 +0000
Subject: [PATCH 1/3] Improve filter selector class and parse function names

---
 jsonpath_rfc9535/parse.py     | 23 +++++++++++------------
 jsonpath_rfc9535/query.py     |  4 ++--
 jsonpath_rfc9535/selectors.py |  4 ++--
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/jsonpath_rfc9535/parse.py b/jsonpath_rfc9535/parse.py
index e722bc1..15b3da7 100644
--- a/jsonpath_rfc9535/parse.py
+++ b/jsonpath_rfc9535/parse.py
@@ -34,7 +34,7 @@ from .segments import JSONPathChildSegment
 from .segments import JSONPathRecursiveDescentSegment
 from .segments import JSONPathSegment
-from .selectors import Filter
+from .selectors import FilterSelector
 from .selectors import IndexSelector
 from .selectors import JSONPathSelector
 from .selectors import NameSelector
@@ -113,9 +113,6 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
             TokenType.TRUE: self.parse_boolean,
         }
 
-        # TODO: can a function argument be a grouped expression?
-        # TODO: can a function argument contain a !?
-
         self.function_argument_map: Dict[
             TokenType, Callable[[TokenStream], Expression]
         ] = {
@@ -291,7 +288,7 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
                 )
             )
         elif stream.current.type_ == TokenType.FILTER:
-            selectors.append(self.parse_filter(stream))
+            selectors.append(self.parse_filter_selector(stream))
         elif stream.current.type_ == TokenType.EOF:
             raise JSONPathSyntaxError(
                 "unexpected end of query", token=stream.current
             )
@@ -320,9 +317,9 @@ def parse_bracketed_selection(self, stream: TokenStream) -> List[JSONPathSelecto
 
         return selectors
 
-    def parse_filter(self, stream: TokenStream) -> Filter:
+    def parse_filter_selector(self, stream: TokenStream) -> FilterSelector:
         tok = stream.next_token()
-        expr = self.parse_filter_selector(stream)
+        expr = self.parse_filter_expression(stream)
 
         if isinstance(expr, FunctionExtension):
             func = self.env.function_extensions.get(expr.name)
@@ -342,7 +339,7 @@
                     token=expr.token,
                 )
 
-        return Filter(
+        return FilterSelector(
             env=self.env,
             token=tok,
             expression=FilterExpression(token=expr.token, expression=expr),
         )
@@ -392,7 +389,9 @@ def parse_prefix_expression(self, stream: TokenStream) -> Expression:
         return PrefixExpression(
             tok,
             operator="!",
-            right=self.parse_filter_selector(stream, precedence=self.PRECEDENCE_PREFIX),
+            right=self.parse_filter_expression(
+                stream, precedence=self.PRECEDENCE_PREFIX
+            ),
         )
 
     def parse_infix_expression(
@@ -400,7 +399,7 @@
     ) -> Expression:
         tok = stream.next_token()
         precedence = self.PRECEDENCES.get(tok.type_, self.PRECEDENCE_LOWEST)
-        right = self.parse_filter_selector(stream, precedence)
+        right = self.parse_filter_expression(stream, precedence)
         operator = self.BINARY_OPERATORS[tok.type_]
 
         if operator in self.COMPARISON_OPERATORS:
@@ -425,7 +424,7 @@
 
     def parse_grouped_expression(self, stream: TokenStream) -> Expression:
         stream.next_token()
-        expr = self.parse_filter_selector(stream)
+        expr = self.parse_filter_expression(stream)
         stream.next_token()
 
         while stream.current.type_ != TokenType.RPAREN:
@@ -497,7 +496,7 @@ def parse_function_extension(self, stream: TokenStream) -> Expression:
             ),
         )
 
-    def parse_filter_selector(
+    def parse_filter_expression(
         self, stream: TokenStream, precedence: int = PRECEDENCE_LOWEST
     ) -> Expression:
         try:
diff --git a/jsonpath_rfc9535/query.py b/jsonpath_rfc9535/query.py
index 35c6ea2..38ac4ce 100644
--- a/jsonpath_rfc9535/query.py
+++ b/jsonpath_rfc9535/query.py
@@ -1,4 +1,4 @@
-"""A compiled JSONPath ready to be applied to a JSON-like value."""
+"""A compiled JSONPath expression ready to be applied to JSON-like data."""
 
 from __future__ import annotations
 
@@ -20,7 +20,7 @@
 
 
 class JSONPathQuery:
-    """A compiled JSONPath expression ready to be applied to a JSON-like value.
+    """A compiled JSONPath expression ready to be applied to JSON-like data.
 
     Arguments:
         env: The `JSONPathEnvironment` this query is bound to.
diff --git a/jsonpath_rfc9535/selectors.py b/jsonpath_rfc9535/selectors.py
index 184ef50..c365402 100644
--- a/jsonpath_rfc9535/selectors.py
+++ b/jsonpath_rfc9535/selectors.py
@@ -213,7 +213,7 @@ def resolve(self, node: JSONPathNode) -> Iterable[JSONPathNode]:
             yield node.new_child(element, i)
 
 
-class Filter(JSONPathSelector):
+class FilterSelector(JSONPathSelector):
     """Filter array/list items or dict/object values with a filter expression."""
 
     __slots__ = ("expression",)
@@ -233,7 +233,7 @@ def __str__(self) -> str:
 
     def __eq__(self, __value: object) -> bool:
         return (
-            isinstance(__value, Filter)
+            isinstance(__value, FilterSelector)
             and self.expression == __value.expression
             and self.token == __value.token
        )
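The renames in this first patch separate two ideas the old names conflated: `parse_filter_selector` now builds the `FilterSelector` AST node, while `parse_filter_expression` is the precedence-climbing routine it delegates to (and which prefix, infix, and grouped expressions recurse into). The following sketch is a simplified, self-contained illustration of that split, with hypothetical token shapes and operator tables — it is not the library's actual API:

```python
from dataclasses import dataclass
from typing import List, Tuple, Union

Expr = Union["Infix", str]


@dataclass
class Infix:
    left: Expr
    op: str
    right: Expr


PRECEDENCES = {"||": 1, "&&": 2, "==": 3}
LOWEST = 0


def parse_filter_expression(
    tokens: List[str], pos: int = 0, precedence: int = LOWEST
) -> Tuple[Expr, int]:
    # Precedence climbing: parse one operand, then fold in any infix
    # operators that bind more tightly than `precedence`.
    left: Expr = tokens[pos]
    pos += 1
    while pos < len(tokens) and PRECEDENCES.get(tokens[pos], LOWEST) > precedence:
        op = tokens[pos]
        right, pos = parse_filter_expression(tokens, pos + 1, PRECEDENCES[op])
        left = Infix(left, op, right)
    return left, pos


@dataclass
class FilterSelector:
    # The selector owns a parsed expression; it is not itself an expression.
    expression: Expr


expr, _ = parse_filter_expression(["@.a", "==", "1", "||", "@.b"])
print(FilterSelector(expression=expr))
```

Here `==` groups before `||` because the recursive call consumes every operator of higher precedence before control returns to the outer loop — the same shape the renamed `parse_filter_expression` has in `parse.py`.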
From d5bf0934b62c54535b55145abecca9df0d60a2c4 Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 07:55:12 +0000
Subject: [PATCH 2/3] Lexer todos

---
 jsonpath_rfc9535/lex.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
index ba081be..8166abb 100644
--- a/jsonpath_rfc9535/lex.py
+++ b/jsonpath_rfc9535/lex.py
@@ -77,6 +77,7 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
+        # TODO: benchmark try/except approach
         if self.pos >= len(self.query):
             return ""
 
@@ -100,18 +101,18 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
+        # TODO: benchmark try/except without self.next()
         c = self.next()
         if c:
             self.backup()
         return c
 
-    def accept(self, pattern: Pattern[str]) -> bool:
-        """Increment the pointer if the current character matches _pattern_."""
-        c = self.next()
-        if pattern.match(c):
+    def accept(self, s: str) -> bool:
+        """Increment the pointer if the current position starts with _s_."""
+        # TODO: benchmark using accept instead of accept_match for known words
+        if self.query.startswith(s, self.pos):
+            self.pos += len(s)
             return True
-        if c:
-            self.backup()
         return False
 
     def accept_match(self, pattern: Pattern[str]) -> bool:
@@ -140,6 +141,8 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
+        # TODO: move msg out of Token.value. We'll need the value too when implementing
+        # better error messages.
         self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
 
 
@@ -147,6 +150,7 @@ def error(self, msg: str) -> None:
 
 
 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
+    # TODO: benchmark peek/next instead of next/backup
     c = l.next()
 
     if c != "$":
@@ -392,6 +396,7 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
         l.backup()
 
         # numbers
+        # TODO: try accept_match(RE_FLOAT), including negative exponent
        if l.accept_match(RE_INT):
            if l.peek() == ".":
                # A float
@@ -474,6 +479,7 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             l.next()
             continue
 
+        # TODO: replace use of `head` with peek
         if c == "\\" and not RE_ESCAPE.match(head):
             l.error("invalid escape")
             return None
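The TODOs in this patch all circle one question: for single-character access, is EAFP (`try`/`except IndexError`) faster than an explicit bounds check? A quick microbenchmark sketch — standard library only, with illustrative function names that are not part of the library — is enough to test the hypothesis before committing to either style:

```python
import timeit

query = "$.store.book[?@.price < 10].title"


def next_with_check(pos: int) -> str:
    # LBYL: a len() call and a comparison on every read.
    if pos >= len(query):
        return ""
    return query[pos]


def next_with_try(pos: int) -> str:
    # EAFP: no overhead in the common in-range case.
    try:
        return query[pos]
    except IndexError:
        return ""


# Compare both strategies in-range (common) and past-the-end (rare).
check_in = timeit.timeit(lambda: next_with_check(5), number=1_000_000)
check_out = timeit.timeit(lambda: next_with_check(999), number=1_000_000)
try_in = timeit.timeit(lambda: next_with_try(5), number=1_000_000)
try_out = timeit.timeit(lambda: next_with_try(999), number=1_000_000)
print(f"check: {check_in:.3f}s / {check_out:.3f}s")
print(f"try:   {try_in:.3f}s / {try_out:.3f}s")
```

The usual outcome on CPython: try/except wins while no exception is raised and loses badly when one is — a good fit for a lexer, where a read past the end of the query happens at most once per run. Patch 3 below adopts exactly that trade in `next()` and `peek()`.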
From 5614806dd6cf28029ecd894250aafd317d958ded Mon Sep 17 00:00:00 2001
From: James Prior
Date: Sun, 27 Oct 2024 13:44:50 +0000
Subject: [PATCH 3/3] Tidy lexer and fix error tokens

---
 jsonpath_rfc9535/lex.py    | 131 +++++++++++++------------------------
 jsonpath_rfc9535/tokens.py |   6 +-
 pyproject.toml             |   3 +-
 tests/test_lex.py          |  19 ++++--
 4 files changed, 66 insertions(+), 93 deletions(-)

diff --git a/jsonpath_rfc9535/lex.py b/jsonpath_rfc9535/lex.py
index 8166abb..04e508a 100644
--- a/jsonpath_rfc9535/lex.py
+++ b/jsonpath_rfc9535/lex.py
@@ -17,16 +17,11 @@
 RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
-RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
+RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
+# RE_FLOAT includes numbers with a negative exponent and no decimal point.
+RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
-RE_AND = re.compile(r"&&")
-RE_OR = re.compile(r"\|\|")
-RE_TRUE = re.compile(r"true")
-RE_FALSE = re.compile(r"false")
-RE_NULL = re.compile(r"null")
-RE_ESCAPE = re.compile(r"\\[bfnrtu/]")
+ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
 
 
 class Lexer:
@@ -77,14 +72,13 @@ def emit(self, t: TokenType) -> None:
 
     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
-        # TODO: benchmark try/except approach
-        if self.pos >= len(self.query):
+        try:
+            c = self.query[self.pos]
+            self.pos += 1
+            return c
+        except IndexError:
             return ""
 
-        c = self.query[self.pos]
-        self.pos += 1
-        return c
-
     def ignore(self) -> None:
         """Ignore characters up to the pointer."""
         self.start = self.pos
@@ -101,15 +95,13 @@ def backup(self) -> None:
 
     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
-        # TODO: benchmark try/except without self.next()
-        c = self.next()
-        if c:
-            self.backup()
-        return c
+        try:
+            return self.query[self.pos]
+        except IndexError:
+            return ""
 
     def accept(self, s: str) -> bool:
         """Increment the pointer if the current position starts with _s_."""
-        # TODO: benchmark using accept instead of accept_match for known words
         if self.query.startswith(s, self.pos):
             self.pos += len(s)
             return True
@@ -141,20 +133,25 @@ def ignore_whitespace(self) -> bool:
 
     def error(self, msg: str) -> None:
         """Emit an error token."""
-        # TODO: move msg out of Token.value. We'll need the value too when implementing
-        # better error messages.
-        self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
+        self.tokens.append(
+            Token(
+                TokenType.ERROR,
+                self.query[self.start : self.pos],
+                self.start,
+                self.query,
+                msg,
+            )
+        )
 
 
 StateFn = Callable[[Lexer], Optional["StateFn"]]
 
 
 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
-    # TODO: benchmark peek/next instead of next/backup
     c = l.next()
 
     if c != "$":
-        l.backup()
         l.error(f"expected '$', found {c!r}")
         return None
 
@@ -184,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0911
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
-    l.backup()
     if l.filter_depth:
+        l.backup()
         return lex_inside_filter
 
     l.error(f"expected '.', '..' or a bracketed selection, found {c!r}")
@@ -208,13 +204,13 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment
 
-    # default
     l.backup()
 
     if l.accept_match(RE_PROPERTY):
         l.emit(TokenType.PROPERTY)
         return lex_segment
 
+    l.next()
     l.error(f"unexpected descendant selection token {c!r}")
     return None
 
@@ -222,7 +218,7 @@ def lex_shorthand_selector(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     l.ignore()  # ignore dot
 
-    if l.ignore_whitespace():
+    if l.accept_match(RE_WHITESPACE):
         l.error("unexpected whitespace after dot")
         return None
 
@@ -322,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             return lex_inside_bracketed_segment
 
         if c == "'":
-            # String literal
             return lex_single_quoted_string_inside_filter_expression
 
         if c == '"':
-            # String literal
             return lex_double_quoted_string_inside_filter_expression
 
         if c == "(":
@@ -392,62 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             l.emit(TokenType.GT)
             continue
 
-        # default
         l.backup()
 
-        # numbers
-        # TODO: try accept_match(RE_FLOAT), including negative exponent
-        if l.accept_match(RE_INT):
-            if l.peek() == ".":
-                # A float
-                l.next()
-                if not l.accept_match(RE_INT):
-                    l.error("a fractional digit is required after a decimal point")
-                    return None
-
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.FLOAT)
-                continue
-
-            # An int, or float if exponent is negative
-            if l.accept_match(RE_NEGATIVE_EXPONENT):
-                l.emit(TokenType.FLOAT)
-            else:
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.INT)
-            continue
-
-        if l.accept_match(RE_AND):
+        if l.accept("&&"):
             l.emit(TokenType.AND)
-            continue
-
-        if l.accept_match(RE_OR):
+        elif l.accept("||"):
             l.emit(TokenType.OR)
-            continue
-
-        if l.accept_match(RE_TRUE):
+        elif l.accept("true"):
             l.emit(TokenType.TRUE)
-            continue
-
-        if l.accept_match(RE_FALSE):
+        elif l.accept("false"):
             l.emit(TokenType.FALSE)
-            continue
-
-        if l.accept_match(RE_NULL):
+        elif l.accept("null"):
             l.emit(TokenType.NULL)
-            continue
-
-        # functions
-        if l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
+        elif l.accept_match(RE_FLOAT):
+            l.emit(TokenType.FLOAT)
+        elif l.accept_match(RE_INT):
+            l.emit(TokenType.INT)
+        elif l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
             # Keep track of parentheses for this function call.
             l.paren_stack.append(1)
             l.emit(TokenType.FUNCTION)
             l.next()
             l.ignore()  # ignore LPAREN
-            continue
-
-        l.error(f"unexpected filter selector token {c!r}")
-        return None
+        else:
+            l.error(f"unexpected filter selector token {c!r}")
+            return None
 
 
 def lex_string_factory(quote: str, state: StateFn) -> StateFn:
@@ -472,17 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             return state
 
         while True:
-            head = l.query[l.pos : l.pos + 2]
             c = l.next()
 
-            if head in ("\\\\", f"\\{quote}"):
-                l.next()
-                continue
-
-            # TODO: replace use of `head` with peek
-            if c == "\\" and not RE_ESCAPE.match(head):
-                l.error("invalid escape")
-                return None
+            if c == "\\":
+                peeked = l.peek()
+                if peeked in ESCAPES or peeked == quote:
+                    l.next()
+                else:
+                    l.error("invalid escape")
+                    return None
 
             if not c:
                 l.error(f"unclosed string starting at index {l.start}")
@@ -528,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
     lexer.run()
 
     if tokens and tokens[-1].type_ == TokenType.ERROR:
-        raise JSONPathSyntaxError(tokens[-1].value, token=tokens[-1])
+        raise JSONPathSyntaxError(tokens[-1].message, token=tokens[-1])
 
     return tokens
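Two details of the rewritten `lex_inside_filter` are easy to miss. Keyword matching now uses the `startswith`-based `accept` instead of single-purpose regexes, and number matching collapses into two `elif` arms where `RE_FLOAT` must be tried before `RE_INT` — otherwise `RE_INT` would consume the integer prefix of `1.5`, or all of `2e-3` up to its negative exponent, and leave the rest unlexed. A small standalone check, using the regexes from the patch (with the non-capturing group spelled `(?:`), demonstrates the ordering:

```python
import re

# Regexes as defined at the top of the patched lex.py.
RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

for literal in ["1.5", "2e-3", "42", "-1e2"]:
    # Mirror the elif chain: try FLOAT first, then fall back to INT.
    if RE_FLOAT.match(literal):
        kind = "FLOAT"
    elif RE_INT.match(literal):
        kind = "INT"
    else:
        kind = "?"
    print(f"{literal!r} -> {kind}")
# '1.5' -> FLOAT, '2e-3' -> FLOAT (negative exponent), '42' -> INT, '-1e2' -> INT
```

Note how `2e-3` is a FLOAT per the comment in the patch: RFC 9535 treats a negative exponent as producing a non-integer value even without a decimal point, while `-1e2` stays an INT.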
diff --git a/jsonpath_rfc9535/tokens.py b/jsonpath_rfc9535/tokens.py
index 93d6674..47c7b20 100644
--- a/jsonpath_rfc9535/tokens.py
+++ b/jsonpath_rfc9535/tokens.py
@@ -67,7 +67,7 @@ class Token:
         token derives.
     """
 
-    __slots__ = ("type_", "value", "index", "query")
+    __slots__ = ("type_", "value", "index", "query", "message")
 
     def __init__(
         self,
@@ -75,16 +75,18 @@ def __init__(
         value: str,
         index: int,
         query: str,
+        message: str | None = None,
     ) -> None:
         self.type_ = type_
         self.value = value
         self.index = index
         self.query = query
+        self.message = message
 
     def __repr__(self) -> str:  # pragma: no cover
         return (
             f"Token(type={self.type_.name!r}, value={self.value!r}, "
-            f"index={self.index}, query={self.query!r})"
+            f"index={self.index}, query={self.query!r}, message={self.message!r})"
         )
 
     def __eq__(self, other: object) -> bool:
diff --git a/pyproject.toml b/pyproject.toml
index d6eb406..8bfbbac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,9 +64,10 @@ no-cov = "cov --no-cov {args}"
 test = "pytest {args}"
 lint = "ruff check ."
 typing = "mypy"
+benchmark = "python scripts/benchmark.py"
 
 [[tool.hatch.envs.all.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.10"]
 
 [tool.coverage.run]
 branch = true
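With the new `message` slot, an error token keeps the offending lexeme in `value` and the human-readable diagnostic in `message`, so an error reporter can point at the exact source position. A sketch of the kind of rendering this enables — `render_error` is a hypothetical helper, and the import assumes `Token` and `TokenType` both live in `jsonpath_rfc9535.tokens`, as the diff suggests:

```python
from jsonpath_rfc9535.tokens import Token, TokenType


def render_error(token: Token) -> str:
    # Hypothetical: caret points at token.index within the original query.
    pointer = " " * token.index + "^"
    return f"{token.message}\n{token.query}\n{pointer}"


tok = Token(TokenType.ERROR, "f", 0, "foo.bar", "expected '$', found 'f'")
print(render_error(tok))
# expected '$', found 'f'
# foo.bar
# ^
```

The updated test cases below assert exactly this value/index/message split.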
foo.bar", + "unexpected whitespace after dot", + ), ], ), Case( @@ -129,9 +136,10 @@ class Case: Token(TokenType.DOUBLE_DOT, "..", 1, "$...foo"), Token( TokenType.ERROR, - "unexpected descendant selection token '.'", + ".", 3, "$...foo", + "unexpected descendant selection token '.'", ), ], ), @@ -143,9 +151,10 @@ class Case: Token(TokenType.DOUBLE_DOT, "..", 1, "$....foo"), Token( TokenType.ERROR, - "unexpected descendant selection token '.'", + ".", 3, "$....foo", + "unexpected descendant selection token '.'", ), ], ),