Fix invalid escapes in string literals
jg-rp committed Sep 28, 2023
1 parent fd696ae commit 822b6b0
Showing 6 changed files with 58 additions and 26 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,15 @@
# Python JSONPath Change Log

+## Version 0.10.0 (unreleased)
+
+**Breaking Changes**
+
+- The JSONPath lexer now yields distinct tokens for single- and double-quoted string literals, so the parser can better detect invalid escape sequences.
+
+**Fixes**
+
+- We no longer silently ignore invalid escape sequences in JSONPath string literals. For example, `$['\"']` used to be accepted; it now raises a `JSONPathSyntaxError`.
+
## Version 0.9.0

**Breaking Changes**
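
A quick illustration of the stricter behavior described above (a minimal sketch; it assumes `compile`, `findall` and `JSONPathSyntaxError` are all importable from the top-level `jsonpath` package):

```python
import jsonpath

try:
    # \" is not a valid escape sequence inside a single-quoted string literal.
    jsonpath.compile("$['\\\"']")
except jsonpath.JSONPathSyntaxError as err:
    print(f"rejected: {err}")

# The same escape is valid inside a double-quoted literal.
print(jsonpath.findall('$["\\""]', {'"': 42}))  # [42]
```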
1 change: 0 additions & 1 deletion docs/syntax.md
@@ -193,7 +193,6 @@ And this is a list of areas where we deviate from the [IETF JSONPath draft](http
- The root token (default `$`) is optional.
- Paths starting with a dot (`.`) are OK. `.thing` is the same as `$.thing`, as is `thing`, `$[thing]` and `$["thing"]`.
- The built-in `match()` and `search()` filter functions use Python's standard library `re` module, which, among other things, doesn't support Unicode properties. We might add an implementation of `match()` and `search()` using the third party [regex](https://pypi.org/project/regex/) package in the future.
-- We silently ignore unnecessary escaping when parsing some quoted selectors. The standard treats this as an "invalid selector".

And this is a list of features that are uncommon or unique to Python JSONPath.

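The relaxed root rule deserves a concrete example. A minimal sketch (assuming the top-level `jsonpath.findall` API):

```python
import jsonpath

data = {"thing": 1}

# With the default environment the root token is optional, so all of
# these spellings select the same value.
for query in ("$.thing", ".thing", "thing", "$[thing]", '$["thing"]'):
    assert jsonpath.findall(query, data) == [1]
```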
5 changes: 2 additions & 3 deletions jsonpath/lex.py
@@ -60,7 +60,6 @@
from .token import TOKEN_SLICE_START
from .token import TOKEN_SLICE_STEP
from .token import TOKEN_SLICE_STOP
-from .token import TOKEN_STRING
from .token import TOKEN_TRUE
from .token import TOKEN_UNDEFINED
from .token import TOKEN_UNION
@@ -256,13 +255,13 @@ def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
                )
            elif kind == TOKEN_DOUBLE_QUOTE_STRING:
                yield _token(
-                    kind=TOKEN_STRING,
+                    kind=TOKEN_DOUBLE_QUOTE_STRING,
                    value=match.group("G_DQUOTE"),
                    index=match.start("G_DQUOTE"),
                )
            elif kind == TOKEN_SINGLE_QUOTE_STRING:
                yield _token(
-                    kind=TOKEN_STRING,
+                    kind=TOKEN_SINGLE_QUOTE_STRING,
                    value=match.group("G_SQUOTE"),
                    index=match.start("G_SQUOTE"),
                )
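After this change, single- and double-quoted literals surface as distinct token kinds. A sketch of the difference (the `Lexer(env=...)` construction is an assumption, mirroring the parser's `__init__` signature below; the package may wire this up differently):

```python
from jsonpath import JSONPathEnvironment
from jsonpath.lex import Lexer
from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING

lexer = Lexer(env=JSONPathEnvironment())  # assumed constructor

kinds = [token.kind for token in lexer.tokenize("$['a']")]
assert TOKEN_SINGLE_QUOTE_STRING in kinds

kinds = [token.kind for token in lexer.tokenize('$["a"]')]
assert TOKEN_DOUBLE_QUOTE_STRING in kinds
```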
36 changes: 24 additions & 12 deletions jsonpath/parse.py
@@ -1,7 +1,7 @@
"""The default JSONPath parser."""
from __future__ import annotations

-import codecs
+import json
import re
from typing import TYPE_CHECKING
from typing import Callable
@@ -45,6 +45,7 @@
from .token import TOKEN_COMMA
from .token import TOKEN_CONTAINS
from .token import TOKEN_DDOT
+from .token import TOKEN_DOUBLE_QUOTE_STRING
from .token import TOKEN_EOF
from .token import TOKEN_EQ
from .token import TOKEN_FALSE
@@ -81,10 +82,10 @@
from .token import TOKEN_ROOT
from .token import TOKEN_RPAREN
from .token import TOKEN_SELF
+from .token import TOKEN_SINGLE_QUOTE_STRING
from .token import TOKEN_SLICE_START
from .token import TOKEN_SLICE_STEP
from .token import TOKEN_SLICE_STOP
-from .token import TOKEN_STRING
from .token import TOKEN_TRUE
from .token import TOKEN_UNDEFINED
from .token import TOKEN_UNION
@@ -212,7 +213,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_ROOT: self.parse_root_path,
            TOKEN_SELF: self.parse_self_path,
            TOKEN_FILTER_CONTEXT: self.parse_filter_context_path,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
            TOKEN_UNDEFINED: self.parse_undefined,
            TOKEN_FUNCTION: self.parse_function_extension,
@@ -225,7 +227,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_NIL: self.parse_nil,
            TOKEN_NONE: self.parse_nil,
            TOKEN_NULL: self.parse_nil,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
        }

@@ -239,7 +242,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_NIL: self.parse_nil,
            TOKEN_NONE: self.parse_nil,
            TOKEN_NULL: self.parse_nil,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
            TOKEN_ROOT: self.parse_root_path,
            TOKEN_SELF: self.parse_self_path,
@@ -384,21 +388,29 @@ def parse_selector_list(self, stream: TokenStream) -> ListSelector: # noqa: PLR
                        token=stream.current,
                    )
                )
-            elif stream.current.kind == TOKEN_STRING:
+            elif stream.current.kind in (
+                TOKEN_DOUBLE_QUOTE_STRING,
+                TOKEN_SINGLE_QUOTE_STRING,
+            ):
                if self.RE_INVALID_NAME_SELECTOR.search(stream.current.value):
                    raise JSONPathSyntaxError(
                        f"invalid name selector {stream.current.value!r}",
                        token=stream.current,
                    )

                if self.env.unicode_escape:
-                    name = (
-                        codecs.decode(
-                            stream.current.value.replace("\\/", "/"), "unicode-escape"
+                    if stream.current.kind == TOKEN_SINGLE_QUOTE_STRING:
+                        value = stream.current.value.replace('"', '\\"').replace(
+                            "\\'", "'"
                        )
-                        .encode("utf-16", "surrogatepass")
-                        .decode("utf-16")
-                    )
+                    else:
+                        value = stream.current.value
+                    try:
+                        name = json.loads(f'"{value}"')
+                    except json.JSONDecodeError as err:
+                        raise JSONPathSyntaxError(
+                            str(err).split(":")[1], token=stream.current
+                        ) from None
                else:
                    name = stream.current.value

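The new strategy normalizes a single-quoted literal to its double-quoted equivalent, then delegates escape decoding to `json.loads`, which rejects invalid sequences for free and keeps the escape rules aligned with RFC 8259 strings. A standalone sketch of the idea (the helper name is hypothetical, not part of the library's API):

```python
import json

def decode_string_literal(value: str, single_quoted: bool) -> str:
    """Decode escape sequences in a JSONPath string literal via json.loads."""
    if single_quoted:
        # In a single-quoted literal, \' is a valid escape and a bare " is
        # legal, so unescape \' and escape " before wrapping in double quotes.
        value = value.replace('"', '\\"').replace("\\'", "'")
    try:
        return json.loads(f'"{value}"')
    except json.JSONDecodeError as err:
        raise ValueError(f"invalid string literal: {err}") from None

assert decode_string_literal("a\\nb", single_quoted=False) == "a\nb"
assert decode_string_literal("don\\'t", single_quoted=True) == "don't"
# decode_string_literal('\\"', single_quoted=True) raises ValueError,
# matching the `$['\"']` example from the CHANGELOG.
```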
4 changes: 0 additions & 4 deletions tests/test_compliance.py
@@ -37,10 +37,6 @@ class Case:
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
"name selector, double quotes, invalid escaped single quote": "ignore",
"name selector, double quotes, incomplete escape": "ignore",
"name selector, single quotes, invalid escaped double quote": "ignore",
"name selector, single quotes, incomplete escape": "ignore",
"filter, non-singular query in comparison, slice": "TODO",
"filter, non-singular query in comparison, all children": "TODO",
"filter, non-singular query in comparison, descendants": "TODO",
28 changes: 22 additions & 6 deletions tests/test_lex.py
@@ -10,6 +10,7 @@
from jsonpath.token import TOKEN_BARE_PROPERTY
from jsonpath.token import TOKEN_COMMA
from jsonpath.token import TOKEN_DDOT
+from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
from jsonpath.token import TOKEN_EQ
from jsonpath.token import TOKEN_FALSE
from jsonpath.token import TOKEN_FILTER_END
@@ -35,10 +36,10 @@
from jsonpath.token import TOKEN_ROOT
from jsonpath.token import TOKEN_RPAREN
from jsonpath.token import TOKEN_SELF
+from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING
from jsonpath.token import TOKEN_SLICE_START
from jsonpath.token import TOKEN_SLICE_STEP
from jsonpath.token import TOKEN_SLICE_STOP
-from jsonpath.token import TOKEN_STRING
from jsonpath.token import TOKEN_TRUE
from jsonpath.token import TOKEN_UNION
from jsonpath.token import TOKEN_WILD
@@ -84,7 +85,9 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path='$["some"]'),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path='$["some"]'),
-            Token(kind=TOKEN_STRING, value="some", index=3, path='$["some"]'),
+            Token(
+                kind=TOKEN_DOUBLE_QUOTE_STRING, value="some", index=3, path='$["some"]'
+            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=8, path='$["some"]'),
        ],
    ),
@@ -94,7 +97,9 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some']"),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some']"),
-            Token(kind=TOKEN_STRING, value="some", index=3, path="$['some']"),
+            Token(
+                kind=TOKEN_SINGLE_QUOTE_STRING, value="some", index=3, path="$['some']"
+            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=8, path="$['some']"),
        ],
    ),
@@ -754,7 +759,10 @@ class Case:
                kind=TOKEN_COMMA, value=",", index=16, path="[?(@.thing in [1, '1'])]"
            ),
            Token(
-                kind=TOKEN_STRING, value="1", index=19, path="[?(@.thing in [1, '1'])]"
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="1",
+                index=19,
+                path="[?(@.thing in [1, '1'])]",
            ),
            Token(
                kind=TOKEN_RBRACKET,
@@ -1010,10 +1018,18 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some', 'thing']"),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some', 'thing']"),
-            Token(kind=TOKEN_STRING, value="some", index=3, path="$['some', 'thing']"),
+            Token(
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="some",
+                index=3,
+                path="$['some', 'thing']",
+            ),
            Token(kind=TOKEN_COMMA, value=",", index=8, path="$['some', 'thing']"),
            Token(
-                kind=TOKEN_STRING, value="thing", index=11, path="$['some', 'thing']"
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="thing",
+                index=11,
+                path="$['some', 'thing']",
            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=17, path="$['some', 'thing']"),
        ],
