Fix invalid escapes in string literals
jg-rp committed Sep 28, 2023
1 parent fd696ae commit 822b6b0
Showing 6 changed files with 58 additions and 26 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,15 @@
# Python JSONPath Change Log

+## Version 0.10.0 (unreleased)
+
+**Breaking Changes**
+
+- The JSONPath lexer now yields distinct tokens for single- and double-quoted string literals, so the parser can better detect invalid escape sequences.
+
+**Fixes**
+
+- We no longer silently ignore invalid escape sequences in JSONPath string literals. For example, `$['\"']` used to be accepted; it now raises a `JSONPathSyntaxError`.
+
## Version 0.9.0

**Breaking Changes**
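
A quick illustration of the stricter behavior described above (a minimal sketch; it assumes `compile`, `findall` and `JSONPathSyntaxError` are all importable from the top-level `jsonpath` package):

```python
import jsonpath

try:
    # \" is not a valid escape sequence inside a single-quoted string literal.
    jsonpath.compile("$['\\\"']")
except jsonpath.JSONPathSyntaxError as err:
    print(f"rejected: {err}")

# The same escape is valid inside a double-quoted literal.
print(jsonpath.findall('$["\\""]', {'"': 42}))  # [42]
```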
1 change: 0 additions & 1 deletion docs/syntax.md
@@ -193,7 +193,6 @@ And this is a list of areas where we deviate from the [IETF JSONPath draft](http
- The root token (default `$`) is optional.
- Paths starting with a dot (`.`) are OK. `.thing` is the same as `$.thing`, as is `thing`, `$[thing]` and `$["thing"]`.
- The built-in `match()` and `search()` filter functions use Python's standard library `re` module, which, among other things, doesn't support Unicode properties. We might add an implementation of `match()` and `search()` using the third party [regex](https://pypi.org/project/regex/) package in the future.
-- We silently ignore unnecessary escaping when parsing some quoted selectors. The standard treats this as an "invalid selector".

And this is a list of features that are uncommon or unique to Python JSONPath.

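The relaxed root rule deserves a concrete example. A minimal sketch (assuming the top-level `jsonpath.findall` API):

```python
import jsonpath

data = {"thing": 1}

# With the default environment the root token is optional, so all of
# these spellings select the same value.
for query in ("$.thing", ".thing", "thing", "$[thing]", '$["thing"]'):
    assert jsonpath.findall(query, data) == [1]
```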
5 changes: 2 additions & 3 deletions jsonpath/lex.py
@@ -60,7 +60,6 @@
from .token import TOKEN_SLICE_START
from .token import TOKEN_SLICE_STEP
from .token import TOKEN_SLICE_STOP
-from .token import TOKEN_STRING
from .token import TOKEN_TRUE
from .token import TOKEN_UNDEFINED
from .token import TOKEN_UNION
@@ -256,13 +255,13 @@ def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
                )
            elif kind == TOKEN_DOUBLE_QUOTE_STRING:
                yield _token(
-                    kind=TOKEN_STRING,
+                    kind=TOKEN_DOUBLE_QUOTE_STRING,
                    value=match.group("G_DQUOTE"),
                    index=match.start("G_DQUOTE"),
                )
            elif kind == TOKEN_SINGLE_QUOTE_STRING:
                yield _token(
-                    kind=TOKEN_STRING,
+                    kind=TOKEN_SINGLE_QUOTE_STRING,
                    value=match.group("G_SQUOTE"),
                    index=match.start("G_SQUOTE"),
                )
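After this change, single- and double-quoted literals surface as distinct token kinds. A sketch of the difference (the `Lexer(env=...)` construction is an assumption, mirroring the parser's `__init__` signature below; the package may wire this up differently):

```python
from jsonpath import JSONPathEnvironment
from jsonpath.lex import Lexer
from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING

lexer = Lexer(env=JSONPathEnvironment())  # assumed constructor

kinds = [token.kind for token in lexer.tokenize("$['a']")]
assert TOKEN_SINGLE_QUOTE_STRING in kinds

kinds = [token.kind for token in lexer.tokenize('$["a"]')]
assert TOKEN_DOUBLE_QUOTE_STRING in kinds
```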
36 changes: 24 additions & 12 deletions jsonpath/parse.py
@@ -1,7 +1,7 @@
"""The default JSONPath parser."""
from __future__ import annotations

-import codecs
+import json
import re
from typing import TYPE_CHECKING
from typing import Callable
@@ -45,6 +45,7 @@
from .token import TOKEN_COMMA
from .token import TOKEN_CONTAINS
from .token import TOKEN_DDOT
+from .token import TOKEN_DOUBLE_QUOTE_STRING
from .token import TOKEN_EOF
from .token import TOKEN_EQ
from .token import TOKEN_FALSE
@@ -81,10 +82,10 @@
from .token import TOKEN_ROOT
from .token import TOKEN_RPAREN
from .token import TOKEN_SELF
+from .token import TOKEN_SINGLE_QUOTE_STRING
from .token import TOKEN_SLICE_START
from .token import TOKEN_SLICE_STEP
from .token import TOKEN_SLICE_STOP
-from .token import TOKEN_STRING
from .token import TOKEN_TRUE
from .token import TOKEN_UNDEFINED
from .token import TOKEN_UNION
@@ -212,7 +213,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_ROOT: self.parse_root_path,
            TOKEN_SELF: self.parse_self_path,
            TOKEN_FILTER_CONTEXT: self.parse_filter_context_path,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
            TOKEN_UNDEFINED: self.parse_undefined,
            TOKEN_FUNCTION: self.parse_function_extension,
@@ -225,7 +227,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_NIL: self.parse_nil,
            TOKEN_NONE: self.parse_nil,
            TOKEN_NULL: self.parse_nil,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
        }

@@ -239,7 +242,8 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
            TOKEN_NIL: self.parse_nil,
            TOKEN_NONE: self.parse_nil,
            TOKEN_NULL: self.parse_nil,
-            TOKEN_STRING: self.parse_string_literal,
+            TOKEN_SINGLE_QUOTE_STRING: self.parse_string_literal,
+            TOKEN_DOUBLE_QUOTE_STRING: self.parse_string_literal,
            TOKEN_TRUE: self.parse_boolean,
            TOKEN_ROOT: self.parse_root_path,
            TOKEN_SELF: self.parse_self_path,
@@ -384,21 +388,29 @@ def parse_selector_list(self, stream: TokenStream) -> ListSelector: # noqa: PLR
                        token=stream.current,
                    )
                )
-            elif stream.current.kind == TOKEN_STRING:
+            elif stream.current.kind in (
+                TOKEN_DOUBLE_QUOTE_STRING,
+                TOKEN_SINGLE_QUOTE_STRING,
+            ):
                if self.RE_INVALID_NAME_SELECTOR.search(stream.current.value):
                    raise JSONPathSyntaxError(
                        f"invalid name selector {stream.current.value!r}",
                        token=stream.current,
                    )

                if self.env.unicode_escape:
-                    name = (
-                        codecs.decode(
-                            stream.current.value.replace("\\/", "/"), "unicode-escape"
+                    if stream.current.kind == TOKEN_SINGLE_QUOTE_STRING:
+                        value = stream.current.value.replace('"', '\\"').replace(
+                            "\\'", "'"
                        )
-                        .encode("utf-16", "surrogatepass")
-                        .decode("utf-16")
-                    )
+                    else:
+                        value = stream.current.value
+                    try:
+                        name = json.loads(f'"{value}"')
+                    except json.JSONDecodeError as err:
+                        raise JSONPathSyntaxError(
+                            str(err).split(":")[1], token=stream.current
+                        ) from None
                else:
                    name = stream.current.value

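The new strategy normalizes a single-quoted literal to its double-quoted equivalent, then delegates escape decoding to `json.loads`, which rejects invalid sequences for free and keeps the escape rules aligned with RFC 8259 strings. A standalone sketch of the idea (the helper name is hypothetical, not part of the library's API):

```python
import json

def decode_string_literal(value: str, single_quoted: bool) -> str:
    """Decode escape sequences in a JSONPath string literal via json.loads."""
    if single_quoted:
        # In a single-quoted literal, \' is a valid escape and a bare " is
        # legal, so unescape \' and escape " before wrapping in double quotes.
        value = value.replace('"', '\\"').replace("\\'", "'")
    try:
        return json.loads(f'"{value}"')
    except json.JSONDecodeError as err:
        raise ValueError(f"invalid string literal: {err}") from None

assert decode_string_literal("a\\nb", single_quoted=False) == "a\nb"
assert decode_string_literal("don\\'t", single_quoted=True) == "don't"
# decode_string_literal('\\"', single_quoted=True) raises ValueError,
# matching the `$['\"']` example from the CHANGELOG.
```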
4 changes: 0 additions & 4 deletions tests/test_compliance.py
@@ -37,10 +37,6 @@ class Case:
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
"name selector, double quotes, invalid escaped single quote": "ignore",
"name selector, double quotes, incomplete escape": "ignore",
"name selector, single quotes, invalid escaped double quote": "ignore",
"name selector, single quotes, incomplete escape": "ignore",
"filter, non-singular query in comparison, slice": "TODO",
"filter, non-singular query in comparison, all children": "TODO",
"filter, non-singular query in comparison, descendants": "TODO",
28 changes: 22 additions & 6 deletions tests/test_lex.py
@@ -10,6 +10,7 @@
from jsonpath.token import TOKEN_BARE_PROPERTY
from jsonpath.token import TOKEN_COMMA
from jsonpath.token import TOKEN_DDOT
+from jsonpath.token import TOKEN_DOUBLE_QUOTE_STRING
from jsonpath.token import TOKEN_EQ
from jsonpath.token import TOKEN_FALSE
from jsonpath.token import TOKEN_FILTER_END
@@ -35,10 +36,10 @@
from jsonpath.token import TOKEN_ROOT
from jsonpath.token import TOKEN_RPAREN
from jsonpath.token import TOKEN_SELF
+from jsonpath.token import TOKEN_SINGLE_QUOTE_STRING
from jsonpath.token import TOKEN_SLICE_START
from jsonpath.token import TOKEN_SLICE_STEP
from jsonpath.token import TOKEN_SLICE_STOP
-from jsonpath.token import TOKEN_STRING
from jsonpath.token import TOKEN_TRUE
from jsonpath.token import TOKEN_UNION
from jsonpath.token import TOKEN_WILD
@@ -84,7 +85,9 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path='$["some"]'),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path='$["some"]'),
-            Token(kind=TOKEN_STRING, value="some", index=3, path='$["some"]'),
+            Token(
+                kind=TOKEN_DOUBLE_QUOTE_STRING, value="some", index=3, path='$["some"]'
+            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=8, path='$["some"]'),
        ],
    ),
@@ -94,7 +97,9 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some']"),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some']"),
-            Token(kind=TOKEN_STRING, value="some", index=3, path="$['some']"),
+            Token(
+                kind=TOKEN_SINGLE_QUOTE_STRING, value="some", index=3, path="$['some']"
+            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=8, path="$['some']"),
        ],
    ),
@@ -754,7 +759,10 @@ class Case:
                kind=TOKEN_COMMA, value=",", index=16, path="[?(@.thing in [1, '1'])]"
            ),
            Token(
-                kind=TOKEN_STRING, value="1", index=19, path="[?(@.thing in [1, '1'])]"
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="1",
+                index=19,
+                path="[?(@.thing in [1, '1'])]",
            ),
            Token(
                kind=TOKEN_RBRACKET,
@@ -1010,10 +1018,18 @@ class Case:
        want=[
            Token(kind=TOKEN_ROOT, value="$", index=0, path="$['some', 'thing']"),
            Token(kind=TOKEN_LIST_START, value="[", index=1, path="$['some', 'thing']"),
-            Token(kind=TOKEN_STRING, value="some", index=3, path="$['some', 'thing']"),
+            Token(
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="some",
+                index=3,
+                path="$['some', 'thing']",
+            ),
            Token(kind=TOKEN_COMMA, value=",", index=8, path="$['some', 'thing']"),
            Token(
-                kind=TOKEN_STRING, value="thing", index=11, path="$['some', 'thing']"
+                kind=TOKEN_SINGLE_QUOTE_STRING,
+                value="thing",
+                index=11,
+                path="$['some', 'thing']",
            ),
            Token(kind=TOKEN_RBRACKET, value="]", index=17, path="$['some', 'thing']"),
        ],
