Commit

Merge branch 'main' into ref-dedup

jbrockmendel committed Aug 20, 2024
2 parents 8fbce0b + 1044cf4 commit b32e828
Showing 9 changed files with 293 additions and 48 deletions.
7 changes: 7 additions & 0 deletions .github/actions/build_pandas/action.yml
@@ -22,6 +22,13 @@ runs:
        fi
      shell: bash -el {0}

    - name: Uninstall nomkl
      run: |
        if conda list nomkl | grep nomkl 1>/dev/null; then
          conda remove nomkl -y
        fi
      shell: bash -el {0}

    - name: Build Pandas
      run: |
        if [[ ${{ inputs.editable }} == "true" ]]; then
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -23,6 +23,7 @@ repos:
    hooks:
    -   id: ruff
        args: [--exit-non-zero-on-fix]
        exclude: ^pandas/tests/frame/test_query_eval.py
    -   id: ruff
        # TODO: remove autofix-only rules when they are checked by ruff
        name: ruff-selected-autofixes
@@ -31,7 +32,7 @@
        exclude: ^pandas/tests
        args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
    -   id: ruff-format
        exclude: ^scripts
        exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
-   repo: https://github.com/jendrikseipp/vulture
    rev: 'v2.11'
    hooks:
@@ -85,6 +86,7 @@ repos:
        types: [text]  # overwrite types: [rst]
        types_or: [python, rst]
    -   id: rst-inline-touching-normal
        exclude: ^pandas/tests/frame/test_query_eval.py
        types: [text]  # overwrite types: [rst]
        types_or: [python, rst]
-   repo: https://github.com/sphinx-contrib/sphinx-lint
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -685,6 +685,7 @@ Other
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when an expression contained backtick-quoted column names with the hash character ``#``, backticks, or characters outside the ASCII range (U+0001..U+007F); see the example below this excerpt. (:issue:`59285`, :issue:`49633`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
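
A sketch of the fixed behavior (assuming a pandas build that includes this change; the frame and column names are illustrative):

import pandas as pd

df = pd.DataFrame({"a#b": [1, 2], "a`b": [3, 4], "a°b": [5, 6]})

df.query("`a#b` > 1")    # '#' no longer terminates the expression as a comment
df.query("`a``b` > 3")   # a literal backtick is escaped by doubling it
df.query("`a°b` > 5")    # non-ASCII characters are escaped internally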
133 changes: 115 additions & 18 deletions pandas/core/computation/parsing.py
@@ -4,6 +4,7 @@

from __future__ import annotations

from enum import Enum
from io import StringIO
from keyword import iskeyword
import token
@@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str:
    ------
    SyntaxError
        If the returned name is not a valid Python identifier, raise an exception.
        This can happen if there is a hashtag in the name, as the tokenizer will
        then terminate and not find the backtick.
        But also for characters that fall outside the range (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Escape characters that fall outside the ASCII range (U+0001..U+007F).
    # GH 49633
    gen = (
        (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
        for c in name
    )
    name = "".join(
        c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
        for c, c_escaped in gen
    )

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters
    # token.tok_name contains a readable description of the replacement string.
@@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str:
"$": "_DOLLARSIGN_",
"€": "_EUROSIGN_",
"°": "_DEGREESIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
"#": "_HASH_",
"`": "_BACKTICK_",
}
)
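
To illustrate the mapping, a sketch against the private helper (exact output spellings are internal details and may change):

from pandas.core.computation.parsing import create_valid_python_identifier

create_valid_python_identifier("total sales")  # space mapped to '_'
create_valid_python_identifier("a#b")          # result contains '_HASH_'
create_valid_python_identifier("a°b")          # result contains '_UNICODE_'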

@@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable:
    which is not caught and propagates to the user level.
    """
    try:
        # Escape backticks
        name = name.replace("`", "``") if isinstance(name, str) else name

        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
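
A sketch of the round trip through this private helper (subject to change):

from pandas.core.computation.parsing import clean_column_name

clean_column_name("price")  # already a valid identifier, returned unchanged
clean_column_name("a`b")    # doubled to "a``b", tokenized as one backtick-quoted
                            # token, then escaped to a valid identifier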
@@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string(
    return BACKTICK_QUOTED_STRING, source[string_start:string_end]


class ParseState(Enum):
    DEFAULT = 0
    IN_BACKTICK = 1
    IN_SINGLE_QUOTE = 2
    IN_DOUBLE_QUOTE = 3


def _split_by_backtick(s: str) -> list[tuple[bool, str]]:
    """
    Splits a str into substrings along backtick characters (`).
    Disregards backticks inside quotes.

    Parameters
    ----------
    s : str
        The Python source code string.

    Returns
    -------
    substrings : list[tuple[bool, str]]
        List of tuples, where each tuple has two elements:
        The first is a boolean indicating if the substring is backtick-quoted.
        The second is the actual substring.
    """
    substrings = []
    substr: list[str] = []  # Will join into a string before adding to `substrings`
    i = 0
    parse_state = ParseState.DEFAULT
    while i < len(s):
        char = s[i]

        match char:
            case "`":
                # start of a backtick-quoted string
                if parse_state == ParseState.DEFAULT:
                    if substr:
                        substrings.append((False, "".join(substr)))

                    substr = [char]
                    i += 1
                    parse_state = ParseState.IN_BACKTICK
                    continue

                elif parse_state == ParseState.IN_BACKTICK:
                    # escaped backtick inside a backtick-quoted string
                    next_char = s[i + 1] if (i != len(s) - 1) else None
                    if next_char == "`":
                        substr.append(char)
                        substr.append(next_char)
                        i += 2
                        continue

                    # end of the backtick-quoted string
                    else:
                        substr.append(char)
                        substrings.append((True, "".join(substr)))

                        substr = []
                        i += 1
                        parse_state = ParseState.DEFAULT
                        continue
            case "'":
                # start of a single-quoted string
                if parse_state == ParseState.DEFAULT:
                    parse_state = ParseState.IN_SINGLE_QUOTE
                # end of a single-quoted string
                elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"):
                    parse_state = ParseState.DEFAULT
            case '"':
                # start of a double-quoted string
                if parse_state == ParseState.DEFAULT:
                    parse_state = ParseState.IN_DOUBLE_QUOTE
                # end of a double-quoted string
                elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"):
                    parse_state = ParseState.DEFAULT
        substr.append(char)
        i += 1

    if substr:
        substrings.append((False, "".join(substr)))

    return substrings
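
For illustration, the splitter keeps backtick-quoted segments intact and ignores backticks inside ordinary quotes (a sketch of internal behavior):

_split_by_backtick("`a b` > 2")
# [(True, '`a b`'), (False, ' > 2')]

_split_by_backtick("x == '`not a column`'")
# [(False, "x == '`not a column`'")]

_split_by_backtick("`a``b` == 1")
# [(True, '`a``b`'), (False, ' == 1')]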


def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
"""
Tokenize a Python source code string.
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
    """
    # GH 59285
    # Escape characters, including backticks
    source = "".join(
        (
            create_valid_python_identifier(substring[1:-1])
            if is_backtick_quoted
            else substring
        )
        for is_backtick_quoted, substring in _split_by_backtick(source)
    )

    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens till a backtick (`) is found.
    # Then, take all tokens till the next backtick to form a backtick quoted string
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval
    for toknum, tokval, _, _, _ in token_generator:
        yield toknum, tokval
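
End to end, a backtick-quoted name is rewritten into a plain identifier before Python's tokenizer ever runs (a sketch; the exact identifier spelling is an internal detail):

from pandas.core.computation.parsing import tokenize_string

for toknum, tokval in tokenize_string("`total sales` > 100"):
    print(toknum, tokval)
# The backtick-quoted segment arrives as a single NAME token
# (e.g. 'BACKTICK_QUOTED_STRING_total_sales'), followed by OP '>' and NUMBER '100'.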
14 changes: 3 additions & 11 deletions pandas/core/frame.py
@@ -4556,17 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
        quoted string are replaced by strings that are allowed as a Python identifier.
        These characters include all operators in Python, the space character, the
        question mark, the exclamation mark, the dollar sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        This excludes whitespace other than the space character,
        but also the hashtag (as it is used for comments) and the backtick
        itself (the backtick also cannot be escaped).
        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.
        A backtick can be escaped by double backticks.
        See also the `Python documentation about lexical analysis
        <https://docs.python.org/3/reference/lexical_analysis.html>`__
@@ -4620,6 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
            raise ValueError(msg)
        kwargs["level"] = kwargs.pop("level", 0) + 1
        kwargs["target"] = None

        res = self.eval(expr, **kwargs)

        try:
(The remaining 4 changed files are not shown here.)
