
Merge branch 'main' into ref-dedup-converters
jbrockmendel committed Aug 20, 2024
2 parents bd4a83b + 1044cf4 commit 412edce
Showing 9 changed files with 293 additions and 48 deletions.
7 changes: 7 additions & 0 deletions .github/actions/build_pandas/action.yml
@@ -22,6 +22,13 @@ runs:
fi
shell: bash -el {0}

- name: Uninstall nomkl
run: |
if conda list nomkl | grep nomkl 1>/dev/null; then
conda remove nomkl -y
fi
shell: bash -el {0}

- name: Build Pandas
run: |
if [[ ${{ inputs.editable }} == "true" ]]; then
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -23,6 +23,7 @@ repos:
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
exclude: ^pandas/tests/frame/test_query_eval.py
- id: ruff
# TODO: remove autofix-only rules when they are checked by ruff
name: ruff-selected-autofixes
@@ -31,7 +32,7 @@
exclude: ^pandas/tests
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- id: ruff-format
exclude: ^scripts
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.11'
hooks:
@@ -85,6 +86,7 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- id: rst-inline-touching-normal
exclude: ^pandas/tests/frame/test_query_eval.py
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -685,6 +685,7 @@ Other
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the ``tan`` function to be used. (:issue:`55091`)
- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
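For illustration, a minimal sketch of the behavior the :issue:`59285` / :issue:`49633` entry above describes (the column names are hypothetical; assumes a pandas build that includes this fix):

    import pandas as pd

    df = pd.DataFrame({"col#1": [3, 4], "tempé": [5, 6]})
    # Previously the '#' terminated tokenization and the non-ASCII name
    # raised; with the fix both backtick-quoted names parse correctly.
    print(df.query("`col#1` > 3 and `tempé` < 7"))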
133 changes: 115 additions & 18 deletions pandas/core/computation/parsing.py
@@ -4,6 +4,7 @@

from __future__ import annotations

from enum import Enum
from io import StringIO
from keyword import iskeyword
import token
@@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str:
------
SyntaxError
If the returned name is not a valid Python identifier, raise an exception.
This can happen if there is a hashtag in the name, as the tokenizer will
then terminate and not find the backtick.
It can also happen for characters that fall outside the ASCII range (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name

# Escape characters that fall outside the ASCII range (U+0001..U+007F).
# GH 49633
gen = (
(c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
for c in name
)
name = "".join(
c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
for c, c_escaped in gen
)

# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
@@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str:
"$": "_DOLLARSIGN_",
"€": "_EUROSIGN_",
"°": "_DEGREESIGN_",
# Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
# Currently not possible. Terminates parser and won't find backtick.
# "#": "_HASH_",
"#": "_HASH_",
"`": "_BACKTICK_",
}
)

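As a standalone sketch of the escaping step added above (the input string is illustrative): each character is encoded with ``backslashreplace``, and the resulting backslash marker is rewritten to ``_UNICODE_`` for a non-ASCII character or ``_BACKSLASH_`` for a literal backslash:

    name = "tempé\\path"  # contains one literal backslash
    gen = (
        (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
        for c in name
    )
    escaped = "".join(
        c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
        for c, c_escaped in gen
    )
    print(escaped)  # temp_UNICODE_xe9_BACKSLASH_path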
@@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable:
which is not caught and propagates to the user level.
"""
try:
# Escape backticks
name = name.replace("`", "``") if isinstance(name, str) else name

tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
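The doubling above mirrors SQL-style quote escaping; a rough sketch of the round trip (the name is hypothetical):

    name = "it`s"
    escaped = name.replace("`", "``")  # "it``s"
    wrapped = f"`{escaped}`"           # tokenizes as one backtick-quoted string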
@@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string(
return BACKTICK_QUOTED_STRING, source[string_start:string_end]


class ParseState(Enum):
DEFAULT = 0
IN_BACKTICK = 1
IN_SINGLE_QUOTE = 2
IN_DOUBLE_QUOTE = 3


def _split_by_backtick(s: str) -> list[tuple[bool, str]]:
"""
Splits a str into substrings along backtick characters (`).
Disregards backticks inside quotes.

Parameters
----------
s : str
    The Python source code string.

Returns
-------
substrings: list[tuple[bool, str]]
    List of tuples, where each tuple has two elements:
    The first is a boolean indicating if the substring is backtick-quoted.
    The second is the actual substring.
"""
substrings = []
substr: list[str] = [] # Will join into a string before adding to `substrings`
i = 0
parse_state = ParseState.DEFAULT
while i < len(s):
char = s[i]

match char:
case "`":
# start of a backtick-quoted string
if parse_state == ParseState.DEFAULT:
if substr:
substrings.append((False, "".join(substr)))

substr = [char]
i += 1
parse_state = ParseState.IN_BACKTICK
continue

elif parse_state == ParseState.IN_BACKTICK:
# escaped backtick inside a backtick-quoted string
next_char = s[i + 1] if (i != len(s) - 1) else None
if next_char == "`":
substr.append(char)
substr.append(next_char)
i += 2
continue

# end of the backtick-quoted string
else:
substr.append(char)
substrings.append((True, "".join(substr)))

substr = []
i += 1
parse_state = ParseState.DEFAULT
continue
case "'":
# start of a single-quoted string
if parse_state == ParseState.DEFAULT:
parse_state = ParseState.IN_SINGLE_QUOTE
# end of a single-quoted string
elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"):
parse_state = ParseState.DEFAULT
case '"':
# start of a double-quoted string
if parse_state == ParseState.DEFAULT:
parse_state = ParseState.IN_DOUBLE_QUOTE
# end of a double-quoted string
elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"):
parse_state = ParseState.DEFAULT
substr.append(char)
i += 1

if substr:
substrings.append((False, "".join(substr)))

return substrings

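A short illustrative trace (the input expression is made up; assumes a pandas build containing this function): backticks inside ordinary quotes are left alone, while backtick-quoted spans are returned with their delimiters and flagged ``True``:

    from pandas.core.computation.parsing import _split_by_backtick

    print(_split_by_backtick("A + `B C` + 'x`y'"))
    # [(False, 'A + '), (True, '`B C`'), (False, " + 'x`y'")]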

def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
"""
Tokenize a Python source code string.
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
"""
# GH 59285
# Escape characters, including backticks
source = "".join(
(
create_valid_python_identifier(substring[1:-1])
if is_backtick_quoted
else substring
)
for is_backtick_quoted, substring in _split_by_backtick(source)
)

line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted string
for toknum, tokval, start, _, _ in token_generator:
if tokval == "`":
try:
yield tokenize_backtick_quoted_string(
token_generator, source, string_start=start[1] + 1
)
except Exception as err:
raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
else:
yield toknum, tokval
for toknum, tokval, _, _, _ in token_generator:
yield toknum, tokval
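A hedged sketch of the new pipeline (assumes a pandas build with this change; the exact escaped identifier is an implementation detail): backtick-quoted spans are rewritten into valid Python identifiers before the standard tokenizer ever sees the source, so characters such as ``#`` no longer terminate tokenization:

    from pandas.core.computation.parsing import tokenize_string

    for toknum, tokval in tokenize_string("`col#1` > 3"):
        print(toknum, repr(tokval))
    # The backtick-quoted span is emitted as a single NAME token,
    # e.g. 'BACKTICK_QUOTED_STRING_col_HASH_1', followed by '>' and '3'.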
14 changes: 3 additions & 11 deletions pandas/core/frame.py
@@ -4556,17 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
quoted string are replaced by strings that are allowed as a Python identifier.
These characters include all operators in Python, the space character, the
question mark, the exclamation mark, the dollar sign, and the euro sign.
For other characters that fall outside the ASCII range (U+0001..U+007F)
and those that are not further specified in PEP 3131,
the query parser will raise an error.
This excludes whitespace other than the space character,
but also the hashtag (as it is used for comments) and the backtick
itself (the backtick also cannot be escaped).
In a special case, quotes that make a pair around a backtick can
confuse the parser.
For example, ```it's` > `that's``` will raise an error,
as it forms a quoted string (``'s > `that'``) with a backtick inside.
A backtick can be escaped by double backticks.
See also the `Python documentation about lexical analysis
<https://docs.python.org/3/reference/lexical_analysis.html>`__
@@ -4620,6 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
raise ValueError(msg)
kwargs["level"] = kwargs.pop("level", 0) + 1
kwargs["target"] = None

res = self.eval(expr, **kwargs)

try:
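A minimal usage sketch of the double-backtick escape now documented in the ``query`` docstring (the column name is hypothetical; assumes a pandas build including this change):

    import pandas as pd

    df = pd.DataFrame({"it`s": [1, 2, 3]})
    print(df.query("`it``s` > 1"))  # a literal backtick is written as ``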
