Support forcing a selector type into a subselector

Gallaecio · Jun 14, 2024 · e7c5e97 · e7c5e97
1 parent 4e140f9
commit e7c5e97
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 48 deletions.
diff --git a/parsel/selector.py b/parsel/selector.py
@@ -1,6 +1,7 @@
 """XPath and JMESPath selectors based on the lxml and jmespath Python
 packages."""
 
+import builtins
 import json
 import typing
 import warnings
@@ -141,7 +142,7 @@ def __getitem__(
     def __getstate__(self) -> None:
         raise TypeError("can't pickle SelectorList objects")
 
-    def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
+    def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]":
         """
         Call the ``.jmespath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -153,12 +154,14 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
 
             selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
         """
-        return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
+        return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self]))
 
     def xpath(
         self,
         xpath: str,
         namespaces: Optional[Mapping[str, str]] = None,
+        *,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> "SelectorList[_SelectorType]":
         """
@@ -178,17 +181,17 @@ def xpath(
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
         return self.__class__(
-            flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
+            flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self])
         )
 
-    def css(self, query: str) -> "SelectorList[_SelectorType]":
+    def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]":
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
 
         ``query`` is the same argument as the one in :meth:`Selector.css`
         """
-        return self.__class__(flatten([x.css(query) for x in self]))
+        return self.__class__(flatten([x.css(query, type=type) for x in self]))
 
     def re(
         self, regex: Union[str, Pattern[str]], replace_entities: bool = True
@@ -423,7 +426,6 @@ class Selector:
         "_huge_tree",
         "root",
         "_text",
-        "_text_lazy_html_root",
         "body",
         "__weakref__",
     ]
@@ -508,7 +510,6 @@ def __init__(
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text
-        self._text_lazy_html_root: Optional[etree._Element] = None
 
     def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")
@@ -534,6 +535,7 @@ def _get_root(
     def jmespath(
         self: _SelectorType,
         query: str,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> SelectorList[_SelectorType]:
         """
@@ -567,9 +569,9 @@ def jmespath(
 
         def make_selector(x: Any) -> _SelectorType:  # closure function
             if isinstance(x, str):
-                return self.__class__(text=x, _expr=query, type="text")
+                return self.__class__(text=x, _expr=query, type=type or "text")
             else:
-                return self.__class__(root=x, _expr=query)
+                return self.__class__(root=x, _expr=query, type=type)
 
         result = [make_selector(x) for x in result]
         return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
@@ -578,6 +580,7 @@ def xpath(
         self: _SelectorType,
         query: str,
         namespaces: Optional[Mapping[str, str]] = None,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> SelectorList[_SelectorType]:
         """
@@ -608,12 +611,7 @@ def xpath(
                 )
         else:
             try:
-                if self._text_lazy_html_root is None:
-                    self._text_lazy_html_root = self._get_root(
-                        self.root or "", type="html"
-                    )
-                if self._text_lazy_html_root is not None:
-                    xpathev = self._text_lazy_html_root.xpath
+                xpathev = self._get_root(self._text or "", type="html").xpath
             except AttributeError:
                 return typing.cast(
                     SelectorList[_SelectorType], self.selectorlist_cls([])
@@ -632,21 +630,21 @@ def xpath(
         except etree.XPathError as exc:
             raise ValueError(f"XPath error: {exc} in {query}")
 
-        if type(result) is not list:
+        if builtins.type(result) is not list:
             result = [result]
 
         result = [
             self.__class__(
                 root=x,
                 _expr=query,
                 namespaces=self.namespaces,
-                type=_xml_or_html(self.type),
+                type=type or _xml_or_html(self.type),
             )
             for x in result
         ]
         return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
 
-    def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
+    def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]:
         """
         Apply the given CSS selector and return a :class:`SelectorList` instance.
 
@@ -659,7 +657,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
         """
         if self.type not in ("html", "xml", "text"):
             raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
-        return self.xpath(self._css2xpath(query))
+        return self.xpath(self._css2xpath(query), type=type)
 
     def _css2xpath(self, query: str) -> str:
         type = _xml_or_html(self.type)
@@ -729,30 +727,25 @@ def get(self) -> Any:
         For HTML and XML, the result is always a string, and percent-encoded
         content is unquoted.
         """
-        if self.type in ("json", "text"):
-            if self.type == "text" and self._text_lazy_html_root is not None:
-                return etree.tostring(
-                    self._text_lazy_html_root, encoding="unicode", with_tail=False
-                )
+        if self.type in ("text", "json"):
             return self.root
-        else:
-            try:
-                return typing.cast(
-                    str,
-                    etree.tostring(
-                        self.root,
-                        method=_ctgroup[self.type]["_tostring_method"],
-                        encoding="unicode",
-                        with_tail=False,
-                    ),
-                )
-            except (AttributeError, TypeError):
-                if self.root is True:
-                    return "1"
-                elif self.root is False:
-                    return "0"
-                else:
-                    return str(self.root)
+        try:
+            return typing.cast(
+                str,
+                etree.tostring(
+                    self.root,
+                    method=_ctgroup[self.type]["_tostring_method"],
+                    encoding="unicode",
+                    with_tail=False,
+                ),
+            )
+        except (AttributeError, TypeError):
+            if self.root is True:
+                return "1"
+            elif self.root is False:
+                return "0"
+            else:
+                return str(self.root)
 
     extract = get
 

diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -1012,13 +1012,13 @@ def test_remove_selector_from_html_in_text(self) -> None:
             "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
         )
         expect_result = "<html><body><p>hello world</p></body></html>"
-        sel = self.sscls(text=html, type="text")
-        self.assertEqual(sel.type, "text")
+        sel = self.sscls(text=html, type="html")
+        self.assertEqual(sel.type, "html")
         li_sel_list = sel.css("style")
         li_sel_list.drop()
         self.assertEqual(sel.get(), expect_result)
         # The type of the parent selector should not change
-        self.assertEqual(sel.type, "text")
+        self.assertEqual(sel.type, "html")
 
     def test_remove_selector_from_html_in_json(self) -> None:
         json_str = """{
@@ -1028,13 +1028,13 @@ def test_remove_selector_from_html_in_json(self) -> None:
         """
         expect_result = "<html><body><p>hello world</p></body></html>"
         sel = self.sscls(text=json_str)
-        html_sel = sel.jmespath("body")[0]
-        self.assertEqual(html_sel.type, "text")
+        html_sel = sel.jmespath("body", type="html")[0]
+        self.assertEqual(html_sel.type, "html")
         li_sel_list = html_sel.css("style")
         li_sel_list.drop()
         self.assertEqual(html_sel.get(), expect_result)
         # The type of the parent selector should not change
-        self.assertEqual(html_sel.type, "text")
+        self.assertEqual(html_sel.type, "html")
 
     def test_remove_pseudo_element_selector_list(self) -> None:
         sel = self.sscls(