Skip to content

Commit

Permalink
Support forcing a selector type into a subselector
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jun 14, 2024
1 parent 4e140f9 commit e7c5e97
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 48 deletions.
77 changes: 35 additions & 42 deletions parsel/selector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""XPath and JMESPath selectors based on the lxml and jmespath Python
packages."""

import builtins
import json
import typing
import warnings
Expand Down Expand Up @@ -141,7 +142,7 @@ def __getitem__(
# Pickle hook: unconditionally raises TypeError, so SelectorList objects cannot be pickled.
def __getstate__(self) -> None:
raise TypeError("can't pickle SelectorList objects")

def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]":
"""
Call the ``.jmespath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
Expand All @@ -153,12 +154,14 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self]))

def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
*,
type: Optional[str] = None,
**kwargs: Any,
) -> "SelectorList[_SelectorType]":
"""
Expand All @@ -178,17 +181,17 @@ def xpath(
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
return self.__class__(
flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self])
)

def css(self, query: str) -> "SelectorList[_SelectorType]":
def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]":
"""
Call the ``.css()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
``query`` is the same argument as the one in :meth:`Selector.css`
"""
return self.__class__(flatten([x.css(query) for x in self]))
return self.__class__(flatten([x.css(query, type=type) for x in self]))

def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
Expand Down Expand Up @@ -423,7 +426,6 @@ class Selector:
"_huge_tree",
"root",
"_text",
"_text_lazy_html_root",
"body",
"__weakref__",
]
Expand Down Expand Up @@ -508,7 +510,6 @@ def __init__(
self._expr = _expr
self._huge_tree = huge_tree
self._text = text
self._text_lazy_html_root: Optional[etree._Element] = None

# Pickle hook: unconditionally raises TypeError, so Selector objects cannot be pickled.
def __getstate__(self) -> Any:
raise TypeError("can't pickle Selector objects")
Expand All @@ -534,6 +535,7 @@ def _get_root(
def jmespath(
self: _SelectorType,
query: str,
type: Optional[str] = None,
**kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Expand Down Expand Up @@ -567,9 +569,9 @@ def jmespath(

def make_selector(x: Any) -> _SelectorType: # closure function
if isinstance(x, str):
return self.__class__(text=x, _expr=query, type="text")
return self.__class__(text=x, _expr=query, type=type or "text")
else:
return self.__class__(root=x, _expr=query)
return self.__class__(root=x, _expr=query, type=type)

result = [make_selector(x) for x in result]
return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
Expand All @@ -578,6 +580,7 @@ def xpath(
self: _SelectorType,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
type: Optional[str] = None,
**kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Expand Down Expand Up @@ -608,12 +611,7 @@ def xpath(
)
else:
try:
if self._text_lazy_html_root is None:
self._text_lazy_html_root = self._get_root(
self.root or "", type="html"
)
if self._text_lazy_html_root is not None:
xpathev = self._text_lazy_html_root.xpath
xpathev = self._get_root(self._text or "", type="html").xpath
except AttributeError:
return typing.cast(
SelectorList[_SelectorType], self.selectorlist_cls([])
Expand All @@ -632,21 +630,21 @@ def xpath(
except etree.XPathError as exc:
raise ValueError(f"XPath error: {exc} in {query}")

if type(result) is not list:
if builtins.type(result) is not list:
result = [result]

result = [
self.__class__(
root=x,
_expr=query,
namespaces=self.namespaces,
type=_xml_or_html(self.type),
type=type or _xml_or_html(self.type),
)
for x in result
]
return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))

def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]:
"""
Apply the given CSS selector and return a :class:`SelectorList` instance.
Expand All @@ -659,7 +657,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
"""
if self.type not in ("html", "xml", "text"):
raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
return self.xpath(self._css2xpath(query))
return self.xpath(self._css2xpath(query), type=type)

def _css2xpath(self, query: str) -> str:
type = _xml_or_html(self.type)
Expand Down Expand Up @@ -729,30 +727,25 @@ def get(self) -> Any:
For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.
"""
if self.type in ("json", "text"):
if self.type == "text" and self._text_lazy_html_root is not None:
return etree.tostring(
self._text_lazy_html_root, encoding="unicode", with_tail=False
)
if self.type in ("text", "json"):
return self.root
else:
try:
return typing.cast(
str,
etree.tostring(
self.root,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
),
)
except (AttributeError, TypeError):
if self.root is True:
return "1"
elif self.root is False:
return "0"
else:
return str(self.root)
try:
return typing.cast(
str,
etree.tostring(
self.root,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
),
)
except (AttributeError, TypeError):
if self.root is True:
return "1"
elif self.root is False:
return "0"
else:
return str(self.root)

extract = get

Expand Down
12 changes: 6 additions & 6 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,13 +1012,13 @@ def test_remove_selector_from_html_in_text(self) -> None:
"<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
)
expect_result = "<html><body><p>hello world</p></body></html>"
sel = self.sscls(text=html, type="text")
self.assertEqual(sel.type, "text")
sel = self.sscls(text=html, type="html")
self.assertEqual(sel.type, "html")
li_sel_list = sel.css("style")
li_sel_list.drop()
self.assertEqual(sel.get(), expect_result)
# The type of the parent selector should not change
self.assertEqual(sel.type, "text")
self.assertEqual(sel.type, "html")

def test_remove_selector_from_html_in_json(self) -> None:
json_str = """{
Expand All @@ -1028,13 +1028,13 @@ def test_remove_selector_from_html_in_json(self) -> None:
"""
expect_result = "<html><body><p>hello world</p></body></html>"
sel = self.sscls(text=json_str)
html_sel = sel.jmespath("body")[0]
self.assertEqual(html_sel.type, "text")
html_sel = sel.jmespath("body", type="html")[0]
self.assertEqual(html_sel.type, "html")
li_sel_list = html_sel.css("style")
li_sel_list.drop()
self.assertEqual(html_sel.get(), expect_result)
# The type of the parent selector should not change
self.assertEqual(html_sel.type, "text")
self.assertEqual(html_sel.type, "html")

def test_remove_pseudo_element_selector_list(self) -> None:
sel = self.sscls(
Expand Down

0 comments on commit e7c5e97

Please sign in to comment.