Skip to content

Commit

Permalink
Merge pull request #626 from flairNLP/deprecate-value-by-key-path
Browse files Browse the repository at this point in the history
Deprecate `get_value_by_key_path` and replace with `xpath_search`
  • Loading branch information
MaxDall authored Oct 13, 2024
2 parents 51404cb + da7fe93 commit a14fbb3
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 35 deletions.
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,9 @@ target-version = ['py38']

[tool.isort]
profile = "black"

[tool.pytest.ini_options]
filterwarnings = [
"error"
]

35 changes: 30 additions & 5 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Iterable,
Iterator,
List,
Literal,
Optional,
Sequence,
Tuple,
Expand All @@ -22,7 +23,7 @@
import xmltodict
from dict2xml import dict2xml
from lxml.etree import XPath, tostring
from typing_extensions import Self, TypeAlias
from typing_extensions import Self, TypeAlias, deprecated

from fundus.utils.serialization import replace_keys_in_nested_dict

Expand Down Expand Up @@ -81,6 +82,7 @@ def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None:
self.__dict__[self.__UNKNOWN_TYPE__] = []
self.__dict__[self.__UNKNOWN_TYPE__].append(ld)

@deprecated("Use xpath_search() instead")
def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Optional[Any]:
"""
Works like get() except this one assumes a path is given as list of keys (str).
Expand Down Expand Up @@ -113,7 +115,15 @@ def to_unicode_characters(text: str) -> str:
self.__xml = lxml.etree.fromstring(xml)
return self.__xml

def xpath_search(self, query: XPath) -> List[Any]:
@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]:
...

@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]:
...

def xpath_search(self, query: Union[XPath, str], scalar: bool = False):
"""Search through LD using XPath expressions
Internally, the content of the LinkedDataMapping is converted to XML and then
Expand Down Expand Up @@ -142,12 +152,17 @@ def xpath_search(self, query: XPath) -> List[Any]:
>> [value1]
Args:
query: A XPath expression
query: An XPath expression, either as a string or as an XPath object.
scalar: If True, return an optional "scalar" value and raise a ValueError if there is more
than one result to return; if False, return a list of results. Defaults to False.
Returns:
An ordered list of search results
An ordered list of search results or an optional "scalar" result
"""

if isinstance(query, str):
query = XPath(query)

pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values())))

def node2string(n: lxml.etree._Element) -> str:
Expand All @@ -174,7 +189,17 @@ def to_original_characters(text: str) -> str:
xml = f"<result{i}>" + node2string(node) + f"</result{i}>"
results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters))

return list(results.values())
values = list(results.values())

if scalar:
if not values:
return None
elif len(values) == 1:
return values.pop()
else:
raise ValueError(f"Got multiple values when expecting a single scalar value")
else:
return values

def bf_search(self, key: str, depth: Optional[int] = None, default: Optional[_T] = None) -> Union[Any, _T]:
"""
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/de/freiepresse.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def publishing_date(self) -> Optional[datetime.datetime]:

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

@attribute
def title(self) -> Optional[str]:
Expand Down
3 changes: 1 addition & 2 deletions src/fundus/publishers/de/krautreporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ def authors(self) -> List[str]:

@attribute
def publishing_date(self) -> Optional[datetime]:
key_path = ["NewsArticle", "datePublished"]
date_string = self.precomputed.ld.get_value_by_key_path(key_path)
date_string = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
return utility.generic_date_parsing(date_string)

@attribute
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/no/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class NO(metaclass=PublisherGroup):
sources=[
Sitemap(
"https://www.vg.no/sitemap.xml",
sitemap_filter=inverse(regex_filter("vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")),
sitemap_filter=inverse(regex_filter(r"vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")),
reverse=True,
),
NewsMap("https://www.vg.no/sitemap/files/articles-48hrs.xml"),
Expand Down
3 changes: 1 addition & 2 deletions src/fundus/publishers/shared/euronews.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ def body(self) -> ArticleBody:

@attribute
def authors(self) -> List[str]:
key_path = ["NewsArticle", "author", "name"]
author_string = self.precomputed.ld.get_value_by_key_path(key_path)
author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name")
return utility.generic_author_parsing(author_string)

@attribute
Expand Down
6 changes: 3 additions & 3 deletions src/fundus/publishers/us/ap_news.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,17 @@ def authors(self) -> List[str]:
author_string = re.sub(r"^By ", "", author_string)
except IndexError:
# Fallback to the generic author parsing from the linked data.
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

return generic_author_parsing(author_string)

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

@attribute
def title(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

@attribute
def topics(self) -> List[str]:
Expand Down
7 changes: 3 additions & 4 deletions src/fundus/publishers/us/cnbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,15 @@ def body(self) -> ArticleBody:

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

@attribute
def title(self) -> Optional[str]:
title: Optional[str] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
return title
return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

@attribute
def topics(self) -> List[str]:
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/us/occupy_democrats.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def title(self) -> Optional[str]:

@attribute
def topics(self) -> List[str]:
return generic_topic_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "keywords"]))
return generic_topic_parsing(self.precomputed.ld.xpath_search("Article/keywords", scalar=True))

@attribute(validate=False)
def description(self) -> Optional[str]:
Expand Down
4 changes: 2 additions & 2 deletions src/fundus/publishers/us/reuters.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def authors(self) -> List[str]:

@attribute
def publishing_date(self) -> Optional[datetime]:
return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

@attribute
def title(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

@attribute
def topics(self) -> List[str]:
Expand Down
2 changes: 1 addition & 1 deletion src/fundus/publishers/us/the_gateway_pundit.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def body(self) -> ArticleBody:

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("Article/author"))

@attribute
def publishing_date(self) -> Optional[datetime]:
Expand Down
11 changes: 4 additions & 7 deletions src/fundus/publishers/us/the_intercept.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,22 @@ def body(self) -> ArticleBody:

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

@attribute
def publishing_date(self) -> Optional[datetime]:
return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

@attribute
def title(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

@attribute
def topics(self) -> List[str]:
# The Intercept specifies the article's topics, including other metadata,
# inside the "keywords" linked data indicated by a "Subject: " prefix.
# Example keywords: ["Day: Saturday", ..., "Subject: World", ...]
keywords: Optional[List[str]] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "keywords"])
if keywords is None:
return []

keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords")
return [keyword[9:] for keyword in keywords if keyword.startswith("Subject: ")]

class V1_1(V1):
Expand Down
12 changes: 6 additions & 6 deletions src/fundus/publishers/us/the_new_yorker.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,23 @@ def description(self) -> Optional[str]:

@attribute(validate=False)
def alternative_description(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "description"])
return self.precomputed.ld.xpath_search("NewsArticle/description", scalar=True)

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

@attribute
def publishing_date(self) -> Optional[datetime]:
return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

@attribute
def title(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

@attribute(validate=False)
def alternative_title(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "alternativeHeadline"])
return self.precomputed.ld.xpath_search("NewsArticle/alternativeHeadline", scalar=True)

@attribute
def topics(self) -> List[str]:
Expand All @@ -61,4 +61,4 @@ def topics(self) -> List[str]:

@attribute(validate=False)
def section(self) -> Optional[str]:
return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "articleSection"])
return self.precomputed.ld.xpath_search("NewsArticle/articleSection", scalar=True)

0 comments on commit a14fbb3

Please sign in to comment.