Skip to content

Commit

Permalink
Merge pull request #583 from flairNLP/add-canadian-news-sources
Browse files Browse the repository at this point in the history
Add cbc as first canadian publisher
  • Loading branch information
addie9800 authored Sep 3, 2024
2 parents 15865e0 + ebb05e3 commit 131ae12
Show file tree
Hide file tree
Showing 9 changed files with 230 additions and 19 deletions.
32 changes: 32 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,38 @@
</table>


## CA-Publishers

<table class="publishers ca">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>CBCNews</code>
</td>
<td>
<div>CBC News</div>
</td>
<td>
<a href="https://www.cbc.ca/">
<span>www.cbc.ca</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## CH-Publishers

<table class="publishers ch">
Expand Down
4 changes: 2 additions & 2 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
def serialize(self) -> Dict[str, Any]:
    """Return the instance attributes as a plain dictionary.

    Returns:
        A mapping of attribute name to value, leaving out every attribute
        whose name contains a double underscore.
    """
    serialized: Dict[str, Any] = {}
    for name, content in self.__dict__.items():
        if "__" in name:
            # names containing a double underscore are not part of the serialized view
            continue
        serialized[name] = content
    return serialized

def add_ld(self, ld: Dict[str, Any]) -> None:
if ld_type := ld.get("@type"):
def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None:
if ld_type := ld.get("@type", name):
if isinstance(ld_type, list):
if len(ld_type) == 1:
ld_type = ld_type[0]
Expand Down
38 changes: 21 additions & 17 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime
from functools import total_ordering
from typing import (
Any,
Callable,
ClassVar,
Dict,
Expand Down Expand Up @@ -155,6 +156,25 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
# Captures, as group "json", either a `{...}` object or a `[{...}]` array embedded in
# arbitrary text. The match is greedy, i.e. it extends to the last closing brace/bracket.
_json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")


def extract_json_from_dom(root: lxml.html.HtmlElement, selector: XPath) -> Iterable[Dict[str, Any]]:
    """Parse the JSON embedded in the nodes of <root> matched by <selector>.

    Args:
        root: The document to search.
        selector: Selector yielding the nodes whose text content holds JSON.

    Returns:
        The parsed JSON of all matched nodes, flattened with
        `more_itertools.collapse` while treating dicts as atomic elements.
        Nodes whose content cannot be decoded are skipped with a debug log.
    """
    parsed_documents = []
    for node in selector(root):
        # capture only content enclosed as follows: {...} or [{...}]
        found = re.search(_json_pattern, node.text_content())
        raw_json = found.group("json") if found is not None else None
        try:
            # an empty string is handed to the decoder when nothing was captured,
            # which ends up in the except branch below
            parsed_documents.append(json.loads(raw_json or ""))
        except json.JSONDecodeError as error:
            logger.debug(f"Encountered {error!r} during JSON parsing")
    return more_itertools.collapse(parsed_documents, base_type=dict)


def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping:
"""Parse JSON-LD from HTML.
Expand All @@ -168,23 +188,7 @@ def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping:
The JSON-LD data as a LinkedDataMapping
"""

def sanitize(text: str) -> Optional[str]:
# capture only content enclosed as follows: {...} or [{...}]
match = re.search(_json_pattern, text)
if match is not None and (sanitized := match.group("json")):
return sanitized
return None

ld_nodes = _ld_node_selector(root)
lds = []
for node in ld_nodes:
json_content = sanitize(node.text_content()) or ""
try:
lds.append(json.loads(json_content))
except json.JSONDecodeError as error:
logger.debug(f"Encountered {error!r} during LD parsing")
collapsed_lds = more_itertools.collapse(lds, base_type=dict)
return LinkedDataMapping(collapsed_lds)
return LinkedDataMapping(extract_json_from_dom(root, _ld_node_selector))


_meta_node_selector = CSSSelector("head > meta, body > meta")
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from fundus.publishers.at import AT
from fundus.publishers.au import AU
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.ca import CA
from fundus.publishers.ch import CH
from fundus.publishers.cn import CN
from fundus.publishers.de import DE
Expand Down Expand Up @@ -61,3 +62,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
tr = TR
my = MY
no = NO
ca = CA
18 changes: 18 additions & 0 deletions src/fundus/publishers/ca/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.ca.cbc_news import CBCNewsParser
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

# noinspection PyPep8Naming


class CA(metaclass=PublisherGroup):
    # Publisher group holding the Canadian news sources.

    CBCNews = Publisher(
        name="CBC News",
        domain="https://www.cbc.ca/",
        parser=CBCNewsParser,
        # CBC is crawled through its RSS feeds only
        sources=[
            RSSFeed("https://www.cbc.ca/webfeed/rss/rss-topstories"),
            RSSFeed("https://www.cbc.ca/webfeed/rss/rss-world"),
            RSSFeed("https://www.cbc.ca/webfeed/rss/rss-canada"),
        ],
    )
66 changes: 66 additions & 0 deletions src/fundus/publishers/ca/cbc_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.base_parser import function
from fundus.parser.utility import (
extract_article_body_with_selector,
extract_json_from_dom,
generic_author_parsing,
generic_date_parsing,
)


class CBCNewsParser(ParserProxy):
    # Parser for articles published on www.cbc.ca.

    class V1(BaseParser):
        _summary_selector = CSSSelector("h2.deck")
        _subheadline_selector = CSSSelector("div.story > h2")
        _paragraph_selector = CSSSelector("div.story > p")

        # script node carrying additional article metadata as JSON
        _cbc_ld_selector: XPath = XPath("//script[@id='initialStateDom']")

        @function(priority=1)
        def _parse_initial_state_dom(self) -> None:
            """Add the JSON of the `initialStateDom` script node to the linked data.

            Every extracted object is registered in the precomputed LD mapping, with
            "initialStateDom" passed as the name to fall back on when the object
            carries no `@type`.
            """
            state_dom_json = extract_json_from_dom(self.precomputed.doc, self._cbc_ld_selector)
            for ld in state_dom_json:
                self.precomputed.ld.add_ld(ld, "initialStateDom")

        @attribute
        def body(self) -> ArticleBody:
            """Extract summary, subheadlines and paragraphs using the class selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the `author` entry of the linked data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from the `datePublished` entry of the linked data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the `headline` entry of the linked data."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def topics(self) -> List[str]:
            """Collect the topics from the `keywords` entry of the linked data.

            Returns:
                The `name` of every entry under `tags`, followed by the last path
                segment of every entry under `concepts`. Empty if there are no
                keywords.
            """
            if not (topic_dict := self.precomputed.ld.bf_search("keywords")):
                return []

            # `get` yields None for a missing key; fall back to an empty list so that
            # articles providing only one of `tags` / `concepts` do not raise
            tags = topic_dict.get("tags") or []
            concepts = topic_dict.get("concepts") or []

            # add locations
            topic_list = [topic for location in tags if (topic := location.get("name")) is not None]

            # add subjects
            for subject in concepts:
                if (path := subject.get("path")) is not None:
                    # keep only the part after the last slash
                    topic_list.append(re.sub(r".*/", "", path))

            return topic_list
83 changes: 83 additions & 0 deletions tests/resources/parser/test_data/ca/CBCNews.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"V1": {
"authors": [
"Yasmine Hassan"
],
"body": {
"summary": [
"The appointment came days after Ismail Haniyeh was assassinated in Tehran"
],
"sections": [
{
"headline": [],
"paragraphs": [
"Hours after Yahya Sinwar was named the new leader of Hamas's political bureau on Tuesday, many in Gaza wondered how the appointment would affect the war and ceasefire negotiations with Israel.",
"The announcement, posted on Hamas's Telegram channel soon after former leader Ismail Haniyeh was killed in Iran, was seen as a defiant move from the group. Israel has characterized Sinwar as the \"mastermind\" behind the Oct. 7 attacks on southern Israel, which Israeli figures say killed 1,200 and took over 250 hostages into Gaza.",
"Sinwar, 61, has led Hamas in the Gaza Strip since 2017. But his background leans more in military rather than in politics, and his methods are seen as more extreme than his predecessor's.",
"That has created questions over how Sinwar will manage negotiations, and how Israel will negotiate with the man who they say orchestrated the attacks — and whom they've vowed to kill.",
"Many Palestinians interviewed in Gaza expressed similar concern over the promotion, although some welcomed the move. The news comes during a time of tense negotiation to end a war that has devastated the region and killed more than 39,000, according to Palestinian tallies, over the past 10 months."
]
},
{
"headline": [
"Palestinians react"
],
"paragraphs": [
"Jamil Al Saadouni, 58, told CBC freelance videographer Mohamed El Saife in Khan Younis that Sinwar's appointment was \"an internal decision.\"",
"He lamented the fact that Palestinian civilians, who are directly impacted by the war in Gaza, were not consulted on the best replacement for Haniyeh.",
"\"This has nothing to do with other factions or the Palestinian people.\"",
"Abu Hassan Amer, 44, agreed.",
"\"Choosing a military leadership during this period can harm the negotiations,\" he told El Saife. \"Because as they say, the non-political gun creates roadblocks.\"",
"Sinwar is seen as a \"hard-liner\" even within Hamas, said Matthew Levitt, senior fellow at the Washington Institute for Near East Policy, which was founded in 1985 with support from the American Israel Public Affairs Committee, a pro-Israel lobbying organization.",
"Sinwar served over 20 years in Israeli jails in connection with the killings of two Israeli soldiers and four fellow Palestinians, and was released early in 2011 as part of a prisoner swap. He has been known to hunt down people suspected of collaborating with Israel.",
"Levitt said that because of his time in jail, Sinwar \"understands Israelis.\"",
"\"He learned Hebrew, he spoke with his jailers, and that really showed on Oct. 7, when he understood the trauma that the kidnapping and killing of a large number of people would do for the Israelis,\" he said.",
"By comparison, Haniyeh, who ruled in exile from Qatar, often took a more moderate and pragmatic stance.",
"\"The killing of Haniyeh already brought negotiations back to the drawing board,\" Lina Khatib, an expert on the conflict at U.K. think-tank Chatham House, told the AP in an interview. \"The next chess move by Hamas makes negotiations even trickier.\"",
"Haniyeh was killed by an airstrike in Tehran, where he was attending the inauguration of Iran's new president. While Hamas and Iran have blamed Israel for the strike, Israel has not claimed responsibility for it."
]
},
{
"headline": [
"A military man in politics"
],
"paragraphs": [
"Some in Gaza welcomed the news of Sinwar's promotion, saying they needed someone to defend them.",
"\"Choosing him from the stance of Palestine is a good choice,\" Abu Anas Al Saud told El Saife. \"We need someone to defend the land that was stolen.\"",
"But Al Saud is aware of the effect Sinwar may have on ceasefire talks.",
"\"He's the most wanted man to Israel,\" he said. \"It will not advance negotiations at all.\"",
"Sinwar only made rare appearances before the war. He hasn't been seen in public since Oct. 7, and is thought to be hiding deep in tunnels beneath the Gaza Strip. Mediators say it takes several days to exchange messages with him, raising questions on how he will now manage Hamas as its international face.",
"Sinwar \"is someone who grew up within the brigade and the militant terrorist wing of Hamas,\" said Levitt.",
"However, while Sinwar's promotion might seem like a direct \"challenge to Israel,\" a deal was still possible, Sadeq Abu Amer told the AP. He noted that Sinwar \"might take a step that will surprise everyone.\" Abu Amer is the head of the Palestinian Dialogue Group in Turkey, which says on its site that it aims to \"protect the historical rights of the Palestinian people.\"",
"And while the assassination of Haniyeh makes a difference \"in the immediate,\" Levitt said, in the long term, both sides are still looking for a deal.",
"\"The same factors that were driving Hamas towards the deal and separately driving the Israeli prime minister to a deal are still there.\""
]
},
{
"headline": [
"'There is only one place for Yahya Sinwar'"
],
"paragraphs": [
"On Tuesday, Israel's chief military spokesperson, Rear Admiral Daniel Hagari, said Sinwar's appointment would not stop Israel from pursuing him.",
"\"There is only one place for Yahya Sinwar, and it is beside Mohammed Deif and the rest of the Oct. 7 terrorists,\" he told the Saudi state-owned Al-Arabiya television. \"That is the only place we're preparing and intending for him.\"",
"Amer, in Gaza, stressed the importance of diplomacy before military strength, particularly as negotiations continue between both sides.",
"\"There are rules to resistance, rules to war and rules to peace,\" said Amer. \"[And] we need peace in this current moment.\""
]
}
]
},
"publishing_date": "2024-08-08 08:00:00+00:00",
"title": "What's next for Gaza, after Yahya Sinwar's appointment as Hamas political head?",
"topics": [
"Israel-Hamas war",
"Iran",
"Israel",
"Tehran",
"Hamas",
"Yahya Sinwar",
"Assassinations",
"Kidnapping",
"War and unrest"
]
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/ca/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"CBCNews_2024_08_08.html.gz": {
"url": "https://www.cbc.ca/news/world/gaza-israel-ceasefire-negotiations-sinwar-1.7287711?cmp=rss",
"crawl_date": "2024-08-08 23:53:17.604667"
}
}

0 comments on commit 131ae12

Please sign in to comment.