Skip to content

Commit

Permalink
Merge branch 'add-canadian-news-sources' into add-national-post
Browse files Browse the repository at this point in the history
  • Loading branch information
addie9800 authored Aug 28, 2024
2 parents 8d56c2b + 1bfaacc commit 215ce4c
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 15 deletions.
41 changes: 27 additions & 14 deletions src/fundus/publishers/ca/cbc_news.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,21 @@
import re
from typing import List, Optional

import lxml
import more_itertools
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from lxml.html import document_fromstring

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.base_parser import Precomputed, logger
from fundus.parser.data import LinkedDataMapping
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
get_meta_content,
)


Expand All @@ -21,8 +26,22 @@ class V1(BaseParser):
_summary_selector = CSSSelector("h2.deck")
_subheadline_selector = CSSSelector("div.story > h2")
_paragraph_selector = CSSSelector("div.story > p")
_cbc_ld_selector: XPath = XPath("//script[@type='application/ld+json' or @id='initialStateDom']")

_author_ld_selector = XPath("//script[@id='initialStateDom']")
def _base_setup(self, html: str) -> None:
    """Populate ``self.precomputed`` from the raw article HTML.

    Besides the regular ``application/ld+json`` scripts, the script element
    with id ``initialStateDom`` is also decoded (both are matched by
    ``_cbc_ld_selector``). Its payload is a JavaScript assignment, so the
    ``window.__INITIAL_STATE__ = `` prefix and the trailing ``;`` are
    stripped before JSON decoding.

    Args:
        html: The HTML source of the article page.
    """
    doc = lxml.html.document_fromstring(html)
    lds = []
    for node in self._cbc_ld_selector(doc):
        raw = re.sub(r"(window\.__INITIAL_STATE__ = |;$)", "", node.text_content())
        # Keep the try body limited to the one call that can raise.
        try:
            json_object = json.loads(raw)
        except json.JSONDecodeError as error:
            logger.debug(f"Encountered {error!r} during LD parsing")
            continue
        if isinstance(json_object, dict):
            # Objects without an "@type" get a placeholder type.
            # NOTE(review): presumably LinkedDataMapping keys entries by "@type" -- confirm.
            if not json_object.get("@type"):
                json_object["@type"] = "FurtherDetails"
        elif not isinstance(json_object, list):
            # Scalars (string/number/bool/null) carry no linked data; calling
            # ``.get`` on them used to raise AttributeError.
            continue
        # A JSON array at the root is kept as-is; ``collapse`` flattens it below.
        lds.append(json_object)
    collapsed_lds = more_itertools.collapse(lds, base_type=dict)
    self.precomputed = Precomputed(html, doc, get_meta_content(doc), LinkedDataMapping(collapsed_lds))

@attribute
def body(self) -> ArticleBody:
Expand All @@ -35,21 +54,11 @@ def body(self) -> ArticleBody:

@attribute
def authors(self) -> List[str]:
    """Return the article's authors.

    Looks up ``authorList`` in the precomputed linked data and hands the
    result to ``generic_author_parsing``. The earlier implementation, which
    re-parsed the HTML and indexed the first ``initialStateDom`` script
    without checking that one exists, is dropped: it preceded this return
    and made it unreachable.
    """
    return generic_author_parsing(self.precomputed.ld.bf_search("authorList"))

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
    """Return the publishing date parsed from the linked data.

    Searches the precomputed linked data for ``datePublished`` and passes
    the value to ``generic_date_parsing``. The earlier lookup via
    ``bf_search("ReportageNewsArticle")[0]`` is dropped: it indexed the
    search result unguarded and shadowed this return.
    """
    return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

@attribute
def title(self) -> Optional[str]:
Expand All @@ -59,4 +68,8 @@ def title(self) -> Optional[str]:

@attribute
def topics(self) -> List[str]:
    """Return the article's topics from the ``keywords`` linked-data entry.

    Topics are the ``name`` of every entry under ``tags``, followed by the
    last path segment of every entry under ``concepts``. Missing or
    malformed parts are skipped, so the result only ever contains strings;
    an empty list is returned when no usable ``keywords`` mapping is found.
    """
    topic_dict = self.precomputed.ld.bf_search("keywords")
    if not isinstance(topic_dict, dict):
        # Nothing found, or "keywords" is not the expected mapping.
        return []
    # ``or []`` covers both an absent key and an explicit null value.
    topic_list = [name for tag in topic_dict.get("tags") or [] if (name := tag.get("name"))]
    for concept in topic_dict.get("concepts") or []:
        if path := concept.get("path"):
            # Keep only the text after the final "/" of the path.
            topic_list.append(re.sub(r".*/", "", path))
    return topic_list
10 changes: 9 additions & 1 deletion tests/resources/parser/test_data/ca/CBCNews.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,15 @@
"publishing_date": "2024-08-08 08:00:00+00:00",
"title": "Palestinians say his appointment could ruin ceasefire talks",
"topics": [
"World"
"Israel-Hamas war",
"Iran",
"Israel",
"Tehran",
"Hamas",
"Yahya Sinwar",
"Assassinations",
"Kidnapping",
"War and unrest"
]
}
}

0 comments on commit 215ce4c

Please sign in to comment.