diff --git a/src/fundus/publishers/ca/cbc_news.py b/src/fundus/publishers/ca/cbc_news.py index cb15f489..2ae47ab0 100644 --- a/src/fundus/publishers/ca/cbc_news.py +++ b/src/fundus/publishers/ca/cbc_news.py @@ -3,16 +3,21 @@ import re from typing import List, Optional +import lxml +import more_itertools from lxml.cssselect import CSSSelector from lxml.etree import XPath from lxml.html import document_fromstring from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.base_parser import Precomputed, logger +from fundus.parser.data import LinkedDataMapping from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + get_meta_content, ) @@ -21,8 +26,22 @@ class V1(BaseParser): _summary_selector = CSSSelector("h2.deck") _subheadline_selector = CSSSelector("div.story > h2") _paragraph_selector = CSSSelector("div.story > p") + _cbc_ld_selector: XPath = XPath("//script[@type='application/ld+json' or @id='initialStateDom']") - _author_ld_selector = XPath("//script[@id='initialStateDom']") + def _base_setup(self, html: str) -> None: + doc = lxml.html.document_fromstring(html) + ld_nodes = self._cbc_ld_selector(doc) + lds = [] + for node in ld_nodes: + try: + json_object = json.loads(re.sub(r"(window\.__INITIAL_STATE__ = |;$)", "", node.text_content())) + if not json_object.get("@type"): + json_object["@type"] = "FurtherDetails" + lds.append(json_object) + except json.JSONDecodeError as error: + logger.debug(f"Encountered {error!r} during LD parsing") + collapsed_lds = more_itertools.collapse(lds, base_type=dict) + self.precomputed = Precomputed(html, doc, get_meta_content(doc), LinkedDataMapping(collapsed_lds)) @attribute def body(self) -> ArticleBody: @@ -35,21 +54,11 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - doc = document_fromstring(self.precomputed.html) - ld_nodes = self._author_ld_selector(doc) - try: - author_ld = json.loads(re.sub(r"(window\.__INITIAL_STATE__ = |;$)", "", ld_nodes[0].text_content())) - except json.JSONDecodeError: - return [] - if not (details := author_ld.get("detail")): - return [] - if not (content := details.get("content")): - return [] - return generic_author_parsing(content.get("authorList")) + return generic_author_parsing(self.precomputed.ld.bf_search("authorList")) @attribute def publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.ld.bf_search("ReportageNewsArticle")[0].get("datePublished")) + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) @attribute def title(self) -> Optional[str]: @@ -59,4 +68,8 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.ld.bf_search("ReportageNewsArticle")[0].get("articleSection")) + topic_dict = self.precomputed.ld.bf_search("keywords") + topic_list = [v.get("name") for v in topic_dict.get("tags")] + for v in topic_dict.get("concepts"): + topic_list.append(re.sub(r".*/", "", v.get("path"))) + return topic_list diff --git a/tests/resources/parser/test_data/ca/CBCNews.json b/tests/resources/parser/test_data/ca/CBCNews.json index b6632f46..b8a86b95 100644 --- a/tests/resources/parser/test_data/ca/CBCNews.json +++ b/tests/resources/parser/test_data/ca/CBCNews.json @@ -69,7 +69,15 @@ "publishing_date": "2024-08-08 08:00:00+00:00", "title": "Palestinians say his appointment could ruin ceasefire talks", "topics": [ - "World" + "Israel-Hamas war", + "Iran", + "Israel", + "Tehran", + "Hamas", + "Yahya Sinwar", + "Assassinations", + "Kidnapping", + "War and unrest" ] } }