Merge branch 'add-canadian-news-sources' into add-national-post

flairNLP · Aug 28, 2024 · 215ce4c
2 parents 8d56c2b + 1bfaacc
commit 215ce4c
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 15 deletions.
diff --git a/src/fundus/publishers/ca/cbc_news.py b/src/fundus/publishers/ca/cbc_news.py
@@ -3,16 +3,21 @@
 import re
 from typing import List, Optional
 
+import lxml
+import more_itertools
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath
 from lxml.html import document_fromstring
 
 from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
+from fundus.parser.base_parser import Precomputed, logger
+from fundus.parser.data import LinkedDataMapping
 from fundus.parser.utility import (
  extract_article_body_with_selector,
  generic_author_parsing,
  generic_date_parsing,
  generic_topic_parsing,
+ get_meta_content,
 )
 
 
@@ -21,8 +26,22 @@ class V1(BaseParser):
  _summary_selector = CSSSelector("h2.deck")
  _subheadline_selector = CSSSelector("div.story > h2")
  _paragraph_selector = CSSSelector("div.story > p")
+ _cbc_ld_selector: XPath = XPath("//script[@type='application/ld+json' or @id='initialStateDom']")
 
- _author_ld_selector = XPath("//script[@id='initialStateDom']")
+    def _base_setup(self, html: str) -> None:
+        """Parse ``html`` and populate ``self.precomputed``.
+
+        Every ``<script>`` matched by ``_cbc_ld_selector`` (JSON-LD blocks and
+        the ``initialStateDom`` script) is decoded as JSON. Decoded objects
+        that carry no ``@type`` are tagged ``"FurtherDetails"``. All results
+        are flattened and wrapped in a ``LinkedDataMapping``.
+
+        Args:
+            html: Raw HTML of the article page.
+        """
+        doc = lxml.html.document_fromstring(html)
+        lds = []
+        for node in self._cbc_ld_selector(doc):
+            # The initial-state script is a JS assignment rather than plain
+            # JSON: strip the `window.__INITIAL_STATE__ = ` prefix and the
+            # trailing `;` before decoding.
+            raw_json = re.sub(r"(window\.__INITIAL_STATE__ = |;$)", "", node.text_content())
+            try:
+                json_object = json.loads(raw_json)
+            except json.JSONDecodeError as error:
+                logger.debug(f"Encountered {error!r} during LD parsing")
+                continue
+            # Only JSON objects can be tagged. A top-level JSON array has no
+            # `.get` (it would raise AttributeError); it is appended as-is and
+            # flattened by `collapse` below.
+            if isinstance(json_object, dict) and not json_object.get("@type"):
+                json_object["@type"] = "FurtherDetails"
+            lds.append(json_object)
+        collapsed_lds = more_itertools.collapse(lds, base_type=dict)
+        self.precomputed = Precomputed(html, doc, get_meta_content(doc), LinkedDataMapping(collapsed_lds))
 
  @attribute
  def body(self) -> ArticleBody:
@@ -35,21 +54,11 @@ def body(self) -> ArticleBody:
 
 @attribute
 def authors(self) -> List[str]:
- doc = document_fromstring(self.precomputed.html)
- ld_nodes = self._author_ld_selector(doc)
- try:
- author_ld = json.loads(re.sub(r"(window\.__INITIAL_STATE__ = |;$)", "", ld_nodes[0].text_content()))
- except json.JSONDecodeError:
- return []
- if not (details := author_ld.get("detail")):
- return []
- if not (content := details.get("content")):
- return []
- return generic_author_parsing(content.get("authorList"))
+ # The `initialStateDom` script is now decoded once in `_base_setup` and
+ # stored with the other linked data, so the per-call parsing removed above
+ # is replaced by a single "authorList" lookup on `self.precomputed.ld`.
+ # NOTE(review): `bf_search` is presumably a breadth-first key search that
+ # yields a value `generic_author_parsing` accepts when nothing is found --
+ # confirm against LinkedDataMapping.
+ return generic_author_parsing(self.precomputed.ld.bf_search("authorList"))
 
 @attribute
 def publishing_date(self) -> Optional[datetime.datetime]:
- return generic_date_parsing(self.precomputed.ld.bf_search("ReportageNewsArticle")[0].get("datePublished"))
+ # Look up "datePublished" directly in the linked data instead of indexing
+ # into the "ReportageNewsArticle" entry as the removed line did; the value
+ # found is handed to `generic_date_parsing`.
+ return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
 
  @attribute
  def title(self) -> Optional[str]:
@@ -59,4 +68,8 @@ def title(self) -> Optional[str]:
 
  @attribute
  def topics(self) -> List[str]:
- return generic_topic_parsing(self.precomputed.ld.bf_search("ReportageNewsArticle")[0].get("articleSection"))
+ topic_dict = self.precomputed.ld.bf_search("keywords")
+ topic_list = [v.get("name") for v in topic_dict.get("tags")]
+ for v in topic_dict.get("concepts"):
+ topic_list.append(re.sub(r".*/", "", v.get("path")))
+ return topic_list
diff --git a/tests/resources/parser/test_data/ca/CBCNews.json b/tests/resources/parser/test_data/ca/CBCNews.json
@@ -69,7 +69,15 @@
  "publishing_date": "2024-08-08 08:00:00+00:00",
  "title": "Palestinians say his appointment could ruin ceasefire talks",
  "topics": [
- "World"
+ "Israel-Hamas war",
+ "Iran",
+ "Israel",
+ "Tehran",
+ "Hamas",
+ "Yahya Sinwar",
+ "Assassinations",
+ "Kidnapping",
+ "War and unrest"
  ]
  }
 }