
Merge pull request #588 from dkm1006/krautreporter
Added Krautreporter
MaxDall authored Sep 3, 2024
2 parents 1571d6d + e4d79e6 commit 15865e0
Showing 8 changed files with 329 additions and 15 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
@@ -533,6 +533,21 @@
         </td>
         <td>&#160;</td>
     </tr>
+    <tr>
+        <td>
+            <code>Krautreporter</code>
+        </td>
+        <td>
+            <div>Krautreporter</div>
+        </td>
+        <td>
+            <a href="https://krautreporter.de/">
+                <span>krautreporter.de</span>
+            </a>
+        </td>
+        <td>&#160;</td>
+        <td>&#160;</td>
+    </tr>
     <tr>
         <td>
             <code>MitteldeutscheZeitung</code>
16 changes: 2 additions & 14 deletions src/fundus/parser/base_parser.py
@@ -1,7 +1,6 @@
 import functools
 import inspect
 import itertools
-import json
 import re
 from abc import ABC
 from copy import copy
@@ -23,12 +22,10 @@
 )

 import lxml.html
-import more_itertools
 from lxml.etree import XPath

 from fundus.logging import create_logger
-from fundus.parser.data import LinkedDataMapping
-from fundus.parser.utility import get_meta_content
+from fundus.parser.utility import get_ld_content, get_meta_content

 RegisteredFunctionT_co = TypeVar("RegisteredFunctionT_co", covariant=True, bound="RegisteredFunction")

@@ -159,7 +156,6 @@ class Precomputed:
 class BaseParser(ABC):
     VALID_UNTIL: date = date.today()
     precomputed: Precomputed
-    _ld_selector: XPath = XPath("//script[@type='application/ld+json']")

     def __init__(self):
         predicate: Callable[[object], bool] = lambda x: isinstance(x, RegisteredFunction)
@@ -192,15 +188,7 @@ def cache(self) -> Optional[Dict[str, Any]]:

     def _base_setup(self, html: str) -> None:
         doc = lxml.html.document_fromstring(html)
-        ld_nodes = self._ld_selector(doc)
-        lds = []
-        for node in ld_nodes:
-            try:
-                lds.append(json.loads(node.text_content()))
-            except json.JSONDecodeError as error:
-                logger.debug(f"Encountered {error!r} during LD parsing")
-        collapsed_lds = more_itertools.collapse(lds, base_type=dict)
-        self.precomputed = Precomputed(html, doc, get_meta_content(doc), LinkedDataMapping(collapsed_lds))
+        self.precomputed = Precomputed(html, doc, get_meta_content(doc), get_ld_content(doc))

     def parse(self, html: str, error_handling: Literal["suppress", "catch", "raise"] = "raise") -> Dict[str, Any]:
         # wipe existing precomputed
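To see the refactor end to end, here is a minimal sketch; the `MinimalParser` subclass and the sample HTML are invented for illustration, and only `_base_setup`, `precomputed`, and `get_value_by_key_path` come from the code in this commit:

```python
from fundus.parser.base_parser import BaseParser


class MinimalParser(BaseParser):  # hypothetical subclass, just for this demo
    pass


parser = MinimalParser()
parser._base_setup(
    "<html><head><script type='application/ld+json'>"
    '{"@type": "NewsArticle", "datePublished": "2024-09-03"}'
    "</script></head><body></body></html>"
)
# precomputed.ld is now the LinkedDataMapping built by utility.get_ld_content
print(parser.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
# 2024-09-03
```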
47 changes: 46 additions & 1 deletion src/fundus/parser/utility.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 import re
 from collections import defaultdict
 from copy import copy
@@ -24,7 +25,15 @@
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath

-from fundus.parser.data import ArticleBody, ArticleSection, TextSequence
+from fundus.logging import create_logger
+from fundus.parser.data import (
+    ArticleBody,
+    ArticleSection,
+    LinkedDataMapping,
+    TextSequence,
+)
+
+logger = create_logger(__name__)


 def normalize_whitespace(text: str) -> str:
@@ -142,6 +151,42 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     return ArticleBody(summary=summary, sections=sections)


+_ld_node_selector = XPath("//script[@type='application/ld+json']")
+_json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
+
+
+def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping:
+    """Parse JSON-LD from HTML.
+
+    This function parses script tags of type ld+json.
+    In case the JSON is wrapped in a CDATA tag, it is stripped first.
+
+    Args:
+        root: The HTML document given as an lxml.html.HtmlElement.
+
+    Returns:
+        The JSON-LD data as a LinkedDataMapping.
+    """
+
+    def sanitize(text: str) -> Optional[str]:
+        # capture only content enclosed as follows: {...} or [{...}]
+        match = re.search(_json_pattern, text)
+        if match is not None and (sanitized := match.group("json")):
+            return sanitized
+        return None
+
+    ld_nodes = _ld_node_selector(root)
+    lds = []
+    for node in ld_nodes:
+        json_content = sanitize(node.text_content()) or ""
+        try:
+            lds.append(json.loads(json_content))
+        except json.JSONDecodeError as error:
+            logger.debug(f"Encountered {error!r} during LD parsing")
+    collapsed_lds = more_itertools.collapse(lds, base_type=dict)
+    return LinkedDataMapping(collapsed_lds)
+
+
 _meta_node_selector = CSSSelector("head > meta, body > meta")

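To illustrate the sanitizer: the regex captures only the outermost `{...}` object (or `[{...}]` array), so comment or CDATA wrappers around the payload are discarded before `json.loads` runs. A short sketch with an invented document; `get_ld_content` and `get_value_by_key_path` are taken from this commit:

```python
import lxml.html

from fundus.parser.utility import get_ld_content

# Invented example: the JSON-LD payload is wrapped in a CDATA comment.
html = (
    "<html><head>"
    '<script type="application/ld+json">'
    '/*<![CDATA[*/ {"@type": "NewsArticle", "headline": "Beispiel"} /*]]>*/'
    "</script>"
    "</head><body></body></html>"
)

ld = get_ld_content(lxml.html.document_fromstring(html))
print(ld.get_value_by_key_path(["NewsArticle", "headline"]))  # Beispiel
```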
16 changes: 16 additions & 0 deletions src/fundus/publishers/de/__init__.py
@@ -27,6 +27,7 @@
 from .hessenschau import HessenschauParser
 from .junge_welt import JungeWeltParser
 from .kicker import KickerParser
+from .krautreporter import KrautreporterParser
 from .mdr import MDRParser
 from .merkur import MerkurParser
 from .morgenpost_berlin import BerlinerMorgenpostParser
@@ -463,6 +464,21 @@ class DE(metaclass=PublisherGroup):
         url_filter=regex_filter("/slideshow|/video"),
     )

+    Krautreporter = Publisher(
+        name="Krautreporter",
+        domain="https://krautreporter.de/",
+        parser=KrautreporterParser,
+        sources=[
+            # NOTE: robots.txt mentions that it reserves the right of use for text & data mining (§ 44b UrhG),
+            # but this reservation is not machine-readable, which is required by law for it to be effective.
+            # NOTE: Unfortunately, sitemap.xml and news.xml are identical.
+            Sitemap("https://krautreporter.de/sitemap.xml", reverse=True),
+            # NewsMap("https://krautreporter.de/news.xml"),
+            RSSFeed("https://krautreporter.de/feeds.rss"),
+        ],
+        url_filter=regex_filter(r"/(pages|archiv|serien|thema|zusammenhaenge)/"),
+    )
+
     FrankfurterRundschau = Publisher(
         name="Frankfurter Rundschau",
         domain="https://www.fr.de",
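With the publisher registered, crawling should work through Fundus's usual entry point; a usage sketch assuming the top-level API from the project README (`Crawler`, `PublisherCollection`):

```python
from fundus import Crawler, PublisherCollection

# Crawl a single article from the newly added publisher.
crawler = Crawler(PublisherCollection.de.Krautreporter)
for article in crawler.crawl(max_articles=1):
    print(article.title)
    print(article.publishing_date)
```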
52 changes: 52 additions & 0 deletions src/fundus/publishers/de/krautreporter.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility
+
+
+class KrautreporterParser(ParserProxy):
+    class V1(BaseParser):
+        _bloat_pattern: str = (
+            "^Redaktion:|^Dieser Artikel ist eine Übersetzung|^Übersetzung:|^Recherche:|^Schlussredaktion:"
+        )
+
+        _summary_selector = CSSSelector("p[data-test='article-teaser']")
+        _subheadline_selector = CSSSelector("div.article-markdown > h2")
+        _paragraph_selector = XPath(
+            f"//div[contains(@class, 'article-markdown')] /p[not(re:test(string(), '{_bloat_pattern}'))]",
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
+
+        _topic_selector = XPath("string(//div[contains(@class, 'article-headers') and contains(@class, 'topic')])")
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def body(self) -> ArticleBody:
+            article_body = utility.extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+            return article_body
+
+        @attribute
+        def authors(self) -> List[str]:
+            author_string = self.precomputed.meta.get("author")
+            return utility.generic_author_parsing(author_string)
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime]:
+            key_path = ["NewsArticle", "datePublished"]
+            date_string = self.precomputed.ld.get_value_by_key_path(key_path)
+            return utility.generic_date_parsing(date_string)
+
+        @attribute
+        def topics(self) -> List[str]:
+            return utility.generic_topic_parsing(self._topic_selector(self.precomputed.doc))
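One detail worth noting in `_paragraph_selector`: `re:test()` is not plain XPath 1.0 but the EXSLT regular-expressions extension, which lxml supports once the `re` namespace is declared. A self-contained sketch with invented markup showing how the bloat pattern drops credit lines:

```python
from lxml import etree, html

# Invented article snippet: one real paragraph, one credit line.
doc = html.fromstring(
    "<div class='article-markdown'>"
    "<p>Ein echter Absatz.</p>"
    "<p>Redaktion: Jane Doe</p>"
    "</div>"
)

bloat_pattern = "^Redaktion:|^Schlussredaktion:"
selector = etree.XPath(
    f"//div[contains(@class, 'article-markdown')]/p[not(re:test(string(), '{bloat_pattern}'))]",
    namespaces={"re": "http://exslt.org/regular-expressions"},
)
print([p.text for p in selector(doc)])  # ['Ein echter Absatz.']
```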
