
Merge pull request #588 from dkm1006/krautreporter
Added Krautreporter
MaxDall authored Sep 3, 2024
2 parents 1571d6d + e4d79e6 commit 15865e0
Showing 8 changed files with 329 additions and 15 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
@@ -533,6 +533,21 @@
         </td>
         <td>&#160;</td>
     </tr>
+    <tr>
+        <td>
+            <code>Krautreporter</code>
+        </td>
+        <td>
+            <div>Krautreporter</div>
+        </td>
+        <td>
+            <a href="https://krautreporter.de/">
+                <span>krautreporter.de</span>
+            </a>
+        </td>
+        <td>&#160;</td>
+        <td>&#160;</td>
+    </tr>
     <tr>
         <td>
             <code>MitteldeutscheZeitung</code>
16 changes: 2 additions & 14 deletions src/fundus/parser/base_parser.py
@@ -1,7 +1,6 @@
 import functools
 import inspect
 import itertools
-import json
 import re
 from abc import ABC
 from copy import copy
@@ -23,12 +22,10 @@
 )

 import lxml.html
-import more_itertools
 from lxml.etree import XPath

 from fundus.logging import create_logger
-from fundus.parser.data import LinkedDataMapping
-from fundus.parser.utility import get_meta_content
+from fundus.parser.utility import get_ld_content, get_meta_content

 RegisteredFunctionT_co = TypeVar("RegisteredFunctionT_co", covariant=True, bound="RegisteredFunction")

@@ -159,7 +156,6 @@ class Precomputed:
 class BaseParser(ABC):
     VALID_UNTIL: date = date.today()
     precomputed: Precomputed
-    _ld_selector: XPath = XPath("//script[@type='application/ld+json']")

     def __init__(self):
         predicate: Callable[[object], bool] = lambda x: isinstance(x, RegisteredFunction)
@@ -192,15 +188,7 @@ def cache(self) -> Optional[Dict[str, Any]]:

     def _base_setup(self, html: str) -> None:
         doc = lxml.html.document_fromstring(html)
-        ld_nodes = self._ld_selector(doc)
-        lds = []
-        for node in ld_nodes:
-            try:
-                lds.append(json.loads(node.text_content()))
-            except json.JSONDecodeError as error:
-                logger.debug(f"Encountered {error!r} during LD parsing")
-        collapsed_lds = more_itertools.collapse(lds, base_type=dict)
-        self.precomputed = Precomputed(html, doc, get_meta_content(doc), LinkedDataMapping(collapsed_lds))
+        self.precomputed = Precomputed(html, doc, get_meta_content(doc), get_ld_content(doc))

     def parse(self, html: str, error_handling: Literal["suppress", "catch", "raise"] = "raise") -> Dict[str, Any]:
         # wipe existing precomputed
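To see the refactor end to end, here is a minimal sketch; the `MinimalParser` subclass and the sample HTML are invented for illustration, and only `_base_setup`, `precomputed`, and `get_value_by_key_path` come from the code in this commit:

```python
from fundus.parser.base_parser import BaseParser


class MinimalParser(BaseParser):  # hypothetical subclass, just for this demo
    pass


parser = MinimalParser()
parser._base_setup(
    "<html><head><script type='application/ld+json'>"
    '{"@type": "NewsArticle", "datePublished": "2024-09-03"}'
    "</script></head><body></body></html>"
)
# precomputed.ld is now the LinkedDataMapping built by utility.get_ld_content
print(parser.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
# 2024-09-03
```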
47 changes: 46 additions & 1 deletion src/fundus/parser/utility.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 import re
 from collections import defaultdict
 from copy import copy
@@ -24,7 +25,15 @@
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath

-from fundus.parser.data import ArticleBody, ArticleSection, TextSequence
+from fundus.logging import create_logger
+from fundus.parser.data import (
+    ArticleBody,
+    ArticleSection,
+    LinkedDataMapping,
+    TextSequence,
+)
+
+logger = create_logger(__name__)


 def normalize_whitespace(text: str) -> str:
@@ -142,6 +151,42 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     return ArticleBody(summary=summary, sections=sections)


+_ld_node_selector = XPath("//script[@type='application/ld+json']")
+_json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
+
+
+def get_ld_content(root: lxml.html.HtmlElement) -> LinkedDataMapping:
+    """Parse JSON-LD from HTML.
+
+    This function parses script tags of type ld+json.
+    In case the JSON is wrapped in a CDATA tag, it is stripped first.
+
+    Args:
+        root: The HTML document given as an lxml.html.HtmlElement.
+
+    Returns:
+        The JSON-LD data as a LinkedDataMapping.
+    """
+
+    def sanitize(text: str) -> Optional[str]:
+        # capture only content enclosed as follows: {...} or [{...}]
+        match = re.search(_json_pattern, text)
+        if match is not None and (sanitized := match.group("json")):
+            return sanitized
+        return None
+
+    ld_nodes = _ld_node_selector(root)
+    lds = []
+    for node in ld_nodes:
+        json_content = sanitize(node.text_content()) or ""
+        try:
+            lds.append(json.loads(json_content))
+        except json.JSONDecodeError as error:
+            logger.debug(f"Encountered {error!r} during LD parsing")
+    collapsed_lds = more_itertools.collapse(lds, base_type=dict)
+    return LinkedDataMapping(collapsed_lds)
+
+
 _meta_node_selector = CSSSelector("head > meta, body > meta")

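To illustrate the sanitizer: the regex captures only the outermost `{...}` object (or `[{...}]` array), so comment or CDATA wrappers around the payload are discarded before `json.loads` runs. A short sketch with an invented document; `get_ld_content` and `get_value_by_key_path` are taken from this commit:

```python
import lxml.html

from fundus.parser.utility import get_ld_content

# Invented example: the JSON-LD payload is wrapped in a CDATA comment.
html = (
    "<html><head>"
    '<script type="application/ld+json">'
    '/*<![CDATA[*/ {"@type": "NewsArticle", "headline": "Beispiel"} /*]]>*/'
    "</script>"
    "</head><body></body></html>"
)

ld = get_ld_content(lxml.html.document_fromstring(html))
print(ld.get_value_by_key_path(["NewsArticle", "headline"]))  # Beispiel
```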
16 changes: 16 additions & 0 deletions src/fundus/publishers/de/__init__.py
@@ -27,6 +27,7 @@
 from .hessenschau import HessenschauParser
 from .junge_welt import JungeWeltParser
 from .kicker import KickerParser
+from .krautreporter import KrautreporterParser
 from .mdr import MDRParser
 from .merkur import MerkurParser
 from .morgenpost_berlin import BerlinerMorgenpostParser
@@ -463,6 +464,21 @@ class DE(metaclass=PublisherGroup):
         url_filter=regex_filter("/slideshow|/video"),
     )

+    Krautreporter = Publisher(
+        name="Krautreporter",
+        domain="https://krautreporter.de/",
+        parser=KrautreporterParser,
+        sources=[
+            # NOTE: robots.txt mentions that it reserves the right of use for text & data mining (§ 44b UrhG),
+            # but this reservation is not machine-readable, which is required by law for it to be effective.
+            # NOTE: Unfortunately, sitemap.xml and news.xml are identical.
+            Sitemap("https://krautreporter.de/sitemap.xml", reverse=True),
+            # NewsMap("https://krautreporter.de/news.xml"),
+            RSSFeed("https://krautreporter.de/feeds.rss"),
+        ],
+        url_filter=regex_filter(r"/(pages|archiv|serien|thema|zusammenhaenge)/"),
+    )
+
     FrankfurterRundschau = Publisher(
         name="Frankfurter Rundschau",
         domain="https://www.fr.de",
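With the publisher registered, crawling should work through Fundus's usual entry point; a usage sketch assuming the top-level API from the project README (`Crawler`, `PublisherCollection`):

```python
from fundus import Crawler, PublisherCollection

# Crawl a single article from the newly added publisher.
crawler = Crawler(PublisherCollection.de.Krautreporter)
for article in crawler.crawl(max_articles=1):
    print(article.title)
    print(article.publishing_date)
```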
52 changes: 52 additions & 0 deletions src/fundus/publishers/de/krautreporter.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility
+
+
+class KrautreporterParser(ParserProxy):
+    class V1(BaseParser):
+        _bloat_pattern: str = (
+            "^Redaktion:|^Dieser Artikel ist eine Übersetzung|^Übersetzung:|^Recherche:|^Schlussredaktion:"
+        )
+
+        _summary_selector = CSSSelector("p[data-test='article-teaser']")
+        _subheadline_selector = CSSSelector("div.article-markdown > h2")
+        _paragraph_selector = XPath(
+            f"//div[contains(@class, 'article-markdown')] /p[not(re:test(string(), '{_bloat_pattern}'))]",
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
+
+        _topic_selector = XPath("string(//div[contains(@class, 'article-headers') and contains(@class, 'topic')])")
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def body(self) -> ArticleBody:
+            article_body = utility.extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+            return article_body
+
+        @attribute
+        def authors(self) -> List[str]:
+            author_string = self.precomputed.meta.get("author")
+            return utility.generic_author_parsing(author_string)
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime]:
+            key_path = ["NewsArticle", "datePublished"]
+            date_string = self.precomputed.ld.get_value_by_key_path(key_path)
+            return utility.generic_date_parsing(date_string)
+
+        @attribute
+        def topics(self) -> List[str]:
+            return utility.generic_topic_parsing(self._topic_selector(self.precomputed.doc))
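One detail worth noting in `_paragraph_selector`: `re:test()` is not plain XPath 1.0 but the EXSLT regular-expressions extension, which lxml supports once the `re` namespace is declared. A self-contained sketch with invented markup showing how the bloat pattern drops credit lines:

```python
from lxml import etree, html

# Invented article snippet: one real paragraph, one credit line.
doc = html.fromstring(
    "<div class='article-markdown'>"
    "<p>Ein echter Absatz.</p>"
    "<p>Redaktion: Jane Doe</p>"
    "</div>"
)

bloat_pattern = "^Redaktion:|^Schlussredaktion:"
selector = etree.XPath(
    f"//div[contains(@class, 'article-markdown')]/p[not(re:test(string(), '{bloat_pattern}'))]",
    namespaces={"re": "http://exslt.org/regular-expressions"},
)
print([p.text for p in selector(doc)])  # ['Ein echter Absatz.']
```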
