From a9f59ca2b5438800e1785001b75e393649ac4a85 Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Fri, 1 Mar 2024 21:25:27 +0100 Subject: [PATCH] Replace BeautifulSoup with PyQuery --- .github/workflows/django.yml | 3 +- Dockerfile | 3 - articles/migrations/0001_initial.py | 30 ++- ...options_alter_article_headline_and_more.py | 62 +++++ articles/models.py | 66 +++-- articles/tasks.py | 43 ++- articles/tests/conftest.py | 12 +- articles/tests/integration/__init__.py | 0 articles/tests/integration/test_tasks.py | 57 ++++ articles/tests/unit/__init__.py | 0 articles/tests/{ => unit}/test_models.py | 12 +- articles/tests/{ => unit}/test_views.py | 0 fixtures/languages.json | 9 - fixtures/pub_types.json | 9 - fixtures/sources.json | 178 ++++--------- fixtures/{sources2.json => sources_test.json} | 249 ++++++------------ nous_aggregator/settings/base.py | 4 +- requirements/base.in | 1 - requirements/base.txt | 4 - requirements/ci.in | 5 +- requirements/ci.txt | 16 +- requirements/dev.in | 6 +- requirements/dev.txt | 38 ++- scraper/__init__.py | 2 +- scraper/parser.py | 129 ++++----- scraper/spiders.py | 20 +- scraper/tests/__init__.py | 5 + scraper/tests/conftest.py | 26 +- scraper/tests/integration/test_spider.py | 22 +- scraper/tests/mocks.py | 9 +- scraper/tests/unit/test_parsers.py | 65 +++-- scraper/tests/unit/test_spider.py | 3 +- 32 files changed, 585 insertions(+), 503 deletions(-) create mode 100644 articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py create mode 100644 articles/tests/integration/__init__.py create mode 100644 articles/tests/integration/test_tasks.py create mode 100644 articles/tests/unit/__init__.py rename articles/tests/{ => unit}/test_models.py (72%) rename articles/tests/{ => unit}/test_views.py (100%) delete mode 100644 fixtures/languages.json delete mode 100644 fixtures/pub_types.json rename fixtures/{sources2.json => sources_test.json} (50%) diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml index ba38169..e79660d 100644 --- a/.github/workflows/django.yml +++ b/.github/workflows/django.yml @@ -57,7 +57,8 @@ jobs: SECRET_KEY: dummy DJANGO_ENV: BASE SECURE_SSL_REDIRECT: False - run: pytest articles/tests/ + run: pytest + # # Migrations # diff --git a/Dockerfile b/Dockerfile index bdb3b87..7f4ee03 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,9 +15,6 @@ RUN pip install pip -U COPY /requirements/* /app/requirements/ RUN pip install -r /app/requirements/dev.txt -# pyppeteer deps (https://stackoverflow.com/a/71935536) -# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt - # # Final diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py index 3a6252d..a589334 100644 --- a/articles/migrations/0001_initial.py +++ b/articles/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.1 on 2024-02-26 20:25 +# Generated by Django 5.0.1 on 2024-03-01 20:14 import django.db.models.deletion import django.db.models.functions.text @@ -63,7 +63,7 @@ class Migration(migrations.Migration): ( "paths", models.JSONField( - help_text="A list of resource paths where the scraper will look for articles" + help_text="List of resource paths where the scraper will look for articles" ), ), ( @@ -82,22 +82,34 @@ class Migration(migrations.Migration): ), ), ( - "headline_selectors", + "headline_search_params_find", models.JSONField( - help_text="Information about the structure of the target page needed to extract the headline of articles published by this 
source" + help_text="Selectors for extracting the headline of articles" ), ), ( - "summary_selectors", + "headline_search_params_remove", models.JSONField( - blank=True, - help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", - null=True, + help_text="Selectors for HTML elements that need to be removed from the headline" + ), + ), + ( + "summary_search_params_find", + models.JSONField( + default=[], + help_text="Selectors for extracting the summary of articles", + ), + ), + ( + "summary_search_params_remove", + models.JSONField( + default=[], + help_text="Selectors for HTML elements that need to be removed from the summary", ), ), ], options={ - "ordering": [django.db.models.functions.text.Lower("title")], + "ordering": [django.db.models.functions.text.Lower("name")], }, ), migrations.CreateModel( diff --git a/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py b/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py new file mode 100644 index 0000000..78824cc --- /dev/null +++ b/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py @@ -0,0 +1,62 @@ +# Generated by Django 5.0.1 on 2024-03-03 10:29 + +import django.db.models.functions.text +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("articles", "0001_initial"), + ] + + operations = [ + migrations.AlterModelOptions( + name="source", + options={"ordering": [django.db.models.functions.text.Lower("title")]}, + ), + migrations.AlterField( + model_name="article", + name="headline", + field=models.CharField( + help_text="The headline of the article", max_length=512 + ), + ), + migrations.AlterField( + model_name="article", + name="slug", + field=models.SlugField( + help_text="The slug of the article for SEO-friendly urls", + max_length=1024, + ), + ), + migrations.AlterField( + model_name="article", + name="url", + field=models.URLField( + help_text="The link to the article", max_length=512, unique=True + ), + ), + migrations.AlterField( + model_name="source", + name="summary_search_params_find", + field=models.JSONField( + default=str, + help_text="Selectors for extracting the summary of articles", + ), + ), + migrations.AlterField( + model_name="source", + name="summary_search_params_remove", + field=models.JSONField( + default=list, + help_text="Selectors for HTML elements that need to be removed from the summary", + ), + ), + migrations.AlterField( + model_name="source", + name="url", + field=models.URLField( + help_text="The url of the source", max_length=512, unique=True + ), + ), + ] diff --git a/articles/models.py b/articles/models.py index 9c4c00a..3fb56da 100644 --- a/articles/models.py +++ b/articles/models.py @@ -1,5 +1,4 @@ import regex -from django.core.validators import URLValidator from django.db import models from django.db.models.functions import Lower from django.utils.translation import gettext_lazy as _ @@ -14,14 +13,13 @@ class Article(models.Model): Fields: headline (models.TextField): headline of the article slug (models.SlugField): slug of the article (generated from headline) + summary (models.TextField): short paragraph summarizing the article created_at (models.DateTimeField): date the article was added to the database. Mostly corresponds to actual publication date, though this can vary. Actual dates are not used because their format varies a lot, hence they are difficult to parse. 
language (models.CharField): the language of the article url (models.URLField): link to the article - body (models.TextField): either the actual body of the article, - or a short desriptive paragraph Relations: source (ForeignKey): the source of the article @@ -31,13 +29,17 @@ class Article(models.Model): """ headline = models.CharField( - max_length=200, + max_length=512, help_text=_("The headline of the article"), ) slug = models.SlugField( - max_length=255, + max_length=1024, help_text=_("The slug of the article for SEO-friendly urls"), ) + summary = models.TextField( + blank=True, + help_text=_("A summary of the article"), + ) created_at = models.DateTimeField() language = models.CharField( max_length=4, @@ -46,14 +48,10 @@ class Article(models.Model): help_text=_("The language of the article"), ) url = models.URLField( - max_length=255, + max_length=512, unique=True, help_text=_("The link to the article"), ) - summary = models.TextField( - blank=True, - help_text=_("A summary of the article"), - ) source = models.ForeignKey( to="Source", on_delete=models.CASCADE, @@ -65,7 +63,7 @@ class Meta: ordering = ("-created_at",) indexes = [models.Index(fields=["headline", "url"])] - def __str__(self): + def __str__(self) -> str: return f"{self.source}: {self.headline}" @@ -74,7 +72,7 @@ class Source(models.Model): Metadata about the source of articles Fields: - title (models.CharField): name of the source + title (models.CharField): name/title of the source slug (models.SlugField): slug of the source publication_type (models.CharField): the type of publication of the source (newspaper, journal, blog...) @@ -122,15 +120,15 @@ class Source(models.Model): ) url = models.URLField( unique=True, - max_length=255, + max_length=512, help_text=_("The url of the source"), ) # - # info related to scraping + # data related to scraping # paths = models.JSONField( help_text=_( - "A list of resource paths where the scraper will look for articles" + "List of resource paths where the scraper will look for articles" ), ) regex = models.CharField( @@ -147,18 +145,26 @@ class Source(models.Model): "of JavaScript" ), ) - headline_selectors = models.JSONField( + headline_search_params_find = models.JSONField( help_text=_( - "Information about the structure of the target page needed to extract " - "the headline of articles published by this source" + "Selectors for extracting the headline of articles" ), ) - summary_selectors = models.JSONField( - null=True, - blank=True, + headline_search_params_remove = models.JSONField( + help_text=_( + "Selectors for HTML elements that need to be removed from the headline" + ), + ) + summary_search_params_find = models.JSONField( + default=str, + help_text=_( + "Selectors for extracting the summary of articles" + ), + ) + summary_search_params_remove = models.JSONField( + default=list, help_text=_( - "Information about the structure of the target page needed to extract " - "the summary of articles published by this source" + "Selectors for HTML elements that need to be removed from the summary" ), ) @@ -167,7 +173,7 @@ class Meta: Lower("title"), ] - def __str__(self): + def __str__(self) -> str: return f"{self.title}" def to_dict(self): @@ -177,7 +183,15 @@ def to_dict(self): "language": self.language, "javascript_required": self.javascript_required, "filter": regex.compile(self.regex), - "headline_selectors": self.headline_selectors, - "summary_selectors": self.summary_selectors, + "search_params": { + "headline": { + "find": self.headline_search_params_find, + "remove": 
self.headline_search_params_remove, + }, + "summary": { + "find": self.summary_search_params_find, + "remove": self.summary_search_params_remove, + }, + }, } return sitemap diff --git a/articles/tasks.py b/articles/tasks.py index 49bcbd2..f37f874 100644 --- a/articles/tasks.py +++ b/articles/tasks.py @@ -1,6 +1,8 @@ import json +import logging from celery import group, shared_task +from django.db.utils import DatabaseError from django.utils import timezone import scraper @@ -8,6 +10,8 @@ from .models import Article, Source +logger = logging.getLogger("__name__") + @shared_task def get_articles_for_source(source_title: str) -> None: @@ -21,17 +25,34 @@ def get_articles_for_source(source_title: str) -> None: spider.run() articles = [json.loads(article) for article in spider.articles] - Article.objects.bulk_create([ - Article( - headline=article_data["headline"], - slug=article_data["slug"], - source=Source.objects.get(url=article_data["source_link"]), - summary=article_data["summary"], - language=article_data["language"], - url=article_data["url"], - created_at=timezone.now(), - ) for article_data in articles - ], ignore_conflicts=True) + # try bulk create, revert to individual db saves in case of error + try: + Article.objects.bulk_create([ + Article( + headline=article_data["headline"], + slug=article_data["slug"], + source=Source.objects.get(url=article_data["source_link"]), + summary=article_data["summary"], + language=article_data["language"], + url=article_data["url"], + created_at=timezone.now(), + ) for article_data in articles + ], ignore_conflicts=True) + except DatabaseError as exc: + logger.error("Bulk create failed", exc_info=exc) + for article_data in articles: + try: + Article.objects.create( + headline=article_data["headline"], + slug=article_data["slug"], + source=Source.objects.get(url=article_data["source_link"]), + summary=article_data["summary"], + language=article_data["language"], + url=article_data["url"], + created_at=timezone.now(), + ) + except DatabaseError as exc: + logger.error("DB save failed for %s", article_data["url"], exc_info=exc) @shared_task diff --git a/articles/tests/conftest.py b/articles/tests/conftest.py index c80344c..f03b77b 100644 --- a/articles/tests/conftest.py +++ b/articles/tests/conftest.py @@ -20,8 +20,10 @@ def source_values(): "paths": ["world/"], "javascript_required": False, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_selectors": {"tag": "h1", "attrs": {}}, - "summary_selectors": {"tag": "h2", "attrs": {}}, + "headline_search_params_find": "h1", + "headline_search_params_remove": [], + "summary_search_params_find": "", + "summary_search_params_remove": [] } @@ -41,8 +43,10 @@ def source_values_2(): "paths": ["world/"], "javascript_required": False, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_selectors": {"tag": "h1", "attrs": {}}, - "summary_selectors": {"tag": "h2", "attrs": {}}, + "headline_search_params_find": "h1", + "headline_search_params_remove": [], + "summary_search_params_find": "", + "summary_search_params_remove": [] } diff --git a/articles/tests/integration/__init__.py b/articles/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/articles/tests/integration/test_tasks.py b/articles/tests/integration/test_tasks.py new file mode 100644 index 0000000..8ab1c1a --- /dev/null +++ b/articles/tests/integration/test_tasks.py @@ -0,0 +1,57 @@ +import pytest +from django.conf import settings +from django.core.management import call_command + +from articles.models import Article +from 
articles.tasks import get_articles_for_source +from scraper.tests import MockResponse, contents_aj, expected_aj + + +@pytest.fixture +def django_db_setup(django_db_blocker): + settings.DATABASES["default"] = { + "ENGINE": "django.db.backends.postgresql", + "HOST": "db.example.com", + "NAME": "test_db", + "ATOMIC_REQUESTS": True, + } + + with django_db_blocker.unblock(): + call_command("loaddata", "fixtures/sources_test.json") + + +@pytest.mark.usefixtures('celery_session_app') +@pytest.mark.usefixtures('celery_session_worker') +@pytest.mark.django_db +def test_get_articles_for_source(django_db_setup, contents_aj, expected_aj, mocker): + # + # setup + # + SOURCE_TITLE = "Al Jazeera" + + def return_value(*args, **kwargs): + for key, value in contents_aj.items(): + if args[0] == value["link"]: + return MockResponse(text=value["content"]) + + mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) + + # + # asserts + # + promise = get_articles_for_source.delay(SOURCE_TITLE) + promise.get() + + articles = Article.objects.all() + + assert len(articles) == 12 + + for expected_data in expected_aj.values(): + article = next( + (article for article in articles if article.headline == expected_data["headline"]) + ) + assert article.slug == expected_data["slug"] + assert article.summary == expected_data["summary"] + assert article.language == expected_data["language"] + assert article.url == expected_data["url"] + assert article.source.title == SOURCE_TITLE diff --git a/articles/tests/unit/__init__.py b/articles/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/articles/tests/test_models.py b/articles/tests/unit/test_models.py similarity index 72% rename from articles/tests/test_models.py rename to articles/tests/unit/test_models.py index 40563cf..3566282 100644 --- a/articles/tests/test_models.py +++ b/articles/tests/unit/test_models.py @@ -1,10 +1,10 @@ import regex # type: ignore -from ..models import Article, Source +from articles.models import Article, Source # -# Test Source +# Source # def test_create_source(source_values) -> None: source = Source(**source_values) @@ -18,8 +18,6 @@ def test_source_to_dict(source_values) -> None: sitemap = source.to_dict() for attr_name in [ - "summary_selectors", - "headline_selectors", "javascript_required", "language", "paths", @@ -28,6 +26,10 @@ def test_source_to_dict(source_values) -> None: assert source.url == sitemap["base_url"] assert regex.compile(source.regex) == sitemap["filter"] + assert sitemap["search_params"]["headline"]["find"] == source.headline_search_params_find + assert sitemap["search_params"]["headline"]["remove"] == source.headline_search_params_remove + assert sitemap["search_params"]["summary"]["find"] == source.summary_search_params_find + assert sitemap["search_params"]["summary"]["remove"] == source.summary_search_params_remove def test_source_str_representation(source_values) -> None: @@ -37,7 +39,7 @@ def test_source_str_representation(source_values) -> None: # -# Test Article +# Article # def test_create_article(article_values_m) -> None: article = Article(**article_values_m) diff --git a/articles/tests/test_views.py b/articles/tests/unit/test_views.py similarity index 100% rename from articles/tests/test_views.py rename to articles/tests/unit/test_views.py diff --git a/fixtures/languages.json b/fixtures/languages.json deleted file mode 100644 index 14d1ce8..0000000 --- a/fixtures/languages.json +++ /dev/null @@ -1,9 +0,0 @@ -[ -{ - "model": "articles.language", - "pk": 1, - "fields": { - 
"name": "en" - } -} -] diff --git a/fixtures/pub_types.json b/fixtures/pub_types.json deleted file mode 100644 index 57b8cf5..0000000 --- a/fixtures/pub_types.json +++ /dev/null @@ -1,9 +0,0 @@ -[ -{ - "model": "articles.publicationtype", - "pk": 1, - "fields": { - "name": "newspaper/journal" - } -} -] diff --git a/fixtures/sources.json b/fixtures/sources.json index f7f22f9..004c096 100644 --- a/fixtures/sources.json +++ b/fixtures/sources.json @@ -12,17 +12,11 @@ "news/" ], "javascript_required": false, - "regex": "/[0-9]{4}/[0-9]+/[0-9]+/(?!.*terms-and-conditions/|.*community-rules-guidelines/|.*eu-eea-regulatory|.*code-of-ethics)", - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": { - "class": "article__subhead" - } - } + "regex": "(? .story-two"], + "summary_search_params_remove": ["aside", "div"] } }, { @@ -92,14 +72,10 @@ ], "javascript_required": false, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": {} - } + "headline_search_params_find": ["h1"], + "headline_search_params_remove": [], + "summary_search_params_find": [".entry-content"], + "summary_search_params_remove": ["div"] } }, { @@ -118,18 +94,10 @@ ], "javascript_required": false, "regex": "[0-9]{4}/[0-9]{2}/", - "headline_selectors": { - "tag": "h1", - "attrs": { - "class": "title" - } - }, - "summary_selectors": { - "tag": "div", - "attrs": { - "class": "tagline" - } - } + "headline_search_params_find": [".title > span"], + "headline_search_params_remove": [], + "summary_search_params_find": [".details .tagline"], + "summary_search_params_remove": [] } }, { @@ -146,18 +114,10 @@ ], "javascript_required": false, "regex": "(? 
.story-two"], + "summary_search_params_remove": ["aside", "div"] } }, { "model": "articles.source", "pk": 4, "fields": { - "name": "Consortium News", - "slug": "", + "title": "Consortium News", + "url": "https://consortiumnews.com/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://consortiumnews.com/", "paths": [ "" ], - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", "javascript_required": false, - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": {} - } + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", + "headline_search_params_find": ["h1"], + "headline_search_params_remove": [], + "summary_search_params_find": [".entry-content"], + "summary_search_params_remove": ["div"] } }, { "model": "articles.source", "pk": 5, "fields": { - "name": "Current Affairs", - "slug": "", + "title": "Current Affairs", + "url": "https://www.currentaffairs.org/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://www.currentaffairs.org/", "paths": [ "category/politics/", "category/economics/", "category/interviews/", "category/history/" ], - "regex": "[0-9]{4}/[0-9]{2}/", "javascript_required": false, - "headline_selectors": { - "tag": "h1", - "attrs": { - "class": "title" - } - }, - "summary_selectors": { - "tag": "div", - "attrs": { - "class": "tagline" - } - } + "regex": "[0-9]{4}/[0-9]{2}/", + "headline_search_params_find": [".title > span"], + "headline_search_params_remove": [], + "summary_search_params_find": [".details .tagline"], + "summary_search_params_remove": [] } }, { "model": "articles.source", "pk": 6, "fields": { - "name": "New York Times", - "slug": "", + "title": "New York Times", + "url": "https://www.nytimes.com/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://www.nytimes.com/", "paths": [ "section/world/", "section/business/" ], - "regex": "(? 
Generator[str, None, No yield urllib.parse.urljoin(sitemap["base_url"], link) -def find_headline(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]: - """Use `sitemap` to extract headline from article""" +def find_headline(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: + """ Use `doc` + `sitemap` to extract headline from article at `url` """ + + search_params = sitemap["search_params"]["headline"] + + if not search_params.get("find", ""): + logger.warning("No search params for headline of %s", url) + return None + + for param in search_params["find"]: + if headline_doc := doc.find(param): + break + + if not headline_doc: + return None + + for item in search_params.get("remove", []): + headline_doc(f"{item}").remove() try: - headline = soup.find( - sitemap["headline_selectors"]["tag"], - attrs=sitemap["headline_selectors"]["attrs"], - ) - except KeyError as e: - logger.error("KeyError (%s) for headline of %s", e, url) - raise KeyError from e # Abort job after logging error - if headline is None: + headline_text = headline_doc.text().strip() + except AttributeError: return None - headline_text = headline.get_text().strip() + return headline_text -def find_summary(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]: - """Use `parser` & `sitemap` to extract summary from article""" +def find_summary(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: + """ Use `doc` + `sitemap` to extract summary from article at `url` """ - if not sitemap["summary_selectors"]: + search_params = sitemap["search_params"]["summary"] + + if not search_params.get("find", ""): + logger.warning("No search params for summary of %s", url) return None + for param in search_params["find"]: + if summary_doc := doc.find(param): + break + + if not summary_doc: + return None + + for item in search_params.get("remove", []): + summary_doc(f"{item}").remove() + try: - summary = soup.find( - sitemap["summary_selectors"]["tag"], - attrs=sitemap["summary_selectors"]["attrs"], - ) - except KeyError as e: - logger.error("KeyError (%s) for summary of %s", e, url) - summary = None # Continue job, set `summary` to avoid UnboundLocalError - if summary is None: - logger.warning("Missing summary for %s", url) + summary_text = summary_doc.text().strip() + except AttributeError: return None - summary_text = summary.get_text().strip() - return summary_text + return summary_text[:1000] -def find_language(soup: BeautifulSoup, url: str) -> Optional[str]: - """Detect the language of the page at `url`.""" - if (body := soup.body) is None or (text := body.get_text()) is None: - return None +def find_language(summary: str, headline: str, doc: PyQuery, url: str) -> Optional[str]: + """ Detect the language of the page at `url` """ - language = langdetect.detect(text) - if language is None: - logger.warning("Missing language for %s", url) - return language + for item in (summary, headline, doc.text()): + if item: + try: + language = langdetect.detect(item) + except LangDetectException as exc: + logger.warning( + "Failed to detect language for %s", url, exc_info=exc + ) + else: + return language + return None def parse(html: str, sitemap: dict, url: str) -> Optional[str]: - """ - Parse the `html` at `url` with lxml, use html.parser as a fallback, - return data as JSON. If html.parser fails to return a headline, return `None` - (every article must have a headline); if the language does not match the one - specified in `sitemap`, return `None`. 
- """ - - # try first parser - parser = "lxml" - soup = BeautifulSoup(html, parser) - headline = find_headline(soup, sitemap, url) - # try second parser - if headline is None: - parser = "html.parser" - soup = BeautifulSoup(html, parser) - headline = find_headline(soup, sitemap, url) + doc = PyQuery(html) + + headline = find_headline(doc, sitemap=sitemap, url=url) if headline is None: logger.warning("No headline for %s", url) return None - language = find_language(soup, url) - if not language == sitemap["language"]: - logger.warning( - "Language of article (%s) + source (%s) do not match: %s", - language, - sitemap["language"], - url, - ) - return None - summary = find_summary(soup, sitemap, url) + + summary = find_summary(doc, sitemap=sitemap, url=url) + if summary is None: + logger.warning("Missing summary for %s", url) + summary = "No description" + + language = find_language(summary, headline, doc, url) + if language is None: + language = sitemap["language"] + article = { "headline": headline, "slug": slugify(headline), - "summary": summary if summary else "No description", + "summary": summary, "language": language, "url": url, "source_link": sitemap["base_url"], diff --git a/scraper/spiders.py b/scraper/spiders.py index dd36ed2..ccb766f 100644 --- a/scraper/spiders.py +++ b/scraper/spiders.py @@ -2,7 +2,7 @@ import logging import random -import aiohttp # pyre-ignore +import aiohttp from aiohttp import ClientSession from aiohttp.web_exceptions import HTTPError @@ -33,7 +33,7 @@ def __init__(self, starting_urls: list[str], sitemap: dict) -> None: self.links: set[str] = set() self.articles: set[str] = set() - async def connect(self, session: ClientSession, url: str) -> str | None: # pyre-ignore + async def connect(self, session: ClientSession, url: str) -> str | None: headers = random.choice(self.headers) try: @@ -44,7 +44,7 @@ async def connect(self, session: ClientSession, url: str) -> str | None: # pyre return None return html - async def get_links(self, session: ClientSession, url: str) -> list[str] | None: + async def get_links(self, session: ClientSession, url: str) -> None: html = await self.connect(session=session, url=url) if not html: return None @@ -52,7 +52,9 @@ async def get_links(self, session: ClientSession, url: str) -> list[str] | None: for link in parser.generate_filtered_links(html=html, sitemap=self.sitemap): self.links.add(link) - async def scrape(self, session: ClientSession, link: str) -> str | None: + return None + + async def scrape(self, session: ClientSession, link: str) -> None: html = await self.connect(session=session, url=link) if not html: return None @@ -63,6 +65,8 @@ async def scrape(self, session: ClientSession, link: str) -> str | None: self.articles.add(article) + return None + async def collect_links(self, session: ClientSession, starting_urls: list[str]) -> None: coros = (self.get_links(session, url) for url in starting_urls) await asyncio.gather(*coros) @@ -78,6 +82,10 @@ async def main(self) -> None: await self.collect_links(session, self.starting_urls) await self.collect_metadata(session, self.links) - def run(self): - loop = asyncio.get_event_loop() + def run(self) -> None: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) loop.run_until_complete(self.main()) diff --git a/scraper/tests/__init__.py b/scraper/tests/__init__.py index e69de29..d20bc93 100644 --- a/scraper/tests/__init__.py +++ b/scraper/tests/__init__.py @@ -0,0 +1,5 @@ +from .conftest import 
expected_aj, sitemap_aj +from .integration.test_spider import contents_aj +from .mocks import MockResponse + +__all__ = ["contents_aj", "expected_aj", "MockResponse", "sitemap_aj"] diff --git a/scraper/tests/conftest.py b/scraper/tests/conftest.py index eb13753..39d68bf 100644 --- a/scraper/tests/conftest.py +++ b/scraper/tests/conftest.py @@ -14,8 +14,16 @@ def sitemap_aj(): 'filter': regex.Regex( '(? Dict[str, Dict[str, str]]: "asian_cup": { 'headline': 'Asian Cup final brings FIFA World Cup frenzy back to Qatar’s Souq Waqif', 'slug': 'asian-cup-final-brings-fifa-world-cup-frenzy-back-to-qatars-souq-waqif', - 'summary': "Excitement for the Asian Cup football final is reaching fever pitch " - "as al-Annabi take on an-Nashama at Lusail Stadium.", + "summary": ( + "Excitement for the Asian Cup football final is reaching fever pitch as al-Annabi take on an-Nashama at Lusail Stadium. Save articles to read later and create your own reading list. Doha, Qatar – On Friday nights, Souq Waqif – Qatar’s old-style all-purpose market that also serves as the country’s central tourist attraction – brings together people from all walks of life, dozens of different nationalities and varying interests for a unique mix of colour and noise. But when the country plays host to a football tournament – be it the world’s biggest sporting event such as the FIFA World Cup or a regional championship – the excitement reaches a fever pitch. On the eve of the final of the ongoing AFC Asian Cup 2023, the famous marketplace in the heart of Doha was the marching ground of football fans of both teams vying for the continental crown in Saturday’s final at Lusail Stadium. Passionate supporters of an-Nashama – the gentlemen, as Jordan’s football team is lovingly known – gathered in" + ), 'language': 'en', - "url": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" - "jordan-ahead-of-historic-asian-cup-final", + "url": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", 'source_link': 'https://www.aljazeera.com/' }, "indonesia": { "headline": "Big election rallies in Indonesia on final day of campaign", "slug": "big-election-rallies-in-indonesia-on-final-day-of-campaign", "summary": ( - "Hundreds of thousands of supporters of " - "presidential contenders pack rallies in capital Jakarta and other cities." + "Tens of thousands of supporters of Indonesia’s presidential candidates have poured onto its streets as they hold final campaigns before heading to the polls in the world’s biggest single-day election. The contenders to lead the world’s third-largest democracy are popular former governors, Ganjar Pranowo and Anies Baswedan, and ex-special forces commander Prabowo Subianto, who has soared in opinion polls with the tacit backing of the president, and with the incumbent’s son as his running mate. The elections on Wednesday will elect a new president and vice president, in addition to parliamentary and local representatives. High-schooler Alfiatnan, 18, said she would vote for Subianto because this was his third attempt at the presidency. “I think there’s no harm [in] giving opportunity to someone who is trying. His optimistic spirit influenced me to choose him.” Also in the running is Baswedan, the former governor of Jakarta who is running as an independent candidate. 
The 54-year-old was e" ), "language": "en", "url": "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-" @@ -53,8 +60,7 @@ def expected_aj() -> Dict[str, Dict[str, str]]: "headline": "How Taiwan’s elections challenge the power of China’s Communist Party", "slug": "how-taiwans-elections-challenge-the-power-of-chinas-communist-party", "summary": ( - "Elections in Taiwan highlight dissatisfaction in China with a political " - "system that Beijing says works best for Chinese people." + "Elections in Taiwan highlight dissatisfaction in China with a political system that Beijing says works best for Chinese people. Save articles to read later and create your own reading list. If free and fair national elections are considered the hallmark of a democratic state, Taiwan has much to boast about. In January, the self-ruled island held its eighth presidential election concurrently with a parliamentary vote. Just 160km (100 miles) away on the other side of the narrow Taiwan Strait, the Communist Party of China (CPC) has ruled China since 1949, and though the party often claims that it governs a democratic state, there is no electoral process comparable with Taiwan’s. China’s President Xi Jinping has referred to “whole-process people’s democracy” to describe the Chinese political system where the “people are the masters” but the party-state apparatus runs the people’s affairs on their behalf. Ken Cai*, a 35-year-old entrepreneur from Shanghai, does not subscribe to Xi’s definit" ), "language": "en", "url": "https://www.aljazeera.com/news/2024/2/10/how-taiwans-elections-" diff --git a/scraper/tests/integration/test_spider.py b/scraper/tests/integration/test_spider.py index 3e3ca05..1036e06 100644 --- a/scraper/tests/integration/test_spider.py +++ b/scraper/tests/integration/test_spider.py @@ -29,25 +29,27 @@ def source(): paths=["news/"], regex="(? 
Dict[str, Dict[str, Any]]: +def contents_aj(): contents = { "_start": { "link": "https://www.aljazeera.com/news/", "content": read_file(directory=FILES_DIR, file_name="_start.html"), }, "asian_cup": { - "link": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" - "jordan-ahead-of-historic-asian-cup-final", + "link": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", "content": read_file(directory=FILES_DIR, file_name="asian_cup.html"), }, "football": { - "link": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", + "link": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" + "jordan-ahead-of-historic-asian-cup-final", "content": read_file(directory=FILES_DIR, file_name="football.html"), }, "footprints": { @@ -108,14 +110,14 @@ def contents() -> Dict[str, Dict[str, Any]]: # tests # @pytest.mark.django_db -def test_run_spider(source, contents, expected_aj, mocker) -> None: +def test_run_spider(source, contents_aj, expected_aj, mocker) -> None: # # setup # def return_value(*args, **kwargs): - for k, v in contents.items(): - if args[0] == v["link"]: - return MockResponse(text=v["content"]) + for key, value in contents_aj.items(): + if args[0] == value["link"]: + return MockResponse(text=value["content"]) mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) diff --git a/scraper/tests/mocks.py b/scraper/tests/mocks.py index 4ad6cd6..bc48fbf 100644 --- a/scraper/tests/mocks.py +++ b/scraper/tests/mocks.py @@ -1,6 +1,9 @@ +from typing import Optional + + class MockResponse: - def __init__(self, status_code: int | None = None, text: str = ""): - self.status_code = status_code or 200 + def __init__(self, status_code: Optional[int] = 200, text: str = ""): + self.status_code = status_code self._text = text async def text(self): @@ -9,5 +12,5 @@ async def text(self): async def __aenter__(self): return self - async def __aexit__(self, exc_type, exc, tb): + async def __aexit__(self, exc_type, exc_value, exc_traceback): pass diff --git a/scraper/tests/unit/test_parsers.py b/scraper/tests/unit/test_parsers.py index da7e008..a9805df 100644 --- a/scraper/tests/unit/test_parsers.py +++ b/scraper/tests/unit/test_parsers.py @@ -1,7 +1,8 @@ import json from pathlib import Path -from bs4 import BeautifulSoup +import pytest +from pyquery import PyQuery from scraper.parser import find_headline, find_language, find_summary, parse @@ -9,41 +10,61 @@ FILES_DIR: Path = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj" -PAGE = "indonesia" -URL = "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-indonesia-on-final-day-of-campaign" +URL_ASIAN_CUP = "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif" +URL_INDONESIA = "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-indonesia-on-final-day-of-campaign" +URL_TAIWAN = "https://www.aljazeera.com/news/2024/2/10/how-taiwans-elections-challenge-the-power-of-chinas-communist-party" -def test_find_headline(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - headline_text = find_headline(soup, sitemap_aj, url="dummy") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_headline(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert headline_text == expected_aj[f"{PAGE}"]["headline"] + headline_text = find_headline(doc, sitemap_aj, 
url="dummy") + assert headline_text == expected_aj[f"{page}"]["headline"] -def test_find_summary(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - summary = find_summary(soup, sitemap_aj, url="https://www.example.com") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_summary(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert summary == expected_aj[f"{PAGE}"]["summary"] + summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + assert summary + assert summary == expected_aj[f"{page}"]["summary"] -def test_find_language(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - lang = find_language(soup, url="dummy") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_language(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert lang == expected_aj[f"{PAGE}"]["language"] + headline = find_headline(doc, sitemap_aj, url="dummy") + summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + assert headline + assert summary -def test_parse(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") + lang = find_language(summary, headline, doc, url="dummy") + + assert lang == expected_aj[f"{page}"]["language"] + + +@pytest.mark.parametrize( + "page, url", + [("asian_cup", URL_ASIAN_CUP), ("indonesia", URL_INDONESIA), ("taiwan", URL_TAIWAN)] +) +def test_parse(sitemap_aj, expected_aj, page, url) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + + json_data = parse(html, sitemap_aj, url=url) + + assert json_data - json_data = parse(html, sitemap_aj, url=URL) data = json.loads(json_data) - assert data == expected_aj[f"{PAGE}"] + assert data + assert data == expected_aj[f"{page}"] diff --git a/scraper/tests/unit/test_spider.py b/scraper/tests/unit/test_spider.py index d407c91..d56df05 100644 --- a/scraper/tests/unit/test_spider.py +++ b/scraper/tests/unit/test_spider.py @@ -71,7 +71,8 @@ def return_value(*args, **kwargs): # # asserts # - await spider.collect_metadata(aiohttp.ClientSession(), spider.links) + async with aiohttp.ClientSession() as session: + await spider.collect_metadata(session, spider.links) articles = [json.loads(article) for article in spider.articles]