diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml index 76afdd7..e122cb7 100644 --- a/.github/workflows/django.yml +++ b/.github/workflows/django.yml @@ -58,10 +58,7 @@ jobs: SECRET_KEY: dummy DJANGO_ENV: BASE SECURE_SSL_REDIRECT: False - run: | - pytest src/articles/tests/unit/ - pytest src/articles/tests/integration/ - pytest src/scraper/tests/ + run: pytest # # Migrations diff --git a/config/scraper.py b/config/scraper.py deleted file mode 100644 index 59a2395..0000000 --- a/config/scraper.py +++ /dev/null @@ -1,22 +0,0 @@ -# TODO: get titles from Source (check lang + presence of SiteMap) -tasks = { - "magazines": { - "en": { - "schedule": 1, - "titles": [ - #"Al Jazeera", - # "Associated Press", - # "Christian Science Monitor", - # "Consortium News", - # "Current Affairs", - "New York Times", - # "NPR", - # "Reuters", - # "The Atlantic", - # "The Intercept", - # "UPI", - # "Wall Street Journal", - ] - }, - }, -} diff --git a/config/settings/__init__.py b/config/settings/__init__.py index a06216a..640c2cd 100644 --- a/config/settings/__init__.py +++ b/config/settings/__init__.py @@ -1,8 +1,5 @@ """ -Settings are loaded depending on the value of the DJANGO_ENV environment variable, - -On the production server, DJANGO_ENV should be left undefined -(hence the production settings are loaded by default). +Settings are loaded depending on the DJANGO_ENV environment variable, """ from decouple import config diff --git a/config/settings/base.py b/config/settings/base.py index 4505c5b..01b597c 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -5,7 +5,7 @@ from decouple import Csv, config -from .. import scraper +from .. import tasks BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -189,11 +189,21 @@ CELERY_BROKER_URL = config("CELERY_BROKER_URL", "redis://localhost:6379") CELERY_RESULT_BACKEND = config("CELERY_RESULT_BACKEND", "redis://localhost:6379") CELERY_BEAT_SCHEDULE = { - "get_articles_en": { + "scrape_articles_en": { "task": "articles.tasks.get_articles", - "schedule": scraper.tasks["magazines"]["en"]["schedule"], + "schedule": tasks.scrape["articles"]["en"]["schedule"], "kwargs": { "language": "en", + "titles": tasks.scrape["articles"]["en"]["titles"], + } + }, + "get_articles_from_feed_en": { + "task": "articles.tasks.get_articles", + "schedule": tasks.feed["articles"]["en"]["schedule"], + "kwargs": { + "language": "en", + "titles": tasks.feed["articles"]["en"]["titles"], + "time_delta": tasks.feed["articles"]["en"]["schedule"], } } } diff --git a/config/tasks.py b/config/tasks.py new file mode 100644 index 0000000..80b6b65 --- /dev/null +++ b/config/tasks.py @@ -0,0 +1,30 @@ +scrape = { + "articles": { + "en": { + "schedule": 3, # minutes + "titles": [ + "Al Jazeera", + "Associated Press", + "Consortium News", + "Current Affairs", + "NPR", + "Reuters", + "The Atlantic", + "UPI", + ] + }, + }, +} +feed = { + "articles": { + "en": { + "schedule": 3, # minutes + "titles": [ + "Christian Science Monitor", + "New York Times", + "The Guardian", + "The Intercept", + ] + }, + }, +} diff --git a/fixtures/feeds.json b/fixtures/feeds.json new file mode 100644 index 0000000..a9d493d --- /dev/null +++ b/fixtures/feeds.json @@ -0,0 +1,34 @@ +[ +{ + "model": "articles.feed", + "pk": 1, + "fields": { + "source": 3, + "url": "https://rss.csmonitor.com/feeds/world" + } +}, +{ + "model": "articles.feed", + "pk": 2, + "fields": { + "source": 6, + "url": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml" + } +}, +{ + "model": "articles.feed", + 
"pk": 3, + "fields": { + "source": 10, + "url": "https://theintercept.com/feed/?lang=en" + } +}, +{ + "model": "articles.feed", + "pk": 4, + "fields": { + "source": 13, + "url": "https://www.theguardian.com/world/rss" + } +} +] diff --git a/fixtures/sources_test.json b/fixtures/legacy_sources.json similarity index 96% rename from fixtures/sources_test.json rename to fixtures/legacy_sources.json index df442b9..004c096 100644 --- a/fixtures/sources_test.json +++ b/fixtures/legacy_sources.json @@ -1,241 +1,241 @@ -[ -{ - "model": "articles.source", - "pk": 1, - "fields": { - "title": "Al Jazeera", - "slug": "al-jazeera", - "url": "https://www.aljazeera.com/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "news/" - ], - "javascript_required": false, - "regex": "(? .story-two"], - "summary_search_params_remove": ["aside", "div"] - } -}, -{ - "model": "articles.source", - "pk": 4, - "fields": { - "title": "Consortium News", - "url": "https://consortiumnews.com/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "" - ], - "javascript_required": false, - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", - "headline_search_params_find": ["h1"], - "headline_search_params_remove": [], - "summary_search_params_find": [".entry-content"], - "summary_search_params_remove": ["div"] - } -}, -{ - "model": "articles.source", - "pk": 5, - "fields": { - "title": "Current Affairs", - "url": "https://www.currentaffairs.org/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "category/politics/", - "category/economics/", - "category/interviews/", - "category/history/" - ], - "javascript_required": false, - "regex": "[0-9]{4}/[0-9]{2}/", - "headline_search_params_find": [".title > span"], - "headline_search_params_remove": [], - "summary_search_params_find": [".details .tagline"], - "summary_search_params_remove": [] - } -}, -{ - "model": "articles.source", - "pk": 6, - "fields": { - "title": "New York Times", - "url": "https://www.nytimes.com/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "section/world/", - "section/business/" - ], - "javascript_required": false, - "regex": "(? 
.story-two"], + "summary_search_params_remove": ["aside", "div"] + } +}, +{ + "model": "articles.source", + "pk": 4, + "fields": { + "title": "Consortium News", + "url": "https://consortiumnews.com/", + "publication_type": "newspaper/journal", + "language": "en", + "paths": [ + "" + ], + "javascript_required": false, + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", + "headline_search_params_find": ["h1"], + "headline_search_params_remove": [], + "summary_search_params_find": [".entry-content"], + "summary_search_params_remove": ["div"] + } +}, +{ + "model": "articles.source", + "pk": 5, + "fields": { + "title": "Current Affairs", + "url": "https://www.currentaffairs.org/", + "publication_type": "newspaper/journal", + "language": "en", + "paths": [ + "category/politics/", + "category/economics/", + "category/interviews/", + "category/history/" + ], + "javascript_required": false, + "regex": "[0-9]{4}/[0-9]{2}/", + "headline_search_params_find": [".title > span"], + "headline_search_params_remove": [], + "summary_search_params_find": [".details .tagline"], + "summary_search_params_remove": [] + } +}, +{ + "model": "articles.source", + "pk": 6, + "fields": { + "title": "New York Times", + "url": "https://www.nytimes.com/", + "publication_type": "newspaper/journal", + "language": "en", + "paths": [ + "section/world/", + "section/business/" + ], + "javascript_required": false, + "regex": "(? .story-two" + ], + "description_search_params_remove": [ + "aside", + "div" + ] + } +}, +{ + "model": "articles.sitemap", + "pk": 4, + "fields": { + "source": 4, + "paths": [ + "" + ], + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", + "javascript_required": false, + "title_search_params_find": [ + "h1" + ], + "title_search_params_remove": null, + "description_search_params_find": [ + ".entry-content" + ], + "description_search_params_remove": [ + "div" + ] + } +}, +{ + "model": "articles.sitemap", + "pk": 5, + "fields": { + "source": 5, + "paths": [ + "category/politics/", + "category/economics/", + "category/interviews/", + "category/history/" + ], + "regex": "[0-9]{4}/[0-9]{2}/", + "javascript_required": false, + "title_search_params_find": [ + ".title > span" + ], + "title_search_params_remove": null, + "description_search_params_find": [ + ".details .tagline" + ], + "description_search_params_remove": null + } +}, +{ + "model": "articles.sitemap", + "pk": 6, + "fields": { + "source": 6, + "paths": [ + "section/world/", + "section/business/" + ], + "regex": "(? 
.story-two"], - "summary_search_params_remove": ["aside", "div"] - } -}, -{ - "model": "articles.source", - "pk": 4, - "fields": { - "title": "Consortium News", - "url": "https://consortiumnews.com/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "" - ], - "javascript_required": false, - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", - "headline_search_params_find": ["h1"], - "headline_search_params_remove": [], - "summary_search_params_find": [".entry-content"], - "summary_search_params_remove": ["div"] - } -}, -{ - "model": "articles.source", - "pk": 5, - "fields": { - "title": "Current Affairs", - "url": "https://www.currentaffairs.org/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "category/politics/", - "category/economics/", - "category/interviews/", - "category/history/" - ], - "javascript_required": false, - "regex": "[0-9]{4}/[0-9]{2}/", - "headline_search_params_find": [".title > span"], - "headline_search_params_remove": [], - "summary_search_params_find": [".details .tagline"], - "summary_search_params_remove": [] - } -}, -{ - "model": "articles.source", - "pk": 6, - "fields": { - "title": "New York Times", - "url": "https://www.nytimes.com/", - "publication_type": "newspaper/journal", - "language": "en", - "paths": [ - "section/world/", - "section/business/" - ], - "javascript_required": false, - "regex": "(? str: - return f"{self.source}: {self.headline}" + return f"{self.source}: {self.title}" class Source(models.Model): @@ -78,16 +78,6 @@ class Source(models.Model): source (newspaper, journal, blog...) language (models.CharField): the language of the source url (models.URLField): the base url of the source - paths (models.JSONField): a list of paths, each of which is appended to - the base url to tell the scraper where to look for links - ('https://example.com/path1/') - regex (models.CharField): a regular expression for filtering links - javascript_required (models.BooleanField): True if JavaScript must be rendered - before data can be extracted from the webpage, False otherwise - headline_selectors (models.JSONField): information about the CSS selectors - needed to extract the headline of an article - summary_selectors (models.JSONField): information about the CSS selectors - needed to extract the summary of an article Relations: sitemap (models.OneToOneField): information about the HTML/CSS structure @@ -115,7 +105,7 @@ class Source(models.Model): help_text=_("The type of publication of the source"), ) language = models.CharField( - max_length=4, + max_length=16, choices=Language.choices, blank=True, help_text=_("The language of the article"), @@ -125,47 +115,6 @@ class Source(models.Model): max_length=512, help_text=_("The url of the source"), ) - paths = models.JSONField( - help_text=_( - "List of resource paths where the scraper will look for articles" - ), - ) - regex = models.CharField( - max_length=255, - blank=True, - help_text=( - "Regular expression for filtering hyper-links found at the resource paths" - ), - ) - javascript_required = models.BooleanField( - default=False, - help_text=_( - "Whether the parsing of articles by this source requires rendering " - "of JavaScript" - ), - ) - headline_search_params_find = models.JSONField( - help_text=_( - "Selectors for extracting the headline of articles" - ), - ) - headline_search_params_remove = models.JSONField( - help_text=_( - "Selectors for HTML 
elements that need to be removed from the headline" - ), - ) - summary_search_params_find = models.JSONField( - default=str, - help_text=_( - "Selectors for extracting the summary of articles" - ), - ) - summary_search_params_remove = models.JSONField( - default=list, - help_text=_( - "Selectors for HTML elements that need to be removed from the summary" - ), - ) class Meta: ordering = [ @@ -177,6 +126,8 @@ def __str__(self) -> str: class Sitemap(models.Model): + """Information for scraping source""" + source = models.OneToOneField( to=Source, on_delete=models.CASCADE, @@ -200,24 +151,28 @@ class Sitemap(models.Model): "of JavaScript" ), ) - headline_search_params_find = models.JSONField( + title_search_params_find = models.JSONField( + default=str, help_text=_( "Selectors for extracting the headline of articles" ), ) - headline_search_params_remove = models.JSONField( + title_search_params_remove = models.JSONField( + null=True, + blank=True, help_text=_( "Selectors for HTML elements that need to be removed from the headline" ), ) - summary_search_params_find = models.JSONField( + description_search_params_find = models.JSONField( default=str, help_text=_( "Selectors for extracting the summary of articles" ), ) - summary_search_params_remove = models.JSONField( - default=list, + description_search_params_remove = models.JSONField( + null=True, + blank=True, help_text=_( "Selectors for HTML elements that need to be removed from the summary" ), @@ -231,42 +186,26 @@ def to_dict(self): "javascript_required": self.javascript_required, "filter": regex.compile(self.regex), "search_params": { - "headline": { - "find": self.headline_search_params_find, - "remove": self.headline_search_params_remove, + "title": { + "find": self.title_search_params_find, + "remove": self.title_search_params_remove, }, - "summary": { - "find": self.summary_search_params_find, - "remove": self.summary_search_params_remove, + "description": { + "find": self.description_search_params_find, + "remove": self.description_search_params_remove, }, }, } -class RSSFeed(models.Model): +class Feed(models.Model): source = models.ForeignKey( to=Source, on_delete=models.CASCADE, - related_name="rss_feeds", - help_text=_("Map with information for retrieving RSS feed"), + related_name="feeds", ) url = models.URLField( unique=True, max_length=512, help_text=_("The url of the RSS feed"), ) - use_guid = models.BooleanField( - default=False, - help_text=_("Use guid field for retrieving link (default: 'link')") - ) - time_format = models.CharField( - max_length=64, - help_text=_("Format string for converting str to datetime") - ) - - def to_dict(self): - return { - "url": self.url, - "use_guid": self.use_guid, - "time_format": self.time_format, - } diff --git a/src/articles/tasks.py b/src/articles/tasks.py index 22b068f..a5faf9b 100644 --- a/src/articles/tasks.py +++ b/src/articles/tasks.py @@ -1,65 +1,98 @@ import json import logging +from typing import Optional from celery import group, shared_task from django.db.utils import DatabaseError from django.utils import timezone +import rss import scraper -from config.scraper import tasks as scraper_tasks from .models import Article, Source logger = logging.getLogger("__name__") -@shared_task -def get_articles_for_source(source_title: str) -> None: - source: Source = Source.objects.get(title=source_title) - sitemap = source.to_dict() - starting_urls = [ - sitemap["base_url"] + path for path in sitemap["paths"] - ] - - spider = scraper.Spider(starting_urls, sitemap) - spider.run() - articles 
= [json.loads(article) for article in spider.articles] - - # try bulk create, revert to individual db saves in case of error +def create_articles(article_data: list[dict], source: Source) -> None: + """ + Try bulk create, revert to individual DB saves in case of error + """ try: Article.objects.bulk_create([ Article( - headline=article_data["headline"], - slug=article_data["slug"], - source=Source.objects.get(url=article_data["source_link"]), - summary=article_data["summary"], - language=article_data["language"], - url=article_data["url"], - created_at=timezone.now(), - ) for article_data in articles + title=article["title"], + slug=article["slug"], + source=source, + description=article["description"], + language=article["language"], + url=article["url"], + created_at=article.get("pubdate") or timezone.now(), + ) for article in article_data ], ignore_conflicts=True) except DatabaseError as exc: logger.error("Bulk create failed", exc_info=exc) - for article_data in articles: + for article in article_data: try: Article.objects.create( - headline=article_data["headline"], - slug=article_data["slug"], - source=Source.objects.get(url=article_data["source_link"]), - summary=article_data["summary"], - language=article_data["language"], - url=article_data["url"], - created_at=timezone.now(), + title=article["title"], + slug=article["slug"], + source=source, + description=article["description"], + language=article["language"], + url=article["url"], + created_at=article.get("pubdate") or timezone.now(), ) except DatabaseError as exc: - logger.error("DB save failed for %s", article_data["url"], exc_info=exc) + logger.error("DB save failed for %s", article["url"], exc_info=exc) + + +@shared_task +def get_feed_articles_for_source(source_title: str, time_delta: int): + source: Source = Source.objects.get(title=source_title) + reader = rss.Reader(feeds=source.feeds, time_delta=time_delta) + + reader.get_feed() + article_data = [item for item in reader.articles] + + create_articles(article_data, source) + + +@shared_task +def scrape_articles_from_source(source_title: str): + source: Source = Source.objects.get(title=source_title) + sitemap = source.sitemap.to_dict() + starting_urls = [ + sitemap["base_url"] + path for path in sitemap["paths"] + ] + + spider = scraper.Spider(starting_urls, sitemap) + spider.run() + article_data = [json.loads(article) for article in spider.articles] + + create_articles(article_data, source) @shared_task -def get_articles(language: str): - task_group = group( - get_articles_for_source.s(source_title=title) for title in scraper_tasks["magazines"][language]["titles"] - ) +def get_articles(language: str, titles: list, time_delta: Optional[int] = None): + """Retrieve articles from RSS feed or by scraping, depending on `time_delta` + + Args: + language: the language of the articles + titles: the titles of the article sources + time_delta: max age (unit agnostic) of the target articles; if specified, articles + are retrieved from feed, otherwise scraped + """ + if time_delta: + task_group = group( + get_feed_articles_for_source.s(source_title=title, time_delta=time_delta) + for title in titles + ) + else: + task_group = group( + scrape_articles_from_source.s(source_title=title) for title in titles + ) + promise = task_group.apply_async() if promise.ready(): return promise.get() diff --git a/src/articles/templates/articles/index.html b/src/articles/templates/articles/index.html index 6ec749d..63a5ccd 100644 --- a/src/articles/templates/articles/index.html +++ 
b/src/articles/templates/articles/index.html
@@ -15,10 +15,10 @@ {% for article in source.articles.all|slice:":10" %}
-          {{ article.headline}}
+          {{ article.title}}

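An aside on the reworked task flow above: `articles.tasks.get_articles` now dispatches to the RSS reader when `time_delta` is given and to the scraper otherwise, matching the two CELERY_BEAT_SCHEDULE entries. A minimal sketch of equivalent manual invocations, assuming the source fixtures are loaded and a Celery broker/worker is running (the title lists are taken from config/tasks.py; everything else here is illustrative, not part of the diff):

    from articles.tasks import get_articles

    # Feed path: time_delta is set, so get_feed_articles_for_source is queued
    # for each source title.
    get_articles.delay(
        language="en",
        titles=["New York Times", "The Guardian"],
        time_delta=3,
    )

    # Scrape path: no time_delta, so scrape_articles_from_source is queued instead.
    get_articles.delay(
        language="en",
        titles=["Al Jazeera", "Reuters"],
    )
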
diff --git a/src/articles/templates/articles/search_results.html b/src/articles/templates/articles/search_results.html
index 2c15433..e7a8a22 100644
--- a/src/articles/templates/articles/search_results.html
+++ b/src/articles/templates/articles/search_results.html
@@ -20,8 +20,8 @@
     Search Results for: "{{ query }}"
-          title="{{ article.headline }} {{ article.summary|slice:":200" }}... {{ article.created_at|naturaltime }}"
-          href="{{ article.url }}">{{ article.headline}}
+          title="{{ article.title }} {{ article.description|slice:":200" }}... {{ article.created_at|naturaltime }}"
+          href="{{ article.url }}">{{ article.title}}
  • diff --git a/src/articles/tests/conftest.py b/src/articles/tests/conftest.py index f03b77b..d2130ce 100644 --- a/src/articles/tests/conftest.py +++ b/src/articles/tests/conftest.py @@ -6,9 +6,30 @@ from django.utils import timezone from ..constants import Language, PublicationType -from ..models import Article, Source +from ..models import Article, Sitemap, Source +# Sitemaps +@pytest.fixture +def sitemap_values(source_values): + return { + "source": Source(**source_values), + "paths": ["world/"], + "javascript_required": False, + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", + "title_search_params_find": "h1", + "title_search_params_remove": None, + "description_search_params_find": "", + "description_search_params_remove": None, + } + + +@pytest.fixture +def sitemap_instance(sitemap_values): + return Sitemap.objects.create(**sitemap_values) + + +# Sources @pytest.fixture def source_values(): return { @@ -17,13 +38,6 @@ def source_values(): "url": "https://www.hocusbogus.com/", "publication_type": PublicationType.newspaper, "language": Language.en, - "paths": ["world/"], - "javascript_required": False, - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_search_params_find": "h1", - "headline_search_params_remove": [], - "summary_search_params_find": "", - "summary_search_params_remove": [] } @@ -40,13 +54,6 @@ def source_values_2(): "url": "https://www.nonsensical.org/", "publication_type": PublicationType.newspaper, "language": Language.en, - "paths": ["world/"], - "javascript_required": False, - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_search_params_find": "h1", - "headline_search_params_remove": [], - "summary_search_params_find": "", - "summary_search_params_remove": [] } @@ -58,9 +65,9 @@ def source_instance_2(source_values_2): @pytest.fixture def article_values(source_instance) -> Dict[str, Union[datetime, str]]: return { - "headline": "A cow jumps over the moon", + "title": "A cow jumps over the moon", "slug": "a-cow-jumps-over-the-moon", - "summary": "Lorem dolor sit amet...", + "description": "Lorem dolor sit amet...", "url": "https://www.hocusbogus.com/2022/05/08/foobar", "source": source_instance, "created_at": timezone.localtime(), @@ -70,9 +77,9 @@ def article_values(source_instance) -> Dict[str, Union[datetime, str]]: @pytest.fixture def article_values_m(source_values) -> Dict[str, Union[Source, datetime, str]]: return { - "headline": "A cow jumps over the moon", + "title": "A cow jumps over the moon", "slug": "a-cow-jumps-over-the-moon", - "summary": "Lorem dolor sit amet...", + "description": "Lorem dolor sit amet...", "url": "https://www.hocusbogus.com/2022/05/08/foobar", "source": Source(**source_values), "created_at": timezone.localtime(), @@ -87,9 +94,9 @@ def article_instance(article_values): @pytest.fixture def article_values_2(source_instance) -> Dict[str, Union[datetime, str]]: return { - "headline": "The moon is made of cheese", + "title": "The moon is made of cheese", "slug": "the-moon-is-made-of-cheese", - "summary": "Consectetur adipiscing elit, sed do eiusmod tempor incididunt...", + "description": "Consectetur adipiscing elit, sed do eiusmod tempor incididunt...", "url": "https://www.nonsensical.org/2022/05/08/baz", "source": source_instance, "created_at": timezone.localtime(), diff --git a/src/articles/tests/integration/__init__.py b/src/articles/tests/integration/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/articles/tests/integration/test_tasks.py b/src/articles/tests/integration/test_tasks.py deleted file mode 100644 
index b07be56..0000000 --- a/src/articles/tests/integration/test_tasks.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest -from django.conf import settings -from django.core.management import call_command - -from articles.models import Article -from articles.tasks import get_articles_for_source -from scraper.tests import AsyncMockResponse, contents_aj, expected_aj - -SOURCE_TITLE = "Al Jazeera" - - -@pytest.fixture(scope="session") -def django_db_setup(django_db_setup, django_db_blocker): - settings.DATABASES["default"] = { - "ENGINE": "django.db.backends.postgresql", - "HOST": "db.example.com", - "NAME": "test_db", - "ATOMIC_REQUESTS": True, - } - - with django_db_blocker.unblock(): - call_command("loaddata", "fixtures/sources_test.json") - - -@pytest.mark.usefixtures("celery_session_app") -@pytest.mark.usefixtures("celery_session_worker") -@pytest.mark.django_db -def test_get_articles_for_source(contents_aj, expected_aj, mocker): - # setup - def return_value(*args, **kwargs): - for key, value in contents_aj.items(): - if args[0] == value["link"]: - return AsyncMockResponse(text=value["content"]) - - mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) - - # asserts - promise = get_articles_for_source.delay(SOURCE_TITLE) - promise.get() - - articles = Article.objects.all() - - assert len(articles) == 12 - - for expected_data in expected_aj.values(): - article = next( - (article for article in articles if article.headline == expected_data["headline"]) - ) - assert article.slug == expected_data["slug"] - assert article.summary == expected_data["summary"] - assert article.language == expected_data["language"] - assert article.url == expected_data["url"] - assert article.source.title == SOURCE_TITLE diff --git a/src/articles/tests/unit/test_models.py b/src/articles/tests/unit/test_models.py index 3566282..5dea716 100644 --- a/src/articles/tests/unit/test_models.py +++ b/src/articles/tests/unit/test_models.py @@ -1,6 +1,6 @@ import regex # type: ignore -from articles.models import Article, Source +from articles.models import Article, Sitemap, Source # @@ -13,29 +13,39 @@ def test_create_source(source_values) -> None: assert getattr(source, attr_name) == source_values.get(attr_name) -def test_source_to_dict(source_values) -> None: +def test_source_str_representation(source_values) -> None: source = Source(**source_values) - sitemap = source.to_dict() + + assert str(source) == "Fake News" + + +# +# Sitemap +# +def test_create_sitemap(sitemap_values) -> None: + sitemap = Sitemap(**sitemap_values) + + for attr_name in sitemap_values: + assert getattr(sitemap, attr_name) == sitemap_values.get(attr_name) + + +def test_sitemap_to_dict(sitemap_values) -> None: + sitemap = Sitemap(**sitemap_values) + sitemap_dict = sitemap.to_dict() for attr_name in [ "javascript_required", - "language", "paths", ]: - assert getattr(source, attr_name) == sitemap.get(attr_name) - - assert source.url == sitemap["base_url"] - assert regex.compile(source.regex) == sitemap["filter"] - assert sitemap["search_params"]["headline"]["find"] == source.headline_search_params_find - assert sitemap["search_params"]["headline"]["remove"] == source.headline_search_params_remove - assert sitemap["search_params"]["summary"]["find"] == source.summary_search_params_find - assert sitemap["search_params"]["summary"]["remove"] == source.summary_search_params_remove - + assert getattr(sitemap, attr_name) == sitemap_dict.get(attr_name) -def test_source_str_representation(source_values) -> None: - source = Source(**source_values) - 
- assert str(source) == "Fake News" + assert regex.compile(sitemap.regex) == sitemap_dict["filter"] + assert sitemap_dict["search_params"]["title"]["find"] == sitemap.title_search_params_find + assert sitemap_dict["search_params"]["title"]["remove"] == sitemap.title_search_params_remove + assert sitemap_dict["search_params"]["description"]["find"] == sitemap.description_search_params_find + assert ( + sitemap_dict["search_params"]["description"]["remove"] == sitemap.description_search_params_remove + ) # @@ -51,4 +61,4 @@ def test_create_article(article_values_m) -> None: def test_article_representation(article_values_m) -> None: article = Article(**article_values_m) - assert str(article) == (f"{article.source}: {article.headline}") + assert str(article) == (f"{article.source}: {article.title}") diff --git a/src/articles/tests/unit/test_views.py b/src/articles/tests/unit/test_views.py index 2d0e60f..7d9aaa9 100644 --- a/src/articles/tests/unit/test_views.py +++ b/src/articles/tests/unit/test_views.py @@ -27,15 +27,15 @@ def test_index_view(client, source_instance, article_instance) -> None: assert source_link_href == source_instance.url # assert that details of article are present in response content - article_headline = doc.find(".article-headline").text() + article_title = doc.find(".article-headline").text() article_link = doc.find(".article-link") article_link_href = article_link.attr("href") article_link_title = article_link.attr("title") - assert article_headline == article_instance.headline + assert article_title == article_instance.title assert article_link.is_("a") assert article_link_href == article_instance.url - assert article_instance.headline in article_link_title + assert article_instance.title in article_link_title # @@ -50,7 +50,7 @@ def test_search_results_view( article_instance, article_instance_2, ) -> None: - query_params = {"q": article_values["headline"][:5]} + query_params = {"q": article_values["title"][:5]} response = client.get(reverse("search"), query_params) html = response.content.decode("utf-8") doc = pq(html) @@ -67,25 +67,25 @@ def test_search_results_view( assert source_link_href == source_instance.url # assert that details of article matching query are present in response content - article_headline = doc.find(".article-headline").text() + article_title = doc.find(".article-headline").text() article_link = doc.find(".article-link") article_link_href = article_link.attr("href") article_link_title = article_link.attr("title") - assert article_headline == article_instance.headline + assert article_title == article_instance.title assert article_link.is_("a") assert article_link_href == article_instance.url - assert article_instance.headline in article_link_title - assert article_instance.summary in article_link_title + assert article_instance.title in article_link_title + assert article_instance.title in article_link_title # assert that details of non-matching source are not found assert source_instance_2.title not in html assert source_instance_2.url not in html # assert that details of non-matching article are not found - assert article_instance_2.headline not in html + assert article_instance_2.title not in html assert article_instance_2.url not in html - assert article_instance_2.summary not in html + assert article_instance_2.title not in html @pytest.mark.django_db @@ -107,9 +107,9 @@ def test_search_result_not_found( assert source_instance.url not in html # assert that details of non-matching article are not found - assert article_instance.headline not 
in html + assert article_instance.title not in html assert article_instance.url not in html - assert article_instance.summary not in html + assert article_instance.title not in html @pytest.mark.django_db @@ -119,7 +119,7 @@ def test_search_result_substring( article_instance, article_values, ) -> None: - query_params = {"q": article_values["headline"][2:7]} + query_params = {"q": article_values["title"][2:7]} response = client.get(reverse("search"), query_params) html = response.content.decode("utf-8") @@ -130,6 +130,6 @@ def test_search_result_substring( assert source_instance.url not in html # assert that details of non-matching article are not found - assert article_instance.headline not in html + assert article_instance.title not in html assert article_instance.url not in html - assert article_instance.summary not in html + assert article_instance.title not in html diff --git a/src/articles/views.py b/src/articles/views.py index 899a7b1..41b900f 100644 --- a/src/articles/views.py +++ b/src/articles/views.py @@ -18,7 +18,7 @@ def index(request: Optional[HttpRequest]) -> HttpResponse: class SearchResultsView(ListView): model = Article - fields = ["headline", "url", "body"] + fields = ["title", "url", "body"] template_name = "articles/search_results.html" def get_context_data(self, **kwargs) -> Dict[str, Any]: @@ -32,7 +32,7 @@ def get_context_data(self, **kwargs) -> Dict[str, Any]: context.update( { "sources": Source.objects.only("title", "url", "publication_type") - .filter(articles__headline__iregex=regex) + .filter(articles__title__iregex=regex) .distinct(), "query": query, }, @@ -42,4 +42,4 @@ def get_context_data(self, **kwargs) -> Dict[str, Any]: def get_queryset(self): query = self.request.GET.get("q") regex = r"(? list[hashabledict]: + """Parse the content of an RSS feed and return article metadata + + Args: + content: XML string with RSS feed content + time_delta: number representing the max age (unit agnostic) of + articles + + Returns: + Collection of hashable dicts with article metadata + """ response_xml = ElementTree.fromstring(content) - articles = response_xml.findall("./channel/item") + channel = response_xml.find("./channel") + articles = channel.findall("item") + + data = [] + + for article in articles: + # 1. title + title = article.find("title").text - for elem in articles: - # skip old entries - pubDate = elem.find("pubDate").text - # TODO: from RSSMap - format_str = "%a, %d %b %Y %H:%M:%S %z" - pubdate = datetime.strptime(pubDate, format_str) + # 2. url + url = article.find("link").text + + # 3. date (skip old entries) + pubdate_string = article.find("pubDate").text + pubdate = dateutil.parser.parse(pubdate_string, tzinfos=tzinfos) now_utc = datetime.now(timezone.utc) + delta = timedelta(hours=time_delta) - if now_utc - pubdate > DELTA: + if now_utc - pubdate > delta: continue - title = elem.find("title").text - # TODO: from RSSMap - url = elem.find("guid").text - description = elem.find("description").text - - # TODO: investigate, perhaps get from RSSMap - ns = {"dc": "http://purl.org/dc/elements/1.1/"} - creators = elem.find(".//dc:creator", ns).text - - # TODO: return dict (or other DS) - print(title) - print(url) - print(description) - print(creators) - print("\n") + # 4. description + description = article.find("description").text + + # 5. creators + ns = {"dc": "http://purl.org/dc/articleents/1.1/"} + if creators := article.find(".//dc:creator", ns): + creators = creators.text + + # 6. 
language + language = channel.find("language").text + + article_dict = hashabledict( + title=title, + slug=slugify(title), + url=url, + pubdate=pubdate, + description=description, + creators=creators or "", + language=language, + ) + + data.append(article_dict) + + return data diff --git a/src/rss/reader.py b/src/rss/reader.py index c8aeb1b..17c0679 100644 --- a/src/rss/reader.py +++ b/src/rss/reader.py @@ -1,4 +1,60 @@ -# TODO -# create Reader for making requests + storing article data -# one reader per source, multiple paths (topics) -# get RSSMap for source, pass to parser +import logging +import random + +import requests +from requests.exceptions import RequestException + +from articles.models import Feed +from scraper import headers +from utils.data_structures import hashabledict + +from . import parser + +logger = logging.getLogger(__name__) + + +class Reader: + """ + Class Attributes: + headers (list): a collection of HTTP headers + + Instance Attributes: + feeds (list): a list of RSS feeds + time_delta (int): the maximum age (unit agnostic) of articles to be + collected + articles (set): a collection of dicts representing article + metadata + """ + + headers = headers + + def __init__(self, feeds: list[Feed], time_delta: int): + self.feeds = feeds + self.time_delta = time_delta + self.articles: set[dict] = set() + + def connect(self, session: requests.Session, url: str) -> str | None: + headers = random.choice(self.headers) # nosec + + try: + with session.get(url, headers=headers) as response: + content = response.content + except RequestException as exc: + logger.error("Could not fetch %s", url, exc_info=exc) + return None + return content + + def get_articles(self, session: requests.Session, feed: Feed) -> list[hashabledict]: + content = self.connect(session, feed.url) + if not content: + return None + + articles = parser.parse(content, self.time_delta) + return articles + + def get_feed(self): + with requests.Session() as session: + for feed in self.feeds.all(): + articles = self.get_articles(session, feed) + for article in articles: + self.articles.add(article) diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py index 0ba02b6..73fe853 100644 --- a/src/scraper/__init__.py +++ b/src/scraper/__init__.py @@ -1,3 +1,4 @@ +from .headers import headers from .spiders import Spider -__all__ = ["Spider"] +__all__ = ["Spider", "headers"] diff --git a/src/scraper/parser.py b/src/scraper/parser.py index 5b627f3..c413d02 100644 --- a/src/scraper/parser.py +++ b/src/scraper/parser.py @@ -25,65 +25,67 @@ def generate_filtered_links(html: str, sitemap: dict) -> Generator[str, None, No yield urllib.parse.urljoin(sitemap["base_url"], link) -def find_headline(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: +def find_title(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: """ Use `doc` + `sitemap` to extract headline from article at `url` """ - search_params = sitemap["search_params"]["headline"] + search_params = sitemap["search_params"]["title"] if not search_params.get("find", ""): - logger.warning("No search params for headline of %s", url) + logger.warning("No search params for title of %s", url) return None - # TODO: error handling; except cssselect.SelectorSyntaxError; log source, article, params + # TODO: error handling; except cssselect.SelectorSyntaxError for param in search_params["find"]: - if headline_doc := doc.find(param): + if title_doc := doc.find(param): break - if not headline_doc: + if not title_doc: return None - for item in 
search_params.get("remove", []): - headline_doc(f"{item}").remove() + if remove_params := search_params.get("remove", []): + for item in remove_params: + title_doc(f"{item}").remove() try: - headline_text = headline_doc.text().strip() + title_text = title_doc.text().strip() except AttributeError: return None - return headline_text + return title_text -def find_summary(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: +def find_description(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: """ Use `doc` + `sitemap` to extract summary from article at `url` """ - search_params = sitemap["search_params"]["summary"] + search_params = sitemap["search_params"]["description"] if not search_params.get("find", ""): logger.warning("No search params for summary of %s", url) return None for param in search_params["find"]: - if summary_doc := doc.find(param): + if description_doc := doc.find(param): break - if not summary_doc: + if not description_doc: return None - for item in search_params.get("remove", []): - summary_doc(f"{item}").remove() + if remove_params := search_params.get("remove", []): + for item in remove_params: + description_doc(f"{item}").remove() try: - summary_text = summary_doc.text().strip() + description_text = description_doc.text().strip() except AttributeError: return None - return summary_text[:1000] + return description_text[:1000] -def find_language(summary: str, headline: str, doc: PyQuery, url: str) -> Optional[str]: +def find_language(description: str, title: str, doc: PyQuery, url: str) -> Optional[str]: """ Detect the language of the page at `url` """ - for item in (summary, headline, doc.text()): + for item in (description, title, doc.text()): if item: try: language = langdetect.detect(item) @@ -99,24 +101,24 @@ def find_language(summary: str, headline: str, doc: PyQuery, url: str) -> Option def parse(html: str, sitemap: dict, url: str) -> Optional[str]: doc = PyQuery(html) - headline = find_headline(doc, sitemap=sitemap, url=url) - if headline is None: - logger.warning("No headline for %s", url) + title = find_title(doc, sitemap=sitemap, url=url) + if title is None: + logger.warning("No title for %s", url) return None - summary = find_summary(doc, sitemap=sitemap, url=url) - if summary is None: + description = find_description(doc, sitemap=sitemap, url=url) + if description is None: logger.warning("Missing summary for %s", url) - summary = "No description" + description = "No description" - language = find_language(summary, headline, doc, url) + language = find_language(description, title, doc, url) if language is None: language = sitemap["language"] article = { - "headline": headline, - "slug": slugify(headline), - "summary": summary, + "title": title, + "slug": slugify(title), + "description": description, "language": language, "url": url, "source_link": sitemap["base_url"], diff --git a/src/scraper/spiders.py b/src/scraper/spiders.py index 46a664b..a678889 100644 --- a/src/scraper/spiders.py +++ b/src/scraper/spiders.py @@ -6,7 +6,7 @@ from aiohttp import ClientSession from aiohttp.web_exceptions import HTTPError -from . 
import headers, parser +from scraper import headers, parser logger: logging.Logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ class Spider: metadata """ - headers = headers.headers + headers = headers def __init__(self, starting_urls: list[str], sitemap: dict) -> None: self.sitemap: dict = sitemap diff --git a/src/scraper/tests/conftest.py b/src/scraper/tests/conftest.py index 39d68bf..80e1cc9 100644 --- a/src/scraper/tests/conftest.py +++ b/src/scraper/tests/conftest.py @@ -7,19 +7,19 @@ @pytest.fixture def sitemap_aj(): return { - 'base_url': 'https://www.aljazeera.com/', - 'paths': ['news/'], - 'language': 'en', - 'javascript_required': False, - 'filter': regex.Regex( + "base_url": 'https://www.aljazeera.com/', + "paths": ['news/'], + "language": 'en', + "javascript_required": False, + "filter": regex.Regex( '(? List[str]: def expected_aj() -> Dict[str, Dict[str, str]]: expected = { "asian_cup": { - 'headline': 'Asian Cup final brings FIFA World Cup frenzy back to Qatar’s Souq Waqif', - 'slug': 'asian-cup-final-brings-fifa-world-cup-frenzy-back-to-qatars-souq-waqif', - "summary": ( + "title": 'Asian Cup final brings FIFA World Cup frenzy back to Qatar’s Souq Waqif', + "slug": 'asian-cup-final-brings-fifa-world-cup-frenzy-back-to-qatars-souq-waqif', + "description": ( "Excitement for the Asian Cup football final is reaching fever pitch as al-Annabi take on an-Nashama at Lusail Stadium. Save articles to read later and create your own reading list. Doha, Qatar – On Friday nights, Souq Waqif – Qatar’s old-style all-purpose market that also serves as the country’s central tourist attraction – brings together people from all walks of life, dozens of different nationalities and varying interests for a unique mix of colour and noise. But when the country plays host to a football tournament – be it the world’s biggest sporting event such as the FIFA World Cup or a regional championship – the excitement reaches a fever pitch. On the eve of the final of the ongoing AFC Asian Cup 2023, the famous marketplace in the heart of Doha was the marching ground of football fans of both teams vying for the continental crown in Saturday’s final at Lusail Stadium. Passionate supporters of an-Nashama – the gentlemen, as Jordan’s football team is lovingly known – gathered in" ), - 'language': 'en', + "language": 'en', "url": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", - 'source_link': 'https://www.aljazeera.com/' + "source_link": 'https://www.aljazeera.com/' }, "indonesia": { - "headline": "Big election rallies in Indonesia on final day of campaign", + "title": "Big election rallies in Indonesia on final day of campaign", "slug": "big-election-rallies-in-indonesia-on-final-day-of-campaign", - "summary": ( + "description": ( "Tens of thousands of supporters of Indonesia’s presidential candidates have poured onto its streets as they hold final campaigns before heading to the polls in the world’s biggest single-day election. The contenders to lead the world’s third-largest democracy are popular former governors, Ganjar Pranowo and Anies Baswedan, and ex-special forces commander Prabowo Subianto, who has soared in opinion polls with the tacit backing of the president, and with the incumbent’s son as his running mate. The elections on Wednesday will elect a new president and vice president, in addition to parliamentary and local representatives. High-schooler Alfiatnan, 18, said she would vote for Subianto because this was his third attempt at the presidency. 
“I think there’s no harm [in] giving opportunity to someone who is trying. His optimistic spirit influenced me to choose him.” Also in the running is Baswedan, the former governor of Jakarta who is running as an independent candidate. The 54-year-old was e" ), "language": "en", @@ -57,9 +57,9 @@ def expected_aj() -> Dict[str, Dict[str, str]]: "source_link": "https://www.aljazeera.com/", }, "taiwan": { - "headline": "How Taiwan’s elections challenge the power of China’s Communist Party", + "title": "How Taiwan’s elections challenge the power of China’s Communist Party", "slug": "how-taiwans-elections-challenge-the-power-of-chinas-communist-party", - "summary": ( + "description": ( "Elections in Taiwan highlight dissatisfaction in China with a political system that Beijing says works best for Chinese people. Save articles to read later and create your own reading list. If free and fair national elections are considered the hallmark of a democratic state, Taiwan has much to boast about. In January, the self-ruled island held its eighth presidential election concurrently with a parliamentary vote. Just 160km (100 miles) away on the other side of the narrow Taiwan Strait, the Communist Party of China (CPC) has ruled China since 1949, and though the party often claims that it governs a democratic state, there is no electoral process comparable with Taiwan’s. China’s President Xi Jinping has referred to “whole-process people’s democracy” to describe the Chinese political system where the “people are the masters” but the party-state apparatus runs the people’s affairs on their behalf. Ken Cai*, a 35-year-old entrepreneur from Shanghai, does not subscribe to Xi’s definit" ), "language": "en", diff --git a/src/scraper/tests/integration/test_spider.py b/src/scraper/tests/integration/test_spider.py index a48bfc6..b88ef5e 100644 --- a/src/scraper/tests/integration/test_spider.py +++ b/src/scraper/tests/integration/test_spider.py @@ -5,7 +5,7 @@ from django.utils.text import slugify from articles.constants import Language, PublicationType -from articles.models import Source +from articles.models import Sitemap, Source from scraper.spiders import Spider from ..mocks import AsyncMockResponse @@ -19,19 +19,29 @@ # @pytest.fixture def source(): - return Source( + return Source.objects.create( title="Al Jazeera", slug=slugify("Al Jazeera"), publication_type=PublicationType.newspaper, language=Language.en, url="https://www.aljazeera.com/", + ) + + +@pytest.fixture +def sitemap(source): + return Sitemap( + source=source, paths=["news/"], - regex="(? 
None: +def test_run_spider(source, sitemap, contents_aj, expected_aj, mocker) -> None: # setup def return_value(*args, **kwargs): for key, value in contents_aj.items(): @@ -119,7 +129,7 @@ def return_value(*args, **kwargs): mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) # asserts - sitemap = source.to_dict() + sitemap = source.sitemap.to_dict() starting_urls = [ sitemap["base_url"] + path for path in sitemap["paths"] ] @@ -133,10 +143,10 @@ def return_value(*args, **kwargs): for expected_data in expected_aj.values(): article = next( - (article for article in articles if article["headline"] == expected_data["headline"]) + (article for article in articles if article["title"] == expected_data["title"]) ) assert article["slug"] == expected_data["slug"] - assert article["summary"] == expected_data["summary"] + assert article["description"] == expected_data["description"] assert article["language"] == expected_data["language"] assert article["url"] == expected_data["url"] assert article["source_link"] == expected_data["source_link"] diff --git a/src/scraper/tests/unit/test_parsers.py b/src/scraper/tests/unit/test_parsers.py index a9805df..81d6230 100644 --- a/src/scraper/tests/unit/test_parsers.py +++ b/src/scraper/tests/unit/test_parsers.py @@ -4,7 +4,7 @@ import pytest from pyquery import PyQuery -from scraper.parser import find_headline, find_language, find_summary, parse +from scraper.parser import find_description, find_language, find_title, parse from ..utils import read_file @@ -17,24 +17,24 @@ @pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) -def test_find_headline(sitemap_aj, expected_aj, page) -> None: +def test_find_title(sitemap_aj, expected_aj, page) -> None: html = read_file(directory=FILES_DIR, file_name=f"{page}.html") doc = PyQuery(html) - headline_text = find_headline(doc, sitemap_aj, url="dummy") + title_text = find_title(doc, sitemap_aj, url="dummy") - assert headline_text == expected_aj[f"{page}"]["headline"] + assert title_text == expected_aj[f"{page}"]["title"] @pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) -def test_find_summary(sitemap_aj, expected_aj, page) -> None: +def test_find_description(sitemap_aj, expected_aj, page) -> None: html = read_file(directory=FILES_DIR, file_name=f"{page}.html") doc = PyQuery(html) - summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + description = find_description(doc, sitemap_aj, url="https://www.example.com") - assert summary - assert summary == expected_aj[f"{page}"]["summary"] + assert description + assert description == expected_aj[f"{page}"]["description"] @pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) @@ -42,13 +42,13 @@ def test_find_language(sitemap_aj, expected_aj, page) -> None: html = read_file(directory=FILES_DIR, file_name=f"{page}.html") doc = PyQuery(html) - headline = find_headline(doc, sitemap_aj, url="dummy") - summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + title = find_title(doc, sitemap_aj, url="dummy") + description = find_description(doc, sitemap_aj, url="https://www.example.com") - assert headline - assert summary + assert title + assert description - lang = find_language(summary, headline, doc, url="dummy") + lang = find_language(description, title, doc, url="dummy") assert lang == expected_aj[f"{page}"]["language"] diff --git a/src/scraper/tests/unit/test_spider.py b/src/scraper/tests/unit/test_spider.py index 4afd7e8..4c8b659 100644 --- 
a/src/scraper/tests/unit/test_spider.py +++ b/src/scraper/tests/unit/test_spider.py @@ -78,16 +78,16 @@ def return_value(*args, **kwargs): (article for article in articles if article["url"] == "https://indonesia.com") ) - assert article_indonesia["headline"] == expected_aj["indonesia"]["headline"] + assert article_indonesia["title"] == expected_aj["indonesia"]["title"] assert article_indonesia["slug"] == expected_aj["indonesia"]["slug"] - assert article_indonesia["summary"] == expected_aj["indonesia"]["summary"] + assert article_indonesia["description"] == expected_aj["indonesia"]["description"] assert article_indonesia["language"] == expected_aj["indonesia"]["language"] article_taiwan = next( (article for article in articles if article["url"] == "https://taiwan.com") ) - assert article_taiwan["headline"] == expected_aj["taiwan"]["headline"] + assert article_taiwan["title"] == expected_aj["taiwan"]["title"] assert article_taiwan["slug"] == expected_aj["taiwan"]["slug"] - assert article_taiwan["summary"] == expected_aj["taiwan"]["summary"] + assert article_taiwan["description"] == expected_aj["taiwan"]["description"] assert article_indonesia["language"] == expected_aj["taiwan"]["language"] diff --git a/src/utils/data_structures.py b/src/utils/data_structures.py new file mode 100644 index 0000000..1d932ce --- /dev/null +++ b/src/utils/data_structures.py @@ -0,0 +1,9 @@ +class hashabledict(dict): + def __key(self): + return tuple((k, self[k]) for k in sorted(self)) + + def __hash__(self): + return hash(self.__key()) + + def __eq__(self, other): + return self.__key() == other.__key()
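
A closing note on `hashabledict`: `rss.parser.parse` returns article metadata as `hashabledict` instances and `Reader.get_feed` accumulates them in a set, so items repeated across feeds or across runs de-duplicate automatically. A minimal sketch of that behaviour (the field values are made up for illustration):

    from utils.data_structures import hashabledict

    a = hashabledict(title="A cow jumps over the moon", url="https://example.com/cow")
    b = hashabledict(title="A cow jumps over the moon", url="https://example.com/cow")

    articles = set()
    articles.add(a)
    articles.add(b)  # equal key/value pairs hash identically, so this is a no-op

    assert len(articles) == 1

Sorting the keys in __key makes the hash independent of insertion order, which is what lets two dicts built from different feed items compare equal.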