diff --git a/Dockerfile b/Dockerfile
index ada29f9..bdb3b87 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,7 +16,7 @@ COPY /requirements/* /app/requirements/
RUN pip install -r /app/requirements/dev.txt
# pyppeteer deps (https://stackoverflow.com/a/71935536)
-RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt
+# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt
#
diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py
index 4395ec6..3a6252d 100644
--- a/articles/migrations/0001_initial.py
+++ b/articles/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 5.0.1 on 2024-02-18 12:41
+# Generated by Django 5.0.1 on 2024-02-26 20:25
import django.db.models.deletion
import django.db.models.functions.text
@@ -24,7 +24,7 @@ class Migration(migrations.Migration):
),
),
(
- "name",
+ "title",
models.CharField(
help_text="The name of the source", max_length=128, unique=True
),
@@ -97,7 +97,7 @@ class Migration(migrations.Migration):
),
],
options={
- "ordering": [django.db.models.functions.text.Lower("name")],
+ "ordering": [django.db.models.functions.text.Lower("title")],
},
),
migrations.CreateModel(
diff --git a/articles/models.py b/articles/models.py
index 0c37f29..9c4c00a 100644
--- a/articles/models.py
+++ b/articles/models.py
@@ -74,7 +74,7 @@ class Source(models.Model):
Metadata about the source of articles
Fields:
- name (models.CharField): name of the source
+ title (models.CharField): name of the source
slug (models.SlugField): slug of the source
publication_type (models.CharField): the type of publication of the
source (newspaper, journal, blog...)
@@ -97,7 +97,7 @@ class Source(models.Model):
scraper
"""
- name = models.CharField(
+ title = models.CharField(
max_length=128,
unique=True,
blank=False,
@@ -164,11 +164,11 @@ class Source(models.Model):
class Meta:
ordering = [
- Lower("name"),
+ Lower("title"),
]
def __str__(self):
- return f"{self.name}"
+ return f"{self.title}"
def to_dict(self):
sitemap = {
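A minimal usage sketch of the renamed field (not part of the patch); the title value is one of the source titles listed in scraper/tasks.py:

    from django.db.models.functions import Lower

    from articles.models import Source

    # Lookups that previously filtered on `name` now go through `title`.
    source = Source.objects.get(title="Al Jazeera")

    # Meta.ordering already applies Lower("title"); an explicit order_by is equivalent.
    sources = Source.objects.order_by(Lower("title"))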
diff --git a/articles/tasks.py b/articles/tasks.py
index ee5c151..49bcbd2 100644
--- a/articles/tasks.py
+++ b/articles/tasks.py
@@ -10,8 +10,8 @@
@shared_task
-def get_articles_for_source(source_title: str):
- source: Source = Source.objects.get(name=source_title)
+def get_articles_for_source(source_title: str) -> None:
+ source: Source = Source.objects.get(title=source_title)
sitemap = source.to_dict()
starting_urls = [
sitemap["base_url"] + path for path in sitemap["paths"]
@@ -19,18 +19,18 @@ def get_articles_for_source(source_title: str):
spider = scraper.Spider(starting_urls, sitemap)
spider.run()
- data = [json.loads(article) for article in spider.articles]
+ articles = [json.loads(article) for article in spider.articles]
Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
- source=Source.objects.get(link=article_data["source_link"]),
+ source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
- ) for article_data in data
+ ) for article_data in articles
], ignore_conflicts=True)
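A hedged sketch of how the reworked task can be driven; the Celery wiring itself is outside this patch, and the title is an example value taken from scraper/tasks.py:

    from articles.tasks import get_articles_for_source

    # Synchronous call, e.g. from a shell: the task resolves the Source by its
    # new `title` field, runs the Spider, and bulk-inserts the scraped articles,
    # skipping duplicates via ignore_conflicts=True.
    get_articles_for_source("Al Jazeera")

    # Or queue it through Celery:
    get_articles_for_source.delay("Al Jazeera")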
diff --git a/articles/templates/articles/index.html b/articles/templates/articles/index.html
index ae695cf..6ec749d 100644
--- a/articles/templates/articles/index.html
+++ b/articles/templates/articles/index.html
@@ -9,7 +9,7 @@
{% for source in sources %}
-
+
{% for article in source.articles.all|slice:":10" %}
diff --git a/articles/templates/articles/search_results.html b/articles/templates/articles/search_results.html
index 89d8c89..2c15433 100644
--- a/articles/templates/articles/search_results.html
+++ b/articles/templates/articles/search_results.html
@@ -12,11 +12,11 @@ Search Results for: "{{ query }}"
{% for source in sources %}
-
+
{% for article in article_list %}
- {% if article.source.name == source.name %}
+ {% if article.source.title == source.title %}
-
diff --git a/articles/tests/conftest.py b/articles/tests/conftest.py
--- a/articles/tests/conftest.py
+++ b/articles/tests/conftest.py
@pytest.fixture
-def article_values(source_instance):
+def article_values(source_instance) -> Dict[str, Union[datetime, str]]:
return {
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
@@ -64,7 +64,7 @@ def article_values(source_instance):
@pytest.fixture
-def article_values_m(source_values):
+def article_values_m(source_values) -> Dict[str, Union[Source, datetime, str]]:
return {
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
@@ -81,7 +81,7 @@ def article_instance(article_values):
@pytest.fixture
-def article_values_2(source_instance):
+def article_values_2(source_instance) -> Dict[str, Union[datetime, str]]:
return {
"headline": "The moon is made of cheese",
"slug": "the-moon-is-made-of-cheese",
@@ -98,5 +98,5 @@ def article_instance_2(article_values_2):
@pytest.fixture
-def client():
+def client() -> Client:
return Client()
diff --git a/articles/tests/test_models.py b/articles/tests/test_models.py
index 4453b39..40563cf 100644
--- a/articles/tests/test_models.py
+++ b/articles/tests/test_models.py
@@ -6,14 +6,14 @@
#
# Test Source
#
-def test_create_source(source_values):
+def test_create_source(source_values) -> None:
source = Source(**source_values)
for attr_name in source_values:
assert getattr(source, attr_name) == source_values.get(attr_name)
-def test_source_to_dict(source_values):
+def test_source_to_dict(source_values) -> None:
source = Source(**source_values)
sitemap = source.to_dict()
@@ -30,7 +30,7 @@ def test_source_to_dict(source_values):
assert regex.compile(source.regex) == sitemap["filter"]
-def test_source_str_representation(source_values):
+def test_source_str_representation(source_values) -> None:
source = Source(**source_values)
assert str(source) == "Fake News"
@@ -39,14 +39,14 @@ def test_source_str_representation(source_values):
#
# Test Article
#
-def test_create_article(article_values_m):
+def test_create_article(article_values_m) -> None:
article = Article(**article_values_m)
for attr_name in article_values_m:
assert getattr(article, attr_name) == article_values_m.get(attr_name)
-def test_article_representation(article_values_m):
+def test_article_representation(article_values_m) -> None:
article = Article(**article_values_m)
assert str(article) == (f"{article.source}: {article.headline}")
diff --git a/articles/tests/test_views.py b/articles/tests/test_views.py
index d662691..af14260 100644
--- a/articles/tests/test_views.py
+++ b/articles/tests/test_views.py
@@ -9,7 +9,7 @@
# Test IndexView
#
@pytest.mark.django_db
-def test_index_view(client, source_instance, article_instance):
+def test_index_view(client, source_instance, article_instance) -> None:
response = client.get(reverse("index"))
assert response.status_code == 200
@@ -18,12 +18,12 @@ def test_index_view(client, source_instance, article_instance):
doc = pq(html)
# assert that details of source are present in response content
- source_name = doc.find(".source-name").text()
+ source_title = doc.find(".source-title").text()
source_link = doc.find(".source-link")
source_link_href = source_link.attr("href")
- assert source_name == source_instance.name
+ assert source_title == source_instance.title
assert source_link.is_("a")
assert source_link_href == source_instance.url
@@ -50,7 +50,7 @@ def test_search_results_view(
article_values,
article_instance,
article_instance_2,
-):
+) -> None:
query_params = {"q": article_values["headline"][:5]}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")
@@ -59,11 +59,11 @@ def test_search_results_view(
assert response.status_code == 200
# assert that details of source matching query are present in response content
- source_name = doc.find(".source-name").text()
+ source_title = doc.find(".source-title").text()
source_link = doc.find(".source-link")
source_link_href = source_link.attr("href")
- assert source_name == source_instance.name
+ assert source_title == source_instance.title
assert source_link.is_("a")
assert source_link_href == source_instance.url
@@ -80,7 +80,7 @@ def test_search_results_view(
assert article_instance.summary in article_link_title
# assert that details of non-matching source are not found
- assert source_instance_2.name not in html
+ assert source_instance_2.title not in html
assert source_instance_2.url not in html
# assert that details of non-matching article are not found
@@ -96,7 +96,7 @@ def test_search_result_not_found(
source_instance_2,
article_instance,
article_instance_2,
-):
+) -> None:
query_params = {"q": "test"}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")
@@ -104,7 +104,7 @@ def test_search_result_not_found(
assert response.status_code == 200
# assert that details of non-matching source are not found
- assert source_instance.name not in html
+ assert source_instance.title not in html
assert source_instance.url not in html
# assert that details of non-matching article are not found
@@ -119,7 +119,7 @@ def test_search_result_substring(
source_instance,
article_instance,
article_values,
-):
+) -> None:
query_params = {"q": article_values["headline"][2:7]}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")
@@ -127,7 +127,7 @@ def test_search_result_substring(
assert response.status_code == 200
# assert that details of non-matching source are not found
- assert source_instance.name not in html
+ assert source_instance.title not in html
assert source_instance.url not in html
# assert that details of non-matching article are not found
diff --git a/articles/urls.py b/articles/urls.py
index 04fb78a..5a7c07c 100644
--- a/articles/urls.py
+++ b/articles/urls.py
@@ -1,8 +1,11 @@
+from typing import List
+
from django.urls import path
+from django.urls.resolvers import URLPattern
from . import views
-urlpatterns = [
+urlpatterns: List[URLPattern] = [
path("", views.index, name="index"),
path("search", views.SearchResultsView.as_view(), name="search"),
]
diff --git a/articles/views.py b/articles/views.py
index 06a361f..899a7b1 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -1,14 +1,17 @@
import re
+from typing import Any, Dict
+from django.http.request import HttpRequest
+from django.http.response import HttpResponse
from django.shortcuts import render
from django.views.generic import ListView
from .models import Article, Source
-def index(request):
+def index(request: HttpRequest) -> HttpResponse:
context = {
- "sources": Source.objects.only("name", "url", "publication_type"),
+ "sources": Source.objects.only("title", "url", "publication_type"),
}
return render(request, "articles/index.html", context)
@@ -18,7 +21,7 @@ class SearchResultsView(ListView):
fields = ["headline", "url", "body"]
template_name = "articles/search_results.html"
- def get_context_data(self, **kwargs):
+ def get_context_data(self, **kwargs) -> Dict[str, Any]:
"""Pre-filter sources on the basis of article headlines and queries"""
context = super().get_context_data(**kwargs)
@@ -28,7 +31,7 @@ def get_context_data(self, **kwargs):
regex = r"(? None:
"""Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'nous_aggregator.settings')
try:
diff --git a/nous_aggregator/asgi.py b/nous_aggregator/asgi.py
index b22f185..751c449 100644
--- a/nous_aggregator/asgi.py
+++ b/nous_aggregator/asgi.py
@@ -10,7 +10,8 @@
import os
from django.core.asgi import get_asgi_application
+from django.core.handlers.asgi import ASGIHandler
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'nous_aggregator.settings')
-application = get_asgi_application()
+application: ASGIHandler = get_asgi_application()
diff --git a/nous_aggregator/settings/base.py b/nous_aggregator/settings/base.py
index 1c4b325..82d4e5b 100644
--- a/nous_aggregator/settings/base.py
+++ b/nous_aggregator/settings/base.py
@@ -1,5 +1,6 @@
import os
from pathlib import Path
+from typing import Any, Dict, List, Union
from decouple import Csv, config
@@ -7,7 +8,7 @@
# Build paths inside the project like this: BASE_DIR / 'subdir'.
# (modified because settings files are nested one level deeper)
-BASE_DIR = Path(__file__).resolve().parent.parent.parent
+BASE_DIR: Path = Path(__file__).resolve().parent.parent.parent
SECRET_KEY = config("SECRET_KEY", default="")
@@ -25,7 +26,7 @@
SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https')
-DATABASES = {
+DATABASES: Dict[str, Dict[str, Any]] = {
"default": {
"ENGINE": config("DATABASE_ENGINE", default="django.db.backends.postgresql"),
"NAME": config("DATABASE_NAME", default="postgres"),
@@ -88,8 +89,8 @@
# Logging
-LOG_DIR = BASE_DIR / "logs"
-LOGGING = {
+LOG_DIR: Path = BASE_DIR / "logs"
+LOGGING: Dict[str, Union[Dict[str, Dict[str, str]], Dict[str, Dict[str, Union[List[str], bool, str]]], Dict[str, Dict[str, Union[int, str]]], int]] = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
@@ -170,7 +171,7 @@
# https://docs.djangoproject.com/en/4.0/howto/static-files/
STATIC_URL = "static/"
-STATIC_ROOT = os.path.join(BASE_DIR, 'static')
+STATIC_ROOT: str = os.path.join(BASE_DIR, 'static')
# Default primary key field type
@@ -178,13 +179,13 @@
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
-# Timeouts (connection timeout, read timeout) in seconds for requests
-REQUESTS_TIMEOUT = (30, 60)
+# Timeout in seconds for HTTP requests
+REQUESTS_TIMEOUT = 30
# Celery
CELERY_BROKER_URL = config("CELERY_BROKER_URL", "redis://localhost:6379")
CELERY_RESULT_BACKEND = config("CELERY_RESULT_BACKEND", "redis://localhost:6379")
-CELERY_BEAT_SCHEDULE = {
+CELERY_BEAT_SCHEDULE: Dict[str, Dict[str, Union[Dict[str, str], List[str], int, str]]] = {
"get_articles_en": {
"task": "articles.tasks.get_articles",
"schedule": scraper_tasks.magazines["en"]["schedule"],
diff --git a/nous_aggregator/settings/local.py b/nous_aggregator/settings/local.py
index c3a5c96..fe00d3c 100644
--- a/nous_aggregator/settings/local.py
+++ b/nous_aggregator/settings/local.py
@@ -1,7 +1,11 @@
import socket
+from typing import List
from .base import *
+hostname: str
+ips: List[str]
+
SECRET_KEY = "hush-hush"
DEBUG = True
diff --git a/nous_aggregator/urls.py b/nous_aggregator/urls.py
index b24dd1a..2523e9d 100644
--- a/nous_aggregator/urls.py
+++ b/nous_aggregator/urls.py
@@ -13,11 +13,14 @@
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
+from typing import List
+
from django.conf import settings
from django.contrib import admin
-from django.urls import path, include
+from django.urls import include, path
+from django.urls.resolvers import URLResolver
-urlpatterns = [
+urlpatterns: List[URLResolver] = [
path("", include("articles.urls")),
path("articles", include("articles.urls")),
path("", include("articles.urls")),
diff --git a/nous_aggregator/wsgi.py b/nous_aggregator/wsgi.py
index c944b75..3c4105a 100644
--- a/nous_aggregator/wsgi.py
+++ b/nous_aggregator/wsgi.py
@@ -9,8 +9,9 @@
import os
+from django.core.handlers.wsgi import WSGIHandler
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'nous_aggregator.settings')
-application = get_wsgi_application()
+application: WSGIHandler = get_wsgi_application()
diff --git a/requirements/base.in b/requirements/base.in
index 36c6922..054be83 100644
--- a/requirements/base.in
+++ b/requirements/base.in
@@ -1,14 +1,13 @@
# Python
+aiohttp
beautifulsoup4
langdetect
-regex
-requests
-requests-html
-python-decouple
psycopg2
+python-decouple
pyppeteer
+pyquery
+regex
sentry-sdk
-websockets
# Django
django
diff --git a/requirements/base.txt b/requirements/base.txt
index 1042fdf..ff36e21 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -4,29 +4,28 @@
#
# ./bin/compile_dependencies.sh
#
+aiohttp==3.9.3
+ # via -r requirements/base.in
+aiosignal==1.3.1
+ # via aiohttp
amqp==5.2.0
# via kombu
appdirs==1.4.4
# via pyppeteer
asgiref==3.7.2
# via django
+attrs==23.2.0
+ # via aiohttp
beautifulsoup4==4.12.2
- # via
- # -r requirements/base.in
- # bs4
+ # via -r requirements/base.in
billiard==4.2.0
# via celery
-bs4==0.0.1
- # via requests-html
celery==5.3.6
# via -r requirements/base.in
certifi==2023.7.22
# via
# pyppeteer
- # requests
# sentry-sdk
-charset-normalizer==3.2.0
- # via requests
click==8.1.7
# via
# celery
@@ -50,12 +49,14 @@ django==5.0.1
# django-on-heroku
django-on-heroku==1.1.2
# via -r requirements/base.in
-fake-useragent==1.2.1
- # via requests-html
+frozenlist==1.4.1
+ # via
+ # aiohttp
+ # aiosignal
gunicorn==21.2.0
# via -r requirements/base.in
idna==3.4
- # via requests
+ # via yarl
importlib-metadata==6.8.0
# via pyppeteer
kombu==5.3.5
@@ -64,10 +65,12 @@ langdetect==1.0.9
# via -r requirements/base.in
lxml==4.9.3
# via pyquery
+multidict==6.0.5
+ # via
+ # aiohttp
+ # yarl
packaging==23.2
# via gunicorn
-parse==1.19.1
- # via requests-html
prompt-toolkit==3.0.43
# via click-repl
psycopg2==2.9.9
@@ -77,11 +80,9 @@ psycopg2-binary==2.9.7
pyee==8.2.2
# via pyppeteer
pyppeteer==1.0.2
- # via
- # -r requirements/base.in
- # requests-html
+ # via -r requirements/base.in
pyquery==2.0.0
- # via requests-html
+ # via -r requirements/base.in
python-dateutil==2.8.2
# via celery
python-decouple==3.8
@@ -90,12 +91,6 @@ redis==5.0.1
# via -r requirements/base.in
regex==2023.8.8
# via -r requirements/base.in
-requests==2.31.0
- # via
- # -r requirements/base.in
- # requests-html
-requests-html==0.10.0
- # via -r requirements/base.in
sentry-sdk==1.39.2
# via -r requirements/base.in
six==1.16.0
@@ -115,22 +110,19 @@ tzdata==2023.4
urllib3==1.26.18
# via
# pyppeteer
- # requests
# sentry-sdk
vine==5.1.0
# via
# amqp
# celery
# kombu
-w3lib==2.1.2
- # via requests-html
wcwidth==0.2.13
# via prompt-toolkit
websockets==10.4
- # via
- # -r requirements/base.in
- # pyppeteer
+ # via pyppeteer
whitenoise==6.5.0
# via django-on-heroku
+yarl==1.9.4
+ # via aiohttp
zipp==3.16.2
# via importlib-metadata
diff --git a/requirements/ci.in b/requirements/ci.in
index 61ae914..076ba5b 100644
--- a/requirements/ci.in
+++ b/requirements/ci.in
@@ -6,4 +6,5 @@ bandit
# Testing
pyquery
pytest
+pytest-mock
pytest-django
diff --git a/requirements/ci.txt b/requirements/ci.txt
index eafbd45..4af1f45 100644
--- a/requirements/ci.txt
+++ b/requirements/ci.txt
@@ -4,6 +4,15 @@
#
# ./bin/compile_dependencies.sh
#
+aiohttp==3.9.3
+ # via
+ # -c requirements/base.txt
+ # -r requirements/base.txt
+aiosignal==1.3.1
+ # via
+ # -c requirements/base.txt
+ # -r requirements/base.txt
+ # aiohttp
amqp==5.2.0
# via
# -c requirements/base.txt
@@ -19,23 +28,22 @@ asgiref==3.7.2
# -c requirements/base.txt
# -r requirements/base.txt
# django
+attrs==23.2.0
+ # via
+ # -c requirements/base.txt
+ # -r requirements/base.txt
+ # aiohttp
bandit==1.7.5
# via -r requirements/ci.in
beautifulsoup4==4.12.2
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # bs4
billiard==4.2.0
# via
# -c requirements/base.txt
# -r requirements/base.txt
# celery
-bs4==0.0.1
- # via
- # -c requirements/base.txt
- # -r requirements/base.txt
- # requests-html
celery==5.3.6
# via
# -c requirements/base.txt
@@ -45,13 +53,7 @@ certifi==2023.7.22
# -c requirements/base.txt
# -r requirements/base.txt
# pyppeteer
- # requests
# sentry-sdk
-charset-normalizer==3.2.0
- # via
- # -c requirements/base.txt
- # -r requirements/base.txt
- # requests
click==8.1.7
# via
# -c requirements/base.txt
@@ -95,11 +97,12 @@ django-on-heroku==1.1.2
# via
# -c requirements/base.txt
# -r requirements/base.txt
-fake-useragent==1.2.1
+frozenlist==1.4.1
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # requests-html
+ # aiohttp
+ # aiosignal
gitdb==4.0.10
# via gitpython
gitpython==3.1.41
@@ -112,7 +115,7 @@ idna==3.4
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # requests
+ # yarl
importlib-metadata==6.8.0
# via
# -c requirements/base.txt
@@ -138,17 +141,18 @@ markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
-packaging==23.2
+multidict==6.0.5
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # gunicorn
- # pytest
-parse==1.19.1
+ # aiohttp
+ # yarl
+packaging==23.2
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # requests-html
+ # gunicorn
+ # pytest
pbr==5.11.1
# via stevedore
pluggy==1.3.0
@@ -178,19 +182,20 @@ pyppeteer==1.0.2
# via
# -c requirements/base.txt
# -r requirements/base.txt
- # requests-html
pyquery==2.0.0
# via
# -c requirements/base.txt
# -r requirements/base.txt
# -r requirements/ci.in
- # requests-html
pytest==7.4.2
# via
# -r requirements/ci.in
# pytest-django
+ # pytest-mock
pytest-django==4.5.2
# via -r requirements/ci.in
+pytest-mock==3.12.0
+ # via -r requirements/ci.in
python-dateutil==2.8.2
# via
# -c requirements/base.txt
@@ -210,15 +215,6 @@ regex==2023.8.8
# via
# -c requirements/base.txt
# -r requirements/base.txt
-requests==2.31.0
- # via
- # -c requirements/base.txt
- # -r requirements/base.txt
- # requests-html
-requests-html==0.10.0
- # via
- # -c requirements/base.txt
- # -r requirements/base.txt
rich==13.5.3
# via bandit
sentry-sdk==1.39.2
@@ -265,7 +261,6 @@ urllib3==1.26.18
# -c requirements/base.txt
# -r requirements/base.txt
# pyppeteer
- # requests
# sentry-sdk
vine==5.1.0
# via
@@ -274,11 +269,6 @@ vine==5.1.0
# amqp
# celery
# kombu
-w3lib==2.1.2
- # via
- # -c requirements/base.txt
- # -r requirements/base.txt
- # requests-html
wcwidth==0.2.13
# via
# -c requirements/base.txt
@@ -294,6 +284,11 @@ whitenoise==6.5.0
# -c requirements/base.txt
# -r requirements/base.txt
# django-on-heroku
+yarl==1.9.4
+ # via
+ # -c requirements/base.txt
+ # -r requirements/base.txt
+ # aiohttp
zipp==3.16.2
# via
# -c requirements/base.txt
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 6f08f0f..5bb0ecd 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -4,6 +4,15 @@
#
# ./bin/compile_dependencies.sh
#
+aiohttp==3.9.3
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
+aiosignal==1.3.1
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
+ # aiohttp
amqp==5.2.0
# via
# -c requirements/ci.txt
@@ -20,7 +29,11 @@ asgiref==3.7.2
# -r requirements/ci.txt
# django
attrs==23.2.0
- # via pytype
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
+ # aiohttp
+ # pytype
bandit==1.7.5
# via
# -c requirements/ci.txt
@@ -30,7 +43,6 @@ beautifulsoup4==4.12.2
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
- # bs4
billiard==4.2.0
# via
# -c requirements/ci.txt
@@ -38,11 +50,6 @@ billiard==4.2.0
# celery
black==23.9.1
# via -r requirements/dev.in
-bs4==0.0.1
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
- # requests-html
celery==5.3.6
# via
# -c requirements/ci.txt
@@ -52,13 +59,7 @@ certifi==2023.7.22
# -c requirements/ci.txt
# -r requirements/ci.txt
# pyppeteer
- # requests
# sentry-sdk
-charset-normalizer==3.2.0
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
- # requests
click==8.1.7
# via
# -c requirements/ci.txt
@@ -115,13 +116,14 @@ django-stubs==4.2.4
# via -r requirements/dev.in
django-stubs-ext==4.2.2
# via django-stubs
-fake-useragent==1.2.1
+flake8==6.1.0
+ # via -r requirements/dev.in
+frozenlist==1.4.1
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
- # requests-html
-flake8==6.1.0
- # via -r requirements/dev.in
+ # aiohttp
+ # aiosignal
gitdb==4.0.10
# via
# -c requirements/ci.txt
@@ -140,7 +142,7 @@ idna==3.4
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
- # requests
+ # yarl
importlab==0.8
# via pytype
importlib-metadata==6.8.0
@@ -187,6 +189,12 @@ mdurl==0.1.2
# -c requirements/ci.txt
# -r requirements/ci.txt
# markdown-it-py
+multidict==6.0.5
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
+ # aiohttp
+ # yarl
mypy==1.5.1
# via
# -r requirements/dev.in
@@ -209,11 +217,6 @@ packaging==23.2
# black
# gunicorn
# pytest
-parse==1.19.1
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
- # requests-html
pathspec==0.11.2
# via black
pbr==5.11.1
@@ -266,21 +269,24 @@ pyppeteer==1.0.2
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
- # requests-html
pyquery==2.0.0
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
- # requests-html
pytest==7.4.2
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
# pytest-django
+ # pytest-mock
pytest-django==4.5.2
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
+pytest-mock==3.12.0
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
python-dateutil==2.8.2
# via
# -c requirements/ci.txt
@@ -306,15 +312,6 @@ regex==2023.8.8
# via
# -c requirements/ci.txt
# -r requirements/ci.txt
-requests==2.31.0
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
- # requests-html
-requests-html==0.10.0
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
rich==13.5.3
# via
# -c requirements/ci.txt
@@ -391,7 +388,6 @@ urllib3==1.26.18
# -c requirements/ci.txt
# -r requirements/ci.txt
# pyppeteer
- # requests
# sentry-sdk
vine==5.1.0
# via
@@ -400,11 +396,6 @@ vine==5.1.0
# amqp
# celery
# kombu
-w3lib==2.1.2
- # via
- # -c requirements/ci.txt
- # -r requirements/ci.txt
- # requests-html
wcwidth==0.2.13
# via
# -c requirements/ci.txt
@@ -420,6 +411,11 @@ whitenoise==6.5.0
# -c requirements/ci.txt
# -r requirements/ci.txt
# django-on-heroku
+yarl==1.9.4
+ # via
+ # -c requirements/ci.txt
+ # -r requirements/ci.txt
+ # aiohttp
zipp==3.16.2
# via
# -c requirements/ci.txt
diff --git a/scraper/parser.py b/scraper/parser.py
index 4389e10..04286da 100644
--- a/scraper/parser.py
+++ b/scraper/parser.py
@@ -1,12 +1,28 @@
import json
import logging
-from typing import Optional
+import urllib.parse
+from typing import Generator, Optional
-import langdetect # type: ignore
-from bs4 import BeautifulSoup # type: ignore
+import langdetect
+from bs4 import BeautifulSoup
from django.utils.text import slugify
+from pyquery import PyQuery
-logger = logging.getLogger(__name__)
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def generate_filtered_links(html: str, sitemap: dict) -> Generator[str, None, None]:
+ doc = PyQuery(html)
+ anchors = doc.find("a")
+
+ for anchor in anchors:
+ try:
+ link = anchor.attrib["href"]
+ except KeyError:
+ pass
+ else:
+ if sitemap["filter"].search(link):
+ yield urllib.parse.urljoin(sitemap["base_url"], link)
def find_headline(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]:
@@ -29,7 +45,7 @@ def find_headline(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]
def find_summary(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]:
"""Use `parser` & `sitemap` to extract summary from article"""
- if sitemap["summary_selectors"] is None:
+ if not sitemap["summary_selectors"]:
return None
try:
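A small usage sketch for the new generate_filtered_links helper (not part of the patch); the sitemap stub carries only the two keys the generator reads, and the HTML and URLs are invented example values:

    import regex  # sitemap filters in this project are compiled with regex, not re

    from scraper.parser import generate_filtered_links

    sitemap = {
        "base_url": "https://www.aljazeera.com/",
        "filter": regex.compile(r"/news/"),
    }

    html = '<a href="/news/2024/2/10/example-story">Example</a>'
    links = list(generate_filtered_links(html=html, sitemap=sitemap))
    # -> ['https://www.aljazeera.com/news/2024/2/10/example-story']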
diff --git a/scraper/spiders.py b/scraper/spiders.py
index ede8e6d..dd36ed2 100644
--- a/scraper/spiders.py
+++ b/scraper/spiders.py
@@ -1,17 +1,14 @@
import asyncio
import logging
import random
-from http.cookiejar import DefaultCookiePolicy
-import pyppeteer
-import requests
-from django.conf import settings
-from requests_html import AsyncHTMLSession, HTMLResponse
-from websockets.exceptions import ConnectionClosedError # pyre-ignore
+import aiohttp # pyre-ignore
+from aiohttp import ClientError, ClientSession
from . import headers, parser
-logger = logging.getLogger(__name__)
+logger: logging.Logger = logging.getLogger(__name__)
class Spider:
@@ -20,11 +17,6 @@ class Spider:
headers (list): a collection of HTTP headers
Instance Attributes:
- event_loop: the event loop is explicitly set on the `Spider` instance and
- passed to the requests session in order to minimize the risk of
- attaching Futures to different event loops by accident
- asession (AsyncHTMLSession): requests session that supports asynchronous
- requests
sitemap (dict): contains information about a particular page
starting_urls (list): the urls where each `Spider` instance searches for
links
@@ -35,80 +27,57 @@ class Spider:
headers = headers.headers
- def __init__(self, starting_urls: list[str], sitemap: dict):
- self.event_loop = asyncio.get_event_loop()
- self.asession = AsyncHTMLSession(loop=self.event_loop)
+ def __init__(self, starting_urls: list[str], sitemap: dict) -> None:
self.sitemap: dict = sitemap
self.starting_urls: list[str] = starting_urls
self.links: set[str] = set()
self.articles: set[str] = set()
- @property
- def asession(self):
- return self._asession
+ async def connect(self, session: ClientSession, url: str) -> str | None: # pyre-ignore
+ headers = random.choice(self.headers)  # nosec
- @asession.setter
- def asession(self, asession: AsyncHTMLSession):
- self._asession = asession
- self._asession.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[]))
-
- async def connect(self, url: str) -> HTMLResponse | None:
- """GET request wrapper"""
try:
- response = await self.asession.get( # pyre-ignore
- url,
- headers=random.choice(self.headers), # nosec
- timeout=settings.REQUESTS_TIMEOUT,
- )
- except requests.exceptions.RequestException as e:
- logger.error("Could not fetch %s (%s)", url, e)
+ async with session.get(url, headers=headers) as response:
+ html = await response.text()
+ except (ClientError, asyncio.TimeoutError) as exc:
+ logger.error("Could not fetch %s", url, exc_info=exc)
+ return None
+ return html
+
+ async def get_links(self, session: ClientSession, url: str) -> None:
+ html = await self.connect(session=session, url=url)
+ if not html:
+ return None
+
+ for link in parser.generate_filtered_links(html=html, sitemap=self.sitemap):
+ self.links.add(link)
+
+ async def scrape(self, session: ClientSession, link: str) -> None:
+ html = await self.connect(session=session, url=link)
+ if not html:
return None
- return response
-
- async def get_links(self, url: str) -> None:
- response = await self.connect(url)
- if not response:
- return
-
- if self.sitemap["javascript_required"]:
- try:
- await response.html.arender()
- except pyppeteer.errors.TimeoutError as e:
- logger.error("Could not render JavaScript for %s (%s)", url, e)
- for link in response.html.absolute_links:
- if self.sitemap["filter"].search(link):
- self.links.add(link)
-
- async def scrape(self, url: str) -> None:
- response = await self.connect(url)
- if not response:
- return
-
- html = response.text
- article = parser.parse(html, self.sitemap, url)
- if article:
- self.articles.add(article)
-
- async def collect_links(self) -> None:
- """
- Create & gather tasks for collection of links
- """
- coros = [self.get_links(url) for url in self.starting_urls]
+
+ article = parser.parse(html, sitemap=self.sitemap, url=link)
+ if not article:
+ return None
+
+ self.articles.add(article)
+
+ async def collect_links(self, session: ClientSession, starting_urls: list[str]) -> None:
+ coros = (self.get_links(session, url) for url in starting_urls)
await asyncio.gather(*coros)
- async def collect_metadata(self) -> None:
- """
- Create & gather tasks for scraping
- """
- coros = [self.scrape(link) for link in self.links]
+ async def collect_metadata(self, session: ClientSession, links: set[str]) -> None:
+ coros = (self.scrape(session, link) for link in links)
await asyncio.gather(*coros)
+ async def main(self) -> None:
+ async with aiohttp.ClientSession(
+ timeout=aiohttp.ClientTimeout(total=60),
+ ) as session:
+ await self.collect_links(session, self.starting_urls)
+ await self.collect_metadata(session, self.links)
+
def run(self):
- """
- Run the `spider` instance inside the event loop
- """
- try:
- self.event_loop.run_until_complete(self.collect_links())
- self.event_loop.run_until_complete(self.collect_metadata())
- except ConnectionClosedError as ex:
- logger.warning("Connection closed", exc_info=ex)
+ asyncio.run(self.main())
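A sketch that mirrors the call path in articles/tasks.py, showing how the aiohttp-based Spider is driven end to end; the title is an example value:

    import json

    from articles.models import Source
    from scraper.spiders import Spider

    source = Source.objects.get(title="Al Jazeera")
    sitemap = source.to_dict()
    starting_urls = [sitemap["base_url"] + path for path in sitemap["paths"]]

    spider = Spider(starting_urls, sitemap)
    spider.run()  # one aiohttp session: collect links, then scrape their metadata

    # spider.articles holds JSON strings produced by parser.parse()
    articles = [json.loads(raw) for raw in spider.articles]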
diff --git a/scraper/tasks.py b/scraper/tasks.py
index ff45b77..c13fd7d 100644
--- a/scraper/tasks.py
+++ b/scraper/tasks.py
@@ -1,6 +1,6 @@
magazines = {
"en": {
- "schedule": 3600,
+ "schedule": 120,
"titles": [
"Al Jazeera",
"Associated Press",
diff --git a/scraper/tests/conftest.py b/scraper/tests/conftest.py
index 057c6eb..eb13753 100644
--- a/scraper/tests/conftest.py
+++ b/scraper/tests/conftest.py
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
import pytest
import regex
@@ -18,12 +20,12 @@ def sitemap_aj():
@pytest.fixture
-def starting_urls_aj():
+def starting_urls_aj() -> List[str]:
return ["https://www.aljazeera.com/news/"]
@pytest.fixture
-def expected_aj():
+def expected_aj() -> Dict[str, Dict[str, str]]:
expected = {
"asian_cup": {
'headline': 'Asian Cup final brings FIFA World Cup frenzy back to Qatar’s Souq Waqif',
diff --git a/scraper/tests/integration/test_spider.py b/scraper/tests/integration/test_spider.py
index 02b69d2..3e3ca05 100644
--- a/scraper/tests/integration/test_spider.py
+++ b/scraper/tests/integration/test_spider.py
@@ -1,21 +1,41 @@
import json
from pathlib import Path
+from typing import Any, Dict
import pytest
+from django.utils.text import slugify
-import scraper
+from articles.constants import Language, PublicationType
+from articles.models import Source
from scraper.spiders import Spider
+from ..mocks import MockResponse
from ..utils import read_file
-FILES_DIR = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
+FILES_DIR: Path = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
#
# fixtures
#
@pytest.fixture
-def contents():
+def source() -> Source:
+ return Source(
+ title="Al Jazeera",
+ slug=slugify("Al Jazeera"),
+ publication_type=PublicationType.newspaper,
+ language=Language.en,
+ url="https://www.aljazeera.com/",
+ paths=["news/"],
+ regex="(? Dict[str, Dict[str, Any]]:
contents = {
"_start": {
"link": "https://www.aljazeera.com/news/",
@@ -87,15 +107,30 @@ def contents():
#
# tests
#
-def test_run_spider(starting_urls_aj, sitemap_aj, contents, expected_aj, requests_mock):
- spider = Spider(starting_urls_aj, sitemap_aj)
+@pytest.mark.django_db
+def test_run_spider(source, contents, expected_aj, mocker) -> None:
+ #
+ # setup
+ #
+ def return_value(*args, **kwargs):
+ for k, v in contents.items():
+ if args[0] == v["link"]:
+ return MockResponse(text=v["content"])
+
+ mocker.patch("aiohttp.ClientSession.get", side_effect=return_value)
- for item in contents.values():
- requests_mock.get(item["link"], text=item["content"])
+ #
+ # asserts
+ #
+ sitemap = source.to_dict()
+ starting_urls = [
+ sitemap["base_url"] + path for path in sitemap["paths"]
+ ]
+ spider = Spider(starting_urls, sitemap)
spider.run()
- articles = [json.loads(article) for article in list(spider.articles)]
+ articles = [json.loads(article) for article in spider.articles]
assert len(articles) == 12
diff --git a/scraper/tests/mocks.py b/scraper/tests/mocks.py
new file mode 100644
index 0000000..4ad6cd6
--- /dev/null
+++ b/scraper/tests/mocks.py
@@ -0,0 +1,13 @@
+class MockResponse:
+ def __init__(self, status_code: int | None = None, text: str = ""):
+ self.status_code = status_code or 200
+ self._text = text
+
+ async def text(self):
+ return self._text
+
+ async def __aenter__(self):
+ return self
+
+ async def __aexit__(self, exc_type, exc, tb):
+ pass
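A minimal sketch of how MockResponse pairs with pytest-mock, following the same pattern as the spider tests in this patch; the test name and URL are illustrative only:

    import aiohttp
    import pytest

    from scraper.tests.mocks import MockResponse


    @pytest.mark.asyncio
    async def test_mocked_get_round_trip(mocker) -> None:
        # Patching ClientSession.get to return the mock lets `async with
        # session.get(...)` yield a MockResponse, whose awaitable text()
        # stands in for aiohttp's response.text().
        mocker.patch(
            "aiohttp.ClientSession.get",
            return_value=MockResponse(text="<html></html>"),
        )

        async with aiohttp.ClientSession() as session:
            async with session.get("https://example.com") as response:
                assert await response.text() == "<html></html>"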
diff --git a/scraper/tests/unit/test_parsers.py b/scraper/tests/unit/test_parsers.py
index e02d625..da7e008 100644
--- a/scraper/tests/unit/test_parsers.py
+++ b/scraper/tests/unit/test_parsers.py
@@ -7,13 +7,13 @@
from ..utils import read_file
-FILES_DIR = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
+FILES_DIR: Path = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
PAGE = "indonesia"
URL = "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-indonesia-on-final-day-of-campaign"
-def test_find_headline(sitemap_aj, expected_aj):
+def test_find_headline(sitemap_aj, expected_aj) -> None:
html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html")
soup = BeautifulSoup(html, "lxml")
@@ -22,7 +22,7 @@ def test_find_headline(sitemap_aj, expected_aj):
assert headline_text == expected_aj[f"{PAGE}"]["headline"]
-def test_find_summary(sitemap_aj, expected_aj):
+def test_find_summary(sitemap_aj, expected_aj) -> None:
html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html")
soup = BeautifulSoup(html, "lxml")
@@ -31,7 +31,7 @@ def test_find_summary(sitemap_aj, expected_aj):
assert summary == expected_aj[f"{PAGE}"]["summary"]
-def test_find_language(sitemap_aj, expected_aj):
+def test_find_language(sitemap_aj, expected_aj) -> None:
html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html")
soup = BeautifulSoup(html, "lxml")
@@ -40,7 +40,7 @@ def test_find_language(sitemap_aj, expected_aj):
assert lang == expected_aj[f"{PAGE}"]["language"]
-def test_parse(sitemap_aj, expected_aj):
+def test_parse(sitemap_aj, expected_aj) -> None:
html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html")
json_data = parse(html, sitemap_aj, url=URL)
diff --git a/scraper/tests/unit/test_spider.py b/scraper/tests/unit/test_spider.py
index d0233f1..d407c91 100644
--- a/scraper/tests/unit/test_spider.py
+++ b/scraper/tests/unit/test_spider.py
@@ -1,51 +1,77 @@
import json
+import logging
from pathlib import Path
+import aiohttp # pyre-ignore
import pytest
-from requests_html import AsyncHTMLSession
+from aiohttp import ClientError
from scraper.spiders import Spider
+from ..mocks import MockResponse
from ..utils import read_file
-FILES_DIR = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
+FILES_DIR: Path = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj"
@pytest.mark.asyncio
-async def test_collect_links(starting_urls_aj, sitemap_aj, requests_mock):
+async def test_connect_error(starting_urls_aj, sitemap_aj, mocker, caplog) -> None:
spider = Spider(starting_urls_aj, sitemap_aj)
- spider.asession = AsyncHTMLSession()
+ mocker.patch("aiohttp.ClientSession.get", side_effect=HTTPError)
+
+ with caplog.at_level(logging.ERROR):
+ async with aiohttp.ClientSession() as session:
+ html = await spider.connect(session, starting_urls_aj[0])
+
+ assert html is None
+
+ assert len(caplog.messages) == 1
+ assert caplog.messages[0] == f"Could not fetch {starting_urls_aj[0]}"
+
+
+@pytest.mark.asyncio
+async def test_collect_links(starting_urls_aj, sitemap_aj, mocker) -> None:
+ spider = Spider(starting_urls_aj, sitemap_aj)
html = read_file(directory=FILES_DIR, file_name="_start.html")
- requests_mock.get(starting_urls_aj[0], text=html)
+ mock_response = MockResponse(status_code=200, text=html)
+ mocker.patch("aiohttp.ClientSession.get", return_value=mock_response)
- await spider.collect_links()
+ async with aiohttp.ClientSession() as session:
+ await spider.collect_links(session, starting_urls_aj)
- assert len(spider.links) == 12
+ assert len(spider.links) == 12
@pytest.mark.asyncio
async def test_collect_metadata(
- starting_urls_aj, sitemap_aj, expected_aj, requests_mock
-):
+ starting_urls_aj, sitemap_aj, expected_aj, mocker
+) -> None:
#
# setup
#
spider = Spider(starting_urls_aj, sitemap_aj)
- spider.links = ["https://indonesia.com", "https://taiwan.com"]
- spider.asession = AsyncHTMLSession()
+ spider.links = {"https://indonesia.com", "https://taiwan.com"}
html_indonesia = read_file(directory=FILES_DIR, file_name="indonesia.html")
html_taiwan = read_file(directory=FILES_DIR, file_name="taiwan.html")
- requests_mock.get("https://indonesia.com", text=html_indonesia)
- requests_mock.get("https://taiwan.com", text=html_taiwan)
+ def return_value(*args, **kwargs):
+ mock_response1 = MockResponse(status_code=200, text=html_indonesia)
+ mock_response2 = MockResponse(status_code=200, text=html_taiwan)
+
+ if args[0] == "https://indonesia.com":
+ return mock_response1
+ elif args[0] == "https://taiwan.com":
+ return mock_response2
+
+ mocker.patch("aiohttp.ClientSession.get", side_effect=return_value)
#
# asserts
#
- await spider.collect_metadata()
+ async with aiohttp.ClientSession() as session:
+ await spider.collect_metadata(session, spider.links)
articles = [json.loads(article) for article in spider.articles]
diff --git a/scraper/tests/utils.py b/scraper/tests/utils.py
index ea7197a..e61ec4a 100644
--- a/scraper/tests/utils.py
+++ b/scraper/tests/utils.py
@@ -1,4 +1,4 @@
-def read_file(directory, file_name):
+def read_file(directory, file_name) -> str:
start_page = directory / file_name
with open(start_page, "r") as file: