From 7b79d14b25492b2b50ed92ba554e9874865f8ac4 Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Sat, 13 Jan 2024 15:26:59 +0100 Subject: [PATCH] Clean up models and search view, update tests and fixtures --- Dockerfile.dev | 2 +- articles/management/commands/scrape.py | 14 +----- articles/migrations/0001_initial.py | 48 ++++++------------- ...y_remove_source_body_selectors_and_more.py | 40 ---------------- articles/models.py | 35 +++++--------- articles/scraper/parser.py | 1 - articles/scraper/spider.py | 4 +- .../templates/articles/search_results.html | 4 +- articles/tests/conftest.py | 4 +- articles/tests/test_models.py | 2 +- articles/views.py | 32 ++++++------- docker-compose.yml | 2 +- fixtures/sources.json | 24 +++++----- nous_aggregator/settings/local.py | 2 - pyproject.toml | 6 +++ 15 files changed, 71 insertions(+), 149 deletions(-) delete mode 100644 articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py create mode 100644 pyproject.toml diff --git a/Dockerfile.dev b/Dockerfile.dev index 527d8f7..c4a0220 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -19,7 +19,7 @@ RUN apt-get update \ COPY . /usr/src/app WORKDIR /usr/src/app -RUN python manage.py collectstatic --no-input +RUN python manage.py collectstatic --link --no-input # patch RUN ./patches/pyppeteer_patch.sh diff --git a/articles/management/commands/scrape.py b/articles/management/commands/scrape.py index b6b84ab..a329856 100644 --- a/articles/management/commands/scrape.py +++ b/articles/management/commands/scrape.py @@ -1,9 +1,3 @@ -""" -Scheduler for scraping - -Jobs are stored in a Django job store. Old jobs should be deleted when -the sources are changed; otherwise the scheduler will use the old information. -""" import json import logging from datetime import datetime @@ -23,11 +17,10 @@ logger = logging.getLogger(__name__) -INTERVAL = 480 # interval in minutes for scraping +SCRAPING_INTERVAL = 1 # minutes def scrape(sitemap: dict): - """Scrape newspapers/journals and store articles in database.""" Spider.crawl(sitemap) data = [json.loads(article) for article in Spider.articles] @@ -57,8 +50,6 @@ def delete_old_job_executions(max_age=604_800): class Command(BaseCommand): - """Create jobs.""" - def handle(self, *args, **options): scheduler = BlockingScheduler( timezone=settings.TIME_ZONE, @@ -66,7 +57,6 @@ def handle(self, *args, **options): ) scheduler.add_jobstore(DjangoJobStore(), "default") - # jobs for scraping newspapers/journals sources = Source.objects.all() for index, source in enumerate(sources): source_id = f"Scraping {index + 1}: {source.name}" @@ -77,7 +67,7 @@ def handle(self, *args, **options): scrape, args=[source.to_dict()], trigger="interval", - minutes=INTERVAL, + minutes=SCRAPING_INTERVAL, misfire_grace_time=600, id=source_id, max_instances=1, diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py index b3bcf09..5abdce9 100644 --- a/articles/migrations/0001_initial.py +++ b/articles/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.5 on 2023-09-19 18:32 +# Generated by Django 4.2.5 on 2024-01-13 14:26 import django.core.validators import django.db.models.deletion @@ -27,10 +27,7 @@ class Migration(migrations.Migration): ( "name", models.CharField( - help_text="The name of the source", - max_length=128, - unique=True, - verbose_name="name", + help_text="The name of the source", max_length=128, unique=True ), ), ( @@ -38,7 +35,7 @@ class Migration(migrations.Migration): models.SlugField( blank=True, 
help_text="The slug of the source for SEO-friendly urls", - verbose_name="Slug", + max_length=255, ), ), ( @@ -47,7 +44,6 @@ class Migration(migrations.Migration): choices=[("newspaper/journal", "Newspaper or journal")], help_text="The type of publication of the source", max_length=24, - verbose_name="publication type", ), ), ( @@ -57,23 +53,21 @@ class Migration(migrations.Migration): choices=[("en", "English")], help_text="The language of the article", max_length=4, - verbose_name="language", ), ), ( "link", models.URLField( help_text="The link to the source", + max_length=255, unique=True, validators=[django.core.validators.URLValidator], - verbose_name="link", ), ), ( "paths", models.JSONField( - help_text="A list of resource paths where the scraper will look for articles", - verbose_name="paths", + help_text="A list of resource paths where the scraper will look for articles" ), ), ( @@ -82,31 +76,27 @@ class Migration(migrations.Migration): blank=True, help_text="Regular expression for filtering hyper-links found at the resource paths", max_length=255, - verbose_name="regex", ), ), ( - "javascript", + "javascript_required", models.BooleanField( default=False, help_text="Whether the parsing of articles by this source requires rendering of JavaScript", - verbose_name="render javascript", ), ), ( "headline_selectors", models.JSONField( - help_text="Information about the structure of the target page needed to extract the headline of articles published by this source", - verbose_name="headline selectors", + help_text="Information about the structure of the target page needed to extract the headline of articles published by this source" ), ), ( - "body_selectors", + "summary_selectors", models.JSONField( blank=True, - help_text="Information about the structure of the target page needed to extract the body of articles published by this source", + help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", null=True, - verbose_name="body selectors", ), ), ], @@ -128,15 +118,15 @@ class Migration(migrations.Migration): ), ( "headline", - models.TextField( - help_text="The headline of the article", verbose_name="headline" + models.CharField( + help_text="The headline of the article", max_length=200 ), ), ( "slug", models.SlugField( help_text="The slug of the article for SEO-friendly urls", - verbose_name="Slug", + max_length=255, ), ), ("created_at", models.DateTimeField()), @@ -146,25 +136,17 @@ class Migration(migrations.Migration): choices=[("en", "English")], help_text="The language of the article", max_length=4, - verbose_name="language", ), ), ( "link", models.URLField( - help_text="The link to the article", - unique=True, - verbose_name="link", + help_text="The link to the article", max_length=255, unique=True ), ), ( - "body", - models.TextField( - blank=True, - help_text="The body of the article", - null=True, - verbose_name="body", - ), + "summary", + models.TextField(blank=True, help_text="A summary of the article"), ), ( "source", diff --git a/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py b/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py deleted file mode 100644 index 88dd0e5..0000000 --- a/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py +++ /dev/null @@ -1,40 +0,0 @@ -# Generated by Django 4.2.5 on 2023-09-20 18:56 - -from django.db import migrations, models - - -class 
Migration(migrations.Migration): - dependencies = [ - ("articles", "0001_initial"), - ] - - operations = [ - migrations.RemoveField( - model_name="article", - name="body", - ), - migrations.RemoveField( - model_name="source", - name="body_selectors", - ), - migrations.AddField( - model_name="article", - name="summary", - field=models.TextField( - blank=True, - help_text="A summary of the article", - null=True, - verbose_name="summary", - ), - ), - migrations.AddField( - model_name="source", - name="summary_selectors", - field=models.JSONField( - blank=True, - help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", - null=True, - verbose_name="summary selectors", - ), - ), - ] diff --git a/articles/models.py b/articles/models.py index 34723b3..5e539f5 100644 --- a/articles/models.py +++ b/articles/models.py @@ -30,33 +30,32 @@ class Article(models.Model): __str__: string representation for the admin area """ - headline = models.TextField( - _("headline"), + headline = models.CharField( + max_length=200, help_text=_("The headline of the article"), ) slug = models.SlugField( - _("Slug"), + max_length=255, help_text=_("The slug of the article for SEO-friendly urls"), ) created_at = models.DateTimeField() language = models.CharField( - _("language"), max_length=4, choices=Language.choices, blank=False, help_text=_("The language of the article"), ) link = models.URLField( - _("link"), unique=True, help_text=_("The link to the article") + max_length=255, + unique=True, + help_text=_("The link to the article"), ) summary = models.TextField( - _("summary"), - null=True, blank=True, help_text=_("A summary of the article"), ) source = models.ForeignKey( - "Source", + to="Source", on_delete=models.CASCADE, related_name="articles", help_text=_("The source where the article is published"), @@ -85,8 +84,8 @@ class Source(models.Model): the base url to tell the scraper where to look for hyper-links ('https://example.com/path1/') regex (models.CharField): a regular expression for filtering links - javascript (models.BooleanField): True if JavaScript must be rendered before - data can be extracted from the webpage, False otherwise + javascript_required (models.BooleanField): True if JavaScript must be rendered + before data can be extracted from the webpage, False otherwise headline_selectors (models.JSONField): information about the CSS selectors needed to extract the headline of an article summary_selectors (models.JSONField): information about the CSS selectors @@ -99,34 +98,31 @@ class Source(models.Model): """ name = models.CharField( - _("name"), max_length=128, unique=True, blank=False, help_text=_("The name of the source"), ) slug = models.SlugField( - _("Slug"), + max_length=255, blank=True, help_text=_("The slug of the source for SEO-friendly urls"), ) publication_type = models.CharField( - _("publication type"), max_length=24, choices=PublicationType.choices, blank=False, help_text=_("The type of publication of the source"), ) language = models.CharField( - _("language"), max_length=4, choices=Language.choices, blank=True, help_text=_("The language of the article"), ) link = models.URLField( - _("link"), unique=True, + max_length=255, validators=[URLValidator], help_text=_("The link to the source"), ) @@ -134,21 +130,18 @@ class Source(models.Model): # info related to scraping # paths = models.JSONField( - _("paths"), help_text=_( "A list of resource paths where the scraper will look for articles" ), ) regex = 
models.CharField( - _("regex"), max_length=255, blank=True, help_text=( "Regular expression for filtering hyper-links found at the resource paths" ), ) - javascript = models.BooleanField( - _("render javascript"), + javascript_required = models.BooleanField( default=False, help_text=_( "Whether the parsing of articles by this source requires rendering " @@ -156,14 +149,12 @@ class Source(models.Model): ), ) headline_selectors = models.JSONField( - _("headline selectors"), help_text=_( "Information about the structure of the target page needed to extract " "the headline of articles published by this source" ), ) summary_selectors = models.JSONField( - _("summary selectors"), null=True, blank=True, help_text=_( @@ -185,7 +176,7 @@ def to_dict(self): "base_url": self.link, "paths": self.paths, "language": self.language, - "javascript": self.javascript, + "javascript_required": self.javascript_required, "filter": regex.compile(self.regex), "headline_selectors": self.headline_selectors, "summary_selectors": self.summary_selectors, diff --git a/articles/scraper/parser.py b/articles/scraper/parser.py index 9ee2b28..b872d23 100644 --- a/articles/scraper/parser.py +++ b/articles/scraper/parser.py @@ -1,4 +1,3 @@ -"""Provides access to the parsing function(s)""" import json import logging from typing import Optional diff --git a/articles/scraper/spider.py b/articles/scraper/spider.py index f26db6b..f643d8e 100644 --- a/articles/scraper/spider.py +++ b/articles/scraper/spider.py @@ -16,8 +16,6 @@ class Spider: """ - The Spider class is for extracting article metadata. - Class Attributes: headers (list): a collection of HTTP headers articles (set): a collection of JSON strings representing article @@ -67,7 +65,7 @@ async def get_links(self, asession: AsyncHTMLSession, url: str): if not response: return - if self.sitemap["javascript"]: + if self.sitemap["javascript_required"]: try: await response.html.arender(timeout=settings.REQUESTS_TIMEOUT_JS) except pyppeteer.errors.TimeoutError as e: diff --git a/articles/templates/articles/search_results.html b/articles/templates/articles/search_results.html index 9149abf..b22fa0f 100644 --- a/articles/templates/articles/search_results.html +++ b/articles/templates/articles/search_results.html @@ -6,7 +6,9 @@ {% block content %}
-<h2>Search Results for: "{{ query }}"</h2>
+{% if query %}
+<h2>Search Results for: "{{ query }}"</h2>
+{% endif %}
 {% for source in sources %}
diff --git a/articles/tests/conftest.py b/articles/tests/conftest.py
index a92a530..02504c2 100644
--- a/articles/tests/conftest.py
+++ b/articles/tests/conftest.py
@@ -18,7 +18,7 @@ def source_values():
         "publication_type": PublicationType.newspaper,
         "language": Language.en,
         "paths": ["world/"],
-        "javascript": False,
+        "javascript_required": False,
         "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
         "headline_selectors": {"tag": "h1", "attrs": {}},
         "summary_selectors": {"tag": "h2", "attrs": {}},
@@ -39,7 +39,7 @@ def source_values_2():
         "publication_type": PublicationType.newspaper,
         "language": Language.en,
         "paths": ["world/"],
-        "javascript": False,
+        "javascript_required": False,
         "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
         "headline_selectors": {"tag": "h1", "attrs": {}},
         "summary_selectors": {"tag": "h2", "attrs": {}},
diff --git a/articles/tests/test_models.py b/articles/tests/test_models.py
index 68e4dae..fc9417a 100644
--- a/articles/tests/test_models.py
+++ b/articles/tests/test_models.py
@@ -20,7 +20,7 @@ def test_source_to_dict(source_values):
     for attr_name in [
         "summary_selectors",
         "headline_selectors",
-        "javascript",
+        "javascript_required",
         "language",
         "paths",
     ]:
diff --git a/articles/views.py b/articles/views.py
index 68b4932..a8425c4 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -7,7 +7,6 @@ def index(request):
-    """Display latest articles for all sources."""
     context = {
         "sources": Source.objects.only("name", "link", "publication_type"),
     }
@@ -15,32 +14,29 @@ class SearchResultsView(ListView):
-    """Display sources + articles matching query."""
-
     model = Article
     fields = ["headline", "link", "body"]
     template_name = "articles/search_results.html"
 
     def get_context_data(self, **kwargs):
-        """Filter sources by query."""
-        query = self.request.GET.get("q")
-        # precaution: empty strings are also excluded on the client-side
-        if query in ("", " ", "  "):
-            query = "Please don't attempt to hack my website, thanks!"
-        regex = r"(?
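
Note on the rename (illustrative, not part of the patch): `javascript` becomes
`javascript_required` in the model, in `Source.to_dict()`, in the spider's
sitemap lookup, and in the test fixtures, so a stale key now fails loudly
instead of silently skipping JavaScript rendering. Below is a minimal sketch of
that contract in Python, using the fixture values from conftest.py; the
`make_sitemap` helper is a hypothetical stand-in for `Source.to_dict()` and is
not part of the patch:

    import re

    def make_sitemap() -> dict:
        # Hypothetical stand-in for Source.to_dict(), mirroring the keys the
        # patch keeps after the javascript -> javascript_required rename.
        return {
            "base_url": "https://example.com",
            "paths": ["world/"],
            "language": "en",
            "javascript_required": False,  # renamed flag read by Spider.get_links
            "filter": re.compile(r"[0-9]{4}/[0-9]{2}/[0-9]{2}"),
            "headline_selectors": {"tag": "h1", "attrs": {}},
            "summary_selectors": {"tag": "h2", "attrs": {}},
        }

    sitemap = make_sitemap()

    # A leftover "javascript" key would surface as a KeyError in the spider;
    # these assertions pin down the renamed contract.
    assert "javascript" not in sitemap
    assert sitemap["javascript_required"] is False
    assert sitemap["filter"].search("2024/01/13") is not None

Under this contract, the spider's `if self.sitemap["javascript_required"]:`
branch (spider.py above) reads as an explicit requirement check rather than an
ambiguous flag.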