Clean up models and search view, update tests and fixtures

pi-sigma · Jan 13, 2024 · 7b79d14
1 parent 3d480af
commit 7b79d14
Show file tree

Hide file tree

Showing 15 changed files with 71 additions and 149 deletions.
diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -19,7 +19,7 @@ RUN apt-get update \
 COPY . /usr/src/app
 WORKDIR /usr/src/app
 
-RUN python manage.py collectstatic --no-input
+RUN python manage.py collectstatic --link --no-input
 
 # patch
 RUN ./patches/pyppeteer_patch.sh

diff --git a/articles/management/commands/scrape.py b/articles/management/commands/scrape.py
@@ -1,9 +1,3 @@
-"""
-Scheduler for scraping
-
-Jobs are stored in a Django job store. Old jobs should be deleted when
-the sources are changed; otherwise the scheduler will use the old information.
-"""
 import json
 import logging
 from datetime import datetime
@@ -23,11 +17,10 @@
 
 logger = logging.getLogger(__name__)
 
-INTERVAL = 480  # interval in minutes for scraping
+SCRAPING_INTERVAL = 1  # minutes
 
 
 def scrape(sitemap: dict):
-    """Scrape newspapers/journals and store articles in database."""
     Spider.crawl(sitemap)
     data = [json.loads(article) for article in Spider.articles]
 
@@ -57,16 +50,13 @@ def delete_old_job_executions(max_age=604_800):
 
 
 class Command(BaseCommand):
-    """Create jobs."""
-
     def handle(self, *args, **options):
         scheduler = BlockingScheduler(
             timezone=settings.TIME_ZONE,
             executors={"default": ThreadPoolExecutor(1)},
         )
         scheduler.add_jobstore(DjangoJobStore(), "default")
 
-        # jobs for scraping newspapers/journals
         sources = Source.objects.all()
         for index, source in enumerate(sources):
             source_id = f"Scraping {index + 1}: {source.name}"
@@ -77,7 +67,7 @@ def handle(self, *args, **options):
                     scrape,
                     args=[source.to_dict()],
                     trigger="interval",
-                    minutes=INTERVAL,
+                    minutes=SCRAPING_INTERVAL,
                     misfire_grace_time=600,
                     id=source_id,
                     max_instances=1,

diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 4.2.5 on 2023-09-19 18:32
+# Generated by Django 4.2.5 on 2024-01-13 14:26
 
 import django.core.validators
 import django.db.models.deletion
@@ -27,18 +27,15 @@ class Migration(migrations.Migration):
                 (
                     "name",
                     models.CharField(
-                        help_text="The name of the source",
-                        max_length=128,
-                        unique=True,
-                        verbose_name="name",
+                        help_text="The name of the source", max_length=128, unique=True
                     ),
                 ),
                 (
                     "slug",
                     models.SlugField(
                         blank=True,
                         help_text="The slug of the source for SEO-friendly urls",
-                        verbose_name="Slug",
+                        max_length=255,
                     ),
                 ),
                 (
@@ -47,7 +44,6 @@ class Migration(migrations.Migration):
                         choices=[("newspaper/journal", "Newspaper or journal")],
                         help_text="The type of publication of the source",
                         max_length=24,
-                        verbose_name="publication type",
                     ),
                 ),
                 (
@@ -57,23 +53,21 @@ class Migration(migrations.Migration):
                         choices=[("en", "English")],
                         help_text="The language of the article",
                         max_length=4,
-                        verbose_name="language",
                     ),
                 ),
                 (
                     "link",
                     models.URLField(
                         help_text="The link to the source",
+                        max_length=255,
                         unique=True,
                         validators=[django.core.validators.URLValidator],
-                        verbose_name="link",
                     ),
                 ),
                 (
                     "paths",
                     models.JSONField(
-                        help_text="A list of resource paths where the scraper will look for articles",
-                        verbose_name="paths",
+                        help_text="A list of resource paths where the scraper will look for articles"
                     ),
                 ),
                 (
@@ -82,31 +76,27 @@ class Migration(migrations.Migration):
                         blank=True,
                         help_text="Regular expression for filtering hyper-links found at the resource paths",
                         max_length=255,
-                        verbose_name="regex",
                     ),
                 ),
                 (
-                    "javascript",
+                    "javascript_required",
                     models.BooleanField(
                         default=False,
                         help_text="Whether the parsing of articles by this source requires rendering of JavaScript",
-                        verbose_name="render javascript",
                     ),
                 ),
                 (
                     "headline_selectors",
                     models.JSONField(
-                        help_text="Information about the structure of the target page needed to extract the headline of articles published by this source",
-                        verbose_name="headline selectors",
+                        help_text="Information about the structure of the target page needed to extract the headline of articles published by this source"
                     ),
                 ),
                 (
-                    "body_selectors",
+                    "summary_selectors",
                     models.JSONField(
                         blank=True,
-                        help_text="Information about the structure of the target page needed to extract the body of articles published by this source",
+                        help_text="Information about the structure of the target page needed to extract the summary of articles published by this source",
                         null=True,
-                        verbose_name="body selectors",
                     ),
                 ),
             ],
@@ -128,15 +118,15 @@ class Migration(migrations.Migration):
                 ),
                 (
                     "headline",
-                    models.TextField(
-                        help_text="The headline of the article", verbose_name="headline"
+                    models.CharField(
+                        help_text="The headline of the article", max_length=200
                     ),
                 ),
                 (
                     "slug",
                     models.SlugField(
                         help_text="The slug of the article for SEO-friendly urls",
-                        verbose_name="Slug",
+                        max_length=255,
                     ),
                 ),
                 ("created_at", models.DateTimeField()),
@@ -146,25 +136,17 @@ class Migration(migrations.Migration):
                         choices=[("en", "English")],
                         help_text="The language of the article",
                         max_length=4,
-                        verbose_name="language",
                     ),
                 ),
                 (
                     "link",
                     models.URLField(
-                        help_text="The link to the article",
-                        unique=True,
-                        verbose_name="link",
+                        help_text="The link to the article", max_length=255, unique=True
                     ),
                 ),
                 (
-                    "body",
-                    models.TextField(
-                        blank=True,
-                        help_text="The body of the article",
-                        null=True,
-                        verbose_name="body",
-                    ),
+                    "summary",
+                    models.TextField(blank=True, help_text="A summary of the article"),
                 ),
                 (
                     "source",

diff --git a/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py b/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py
diff --git a/articles/models.py b/articles/models.py
@@ -30,33 +30,32 @@ class Article(models.Model):
         __str__: string representation for the admin area
     """
 
-    headline = models.TextField(
-        _("headline"),
+    headline = models.CharField(
+        max_length=200,
         help_text=_("The headline of the article"),
     )
     slug = models.SlugField(
-        _("Slug"),
+        max_length=255,
         help_text=_("The slug of the article for SEO-friendly urls"),
     )
     created_at = models.DateTimeField()
     language = models.CharField(
-        _("language"),
         max_length=4,
         choices=Language.choices,
         blank=False,
         help_text=_("The language of the article"),
     )
     link = models.URLField(
-        _("link"), unique=True, help_text=_("The link to the article")
+        max_length=255,
+        unique=True,
+        help_text=_("The link to the article"),
     )
     summary = models.TextField(
-        _("summary"),
-        null=True,
         blank=True,
         help_text=_("A summary of the article"),
     )
     source = models.ForeignKey(
-        "Source",
+        to="Source",
         on_delete=models.CASCADE,
         related_name="articles",
         help_text=_("The source where the article is published"),
@@ -85,8 +84,8 @@ class Source(models.Model):
             the base url to tell the scraper where to look for hyper-links
             ('https://example.com/path1/')
         regex (models.CharField): a regular expression for filtering links
-        javascript (models.BooleanField): True if JavaScript must be rendered before
-            data can be extracted from the webpage, False otherwise
+        javascript_required (models.BooleanField): True if JavaScript must be rendered
+            before data can be extracted from the webpage, False otherwise
         headline_selectors (models.JSONField): information about the CSS selectors
             needed to extract the headline of an article
         summary_selectors (models.JSONField): information about the CSS selectors
@@ -99,71 +98,63 @@ class Source(models.Model):
     """
 
     name = models.CharField(
-        _("name"),
         max_length=128,
         unique=True,
         blank=False,
         help_text=_("The name of the source"),
     )
     slug = models.SlugField(
-        _("Slug"),
+        max_length=255,
         blank=True,
         help_text=_("The slug of the source for SEO-friendly urls"),
     )
     publication_type = models.CharField(
-        _("publication type"),
         max_length=24,
         choices=PublicationType.choices,
         blank=False,
         help_text=_("The type of publication of the source"),
     )
     language = models.CharField(
-        _("language"),
         max_length=4,
         choices=Language.choices,
         blank=True,
         help_text=_("The language of the article"),
     )
     link = models.URLField(
-        _("link"),
         unique=True,
+        max_length=255,
         validators=[URLValidator],
         help_text=_("The link to the source"),
     )
     #
     # info related to scraping
     #
     paths = models.JSONField(
-        _("paths"),
         help_text=_(
             "A list of resource paths where the scraper will look for articles"
         ),
     )
     regex = models.CharField(
-        _("regex"),
         max_length=255,
         blank=True,
         help_text=(
             "Regular expression for filtering hyper-links found at the resource paths"
         ),
     )
-    javascript = models.BooleanField(
-        _("render javascript"),
+    javascript_required = models.BooleanField(
         default=False,
         help_text=_(
             "Whether the parsing of articles by this source requires rendering "
             "of JavaScript"
         ),
     )
     headline_selectors = models.JSONField(
-        _("headline selectors"),
         help_text=_(
             "Information about the structure of the target page needed to extract "
             "the headline of articles published by this source"
         ),
     )
     summary_selectors = models.JSONField(
-        _("summary selectors"),
         null=True,
         blank=True,
         help_text=_(
@@ -185,7 +176,7 @@ def to_dict(self):
             "base_url": self.link,
             "paths": self.paths,
             "language": self.language,
-            "javascript": self.javascript,
+            "javascript_required": self.javascript_required,
             "filter": regex.compile(self.regex),
             "headline_selectors": self.headline_selectors,
             "summary_selectors": self.summary_selectors,

diff --git a/articles/scraper/parser.py b/articles/scraper/parser.py
@@ -1,4 +1,3 @@
-"""Provides access to the parsing function(s)"""
 import json
 import logging
 from typing import Optional