Skip to content

Commit

Permalink
Replace BeautifulSoup with PyQuery
Browse files Browse the repository at this point in the history
  • Loading branch information
pi-sigma committed Mar 9, 2024
1 parent 453576f commit 691da45
Show file tree
Hide file tree
Showing 31 changed files with 544 additions and 388 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/django.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ jobs:
SECRET_KEY: dummy
DJANGO_ENV: BASE
SECURE_SSL_REDIRECT: False
run: pytest articles/tests/
run: |
pytest articles/tests/unit/
pytest articles/tests/integration/
pytest scraper/tests/
#
# Migrations
#
Expand Down
3 changes: 0 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ RUN pip install pip -U
COPY /requirements/* /app/requirements/
RUN pip install -r /app/requirements/dev.txt

# pyppeteer deps (https://stackoverflow.com/a/71935536)
# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt


#
# Final
Expand Down
30 changes: 21 additions & 9 deletions articles/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 5.0.1 on 2024-02-26 20:25
# Generated by Django 5.0.1 on 2024-03-01 20:14

import django.db.models.deletion
import django.db.models.functions.text
Expand Down Expand Up @@ -63,7 +63,7 @@ class Migration(migrations.Migration):
(
"paths",
models.JSONField(
help_text="A list of resource paths where the scraper will look for articles"
help_text="List of resource paths where the scraper will look for articles"
),
),
(
Expand All @@ -82,22 +82,34 @@ class Migration(migrations.Migration):
),
),
(
"headline_selectors",
"headline_search_params_find",
models.JSONField(
help_text="Information about the structure of the target page needed to extract the headline of articles published by this source"
help_text="Selectors for extracting the headline of articles"
),
),
(
"summary_selectors",
"headline_search_params_remove",
models.JSONField(
blank=True,
help_text="Information about the structure of the target page needed to extract the summary of articles published by this source",
null=True,
help_text="Selectors for HTML elements that need to be removed from the headline"
),
),
(
"summary_search_params_find",
models.JSONField(
default=[],
help_text="Selectors for extracting the summary of articles",
),
),
(
"summary_search_params_remove",
models.JSONField(
default=[],
help_text="Selectors for HTML elements that need to be removed from the summary",
),
),
],
options={
"ordering": [django.db.models.functions.text.Lower("title")],
"ordering": [django.db.models.functions.text.Lower("name")],
},
),
migrations.CreateModel(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Generated by Django 5.0.1 on 2024-03-03 10:29

import django.db.models.functions.text
from django.db import migrations, models


class Migration(migrations.Migration):
    """Follow-up schema adjustments to the initial ``articles`` migration.

    Auto-generated by Django 5.0.1 on 2024-03-03. Restores case-insensitive
    ordering for ``Source``, widens several length-limited ``Article`` fields,
    and replaces the literal JSONField defaults with callables.
    """

    dependencies = [
        # Must run after the app's initial schema migration.
        ("articles", "0001_initial"),
    ]

    operations = [
        # Order Source rows by their lowercased title (case-insensitive sort).
        migrations.AlterModelOptions(
            name="source",
            options={"ordering": [django.db.models.functions.text.Lower("title")]},
        ),
        # Widen the headline column to 512 characters.
        migrations.AlterField(
            model_name="article",
            name="headline",
            field=models.CharField(
                help_text="The headline of the article", max_length=512
            ),
        ),
        # Widen the slug column to 1024 characters (slugs derive from the
        # headline, so they need headroom beyond the headline's own limit).
        migrations.AlterField(
            model_name="article",
            name="slug",
            field=models.SlugField(
                help_text="The slug of the article for SEO-friendly urls",
                max_length=1024,
            ),
        ),
        # Widen the article URL to 512 characters; uniqueness enforced at
        # the database level.
        migrations.AlterField(
            model_name="article",
            name="url",
            field=models.URLField(
                help_text="The link to the article", max_length=512, unique=True
            ),
        ),
        # Callable default: each new row gets str() == "" rather than a
        # value shared between rows.
        # NOTE(review): this stores an empty *string* in a JSONField whose
        # sibling "..._remove" defaults to a list — confirm consumers of
        # summary_search_params_find expect a string here, not a list.
        migrations.AlterField(
            model_name="source",
            name="summary_search_params_find",
            field=models.JSONField(
                default=str,
                help_text="Selectors for extracting the summary of articles",
            ),
        ),
        # Callable default: each new row gets a fresh empty list.
        migrations.AlterField(
            model_name="source",
            name="summary_search_params_remove",
            field=models.JSONField(
                default=list,
                help_text="Selectors for HTML elements that need to be removed from the summary",
            ),
        ),
        # Widen the source URL to 512 characters; uniqueness enforced at
        # the database level.
        migrations.AlterField(
            model_name="source",
            name="url",
            field=models.URLField(
                help_text="The url of the source", max_length=512, unique=True
            ),
        ),
    ]
66 changes: 40 additions & 26 deletions articles/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import regex
from django.core.validators import URLValidator
from django.db import models
from django.db.models.functions import Lower
from django.utils.translation import gettext_lazy as _
Expand All @@ -14,14 +13,13 @@ class Article(models.Model):
Fields:
headline (models.TextField): headline of the article
slug (models.SlugField): slug of the article (generated from headline)
summary (models.TextField): short paragraph summarizing the article
created_at (models.DateTimeField): date the article was added to the
database. Mostly corresponds to actual publication date, though
this can vary. Actual dates are not used because their format
varies a lot, hence they are difficult to parse.
language (models.CharField): the language of the article
url (models.URLField): link to the article
body (models.TextField): either the actual body of the article,
or a short descriptive paragraph
Relations:
source (ForeignKey): the source of the article
Expand All @@ -31,13 +29,17 @@ class Article(models.Model):
"""

headline = models.CharField(
max_length=200,
max_length=512,
help_text=_("The headline of the article"),
)
slug = models.SlugField(
max_length=255,
max_length=1024,
help_text=_("The slug of the article for SEO-friendly urls"),
)
summary = models.TextField(
blank=True,
help_text=_("A summary of the article"),
)
created_at = models.DateTimeField()
language = models.CharField(
max_length=4,
Expand All @@ -46,14 +48,10 @@ class Article(models.Model):
help_text=_("The language of the article"),
)
url = models.URLField(
max_length=255,
max_length=512,
unique=True,
help_text=_("The link to the article"),
)
summary = models.TextField(
blank=True,
help_text=_("A summary of the article"),
)
source = models.ForeignKey(
to="Source",
on_delete=models.CASCADE,
Expand All @@ -65,7 +63,7 @@ class Meta:
ordering = ("-created_at",)
indexes = [models.Index(fields=["headline", "url"])]

def __str__(self) -> str:
    """Human-readable form: ``<source>: <headline>``."""
    return "{}: {}".format(self.source, self.headline)


Expand All @@ -74,7 +72,7 @@ class Source(models.Model):
Metadata about the source of articles
Fields:
title (models.CharField): name of the source
title (models.CharField): name/title of the source
slug (models.SlugField): slug of the source
publication_type (models.CharField): the type of publication of the
source (newspaper, journal, blog...)
Expand Down Expand Up @@ -122,15 +120,15 @@ class Source(models.Model):
)
url = models.URLField(
unique=True,
max_length=255,
max_length=512,
help_text=_("The url of the source"),
)
#
# info related to scraping
# data related to scraping
#
paths = models.JSONField(
help_text=_(
"A list of resource paths where the scraper will look for articles"
"List of resource paths where the scraper will look for articles"
),
)
regex = models.CharField(
Expand All @@ -147,18 +145,26 @@ class Source(models.Model):
"of JavaScript"
),
)
headline_selectors = models.JSONField(
headline_search_params_find = models.JSONField(
help_text=_(
"Information about the structure of the target page needed to extract "
"the headline of articles published by this source"
"Selectors for extracting the headline of articles"
),
)
summary_selectors = models.JSONField(
null=True,
blank=True,
headline_search_params_remove = models.JSONField(
help_text=_(
"Selectors for HTML elements that need to be removed from the headline"
),
)
summary_search_params_find = models.JSONField(
default=str,
help_text=_(
"Selectors for extracting the summary of articles"
),
)
summary_search_params_remove = models.JSONField(
default=list,
help_text=_(
"Information about the structure of the target page needed to extract "
"the summary of articles published by this source"
"Selectors for HTML elements that need to be removed from the summary"
),
)

Expand All @@ -167,7 +173,7 @@ class Meta:
Lower("title"),
]

def __str__(self) -> str:
    """Human-readable form: the source's title."""
    return "{}".format(self.title)

def to_dict(self):
Expand All @@ -177,7 +183,15 @@ def to_dict(self):
"language": self.language,
"javascript_required": self.javascript_required,
"filter": regex.compile(self.regex),
"headline_selectors": self.headline_selectors,
"summary_selectors": self.summary_selectors,
"search_params": {
"headline": {
"find": self.headline_search_params_find,
"remove": self.headline_search_params_remove,
},
"summary": {
"find": self.summary_search_params_find,
"remove": self.summary_search_params_remove,
},
},
}
return sitemap
43 changes: 32 additions & 11 deletions articles/tasks.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import json
import logging

from celery import group, shared_task
from django.db.utils import DatabaseError
from django.utils import timezone

import scraper
from scraper.tasks import magazines

from .models import Article, Source

# Module-level logger keyed to this module's dotted path, per the stdlib
# logging convention. The previous code passed the literal string
# "__name__", which created a logger actually *named* "__name__" instead
# of one tied to this module — breaking per-module log filtering/handlers.
logger = logging.getLogger(__name__)


@shared_task
def get_articles_for_source(source_title: str) -> None:
Expand All @@ -21,17 +25,34 @@ def get_articles_for_source(source_title: str) -> None:
spider.run()
articles = [json.loads(article) for article in spider.articles]

Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in articles
], ignore_conflicts=True)
# try bulk create, revert to individual db saves in case of error
try:
Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in articles
], ignore_conflicts=True)
except DatabaseError as exc:
logger.error("Bulk create failed", exc_info=exc)
for article_data in articles:
try:
Article.objects.create(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
)
except DatabaseError as exc:
logger.error("DB save failed for %s", article_data["url"], exc_info=exc)


@shared_task
Expand Down
12 changes: 8 additions & 4 deletions articles/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def source_values():
"paths": ["world/"],
"javascript_required": False,
"regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
"headline_selectors": {"tag": "h1", "attrs": {}},
"summary_selectors": {"tag": "h2", "attrs": {}},
"headline_search_params_find": "h1",
"headline_search_params_remove": [],
"summary_search_params_find": "",
"summary_search_params_remove": []
}


Expand All @@ -41,8 +43,10 @@ def source_values_2():
"paths": ["world/"],
"javascript_required": False,
"regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
"headline_selectors": {"tag": "h1", "attrs": {}},
"summary_selectors": {"tag": "h2", "attrs": {}},
"headline_search_params_find": "h1",
"headline_search_params_remove": [],
"summary_search_params_find": "",
"summary_search_params_remove": []
}


Expand Down
Empty file.
Loading

0 comments on commit 691da45

Please sign in to comment.