From a9f59ca2b5438800e1785001b75e393649ac4a85 Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Fri, 1 Mar 2024 21:25:27 +0100 Subject: [PATCH] Replace BeautifulSoup with PyQuery --- .github/workflows/django.yml | 3 +- Dockerfile | 3 - articles/migrations/0001_initial.py | 30 ++- ...options_alter_article_headline_and_more.py | 62 +++++ articles/models.py | 66 +++-- articles/tasks.py | 43 ++- articles/tests/conftest.py | 12 +- articles/tests/integration/__init__.py | 0 articles/tests/integration/test_tasks.py | 57 ++++ articles/tests/unit/__init__.py | 0 articles/tests/{ => unit}/test_models.py | 12 +- articles/tests/{ => unit}/test_views.py | 0 fixtures/languages.json | 9 - fixtures/pub_types.json | 9 - fixtures/sources.json | 178 ++++--------- fixtures/{sources2.json => sources_test.json} | 249 ++++++------------ nous_aggregator/settings/base.py | 4 +- requirements/base.in | 1 - requirements/base.txt | 4 - requirements/ci.in | 5 +- requirements/ci.txt | 16 +- requirements/dev.in | 6 +- requirements/dev.txt | 38 ++- scraper/__init__.py | 2 +- scraper/parser.py | 129 ++++----- scraper/spiders.py | 20 +- scraper/tests/__init__.py | 5 + scraper/tests/conftest.py | 26 +- scraper/tests/integration/test_spider.py | 22 +- scraper/tests/mocks.py | 9 +- scraper/tests/unit/test_parsers.py | 65 +++-- scraper/tests/unit/test_spider.py | 3 +- 32 files changed, 585 insertions(+), 503 deletions(-) create mode 100644 articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py create mode 100644 articles/tests/integration/__init__.py create mode 100644 articles/tests/integration/test_tasks.py create mode 100644 articles/tests/unit/__init__.py rename articles/tests/{ => unit}/test_models.py (72%) rename articles/tests/{ => unit}/test_views.py (100%) delete mode 100644 fixtures/languages.json delete mode 100644 fixtures/pub_types.json rename fixtures/{sources2.json => sources_test.json} (50%) diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml index ba38169..e79660d 100644 --- a/.github/workflows/django.yml +++ b/.github/workflows/django.yml @@ -57,7 +57,8 @@ jobs: SECRET_KEY: dummy DJANGO_ENV: BASE SECURE_SSL_REDIRECT: False - run: pytest articles/tests/ + run: pytest + # # Migrations # diff --git a/Dockerfile b/Dockerfile index bdb3b87..7f4ee03 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,9 +15,6 @@ RUN pip install pip -U COPY /requirements/* /app/requirements/ RUN pip install -r /app/requirements/dev.txt -# pyppeteer deps (https://stackoverflow.com/a/71935536) -# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt - # # Final diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py index 3a6252d..a589334 100644 --- a/articles/migrations/0001_initial.py +++ b/articles/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.1 on 2024-02-26 20:25 +# Generated by Django 5.0.1 on 2024-03-01 20:14 import django.db.models.deletion import django.db.models.functions.text @@ -63,7 +63,7 @@ class Migration(migrations.Migration): ( "paths", models.JSONField( - help_text="A list of resource paths where the scraper will look for articles" + help_text="List of resource paths where the scraper will look for articles" ), ), ( @@ -82,22 +82,34 @@ class Migration(migrations.Migration): ), ), ( - "headline_selectors", + "headline_search_params_find", models.JSONField( - help_text="Information about the structure of the target page needed to extract the headline of articles published by this 
source" + help_text="Selectors for extracting the headline of articles" ), ), ( - "summary_selectors", + "headline_search_params_remove", models.JSONField( - blank=True, - help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", - null=True, + help_text="Selectors for HTML elements that need to be removed from the headline" + ), + ), + ( + "summary_search_params_find", + models.JSONField( + default=[], + help_text="Selectors for extracting the summary of articles", + ), + ), + ( + "summary_search_params_remove", + models.JSONField( + default=[], + help_text="Selectors for HTML elements that need to be removed from the summary", ), ), ], options={ - "ordering": [django.db.models.functions.text.Lower("title")], + "ordering": [django.db.models.functions.text.Lower("name")], }, ), migrations.CreateModel( diff --git a/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py b/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py new file mode 100644 index 0000000..78824cc --- /dev/null +++ b/articles/migrations/0002_alter_source_options_alter_article_headline_and_more.py @@ -0,0 +1,62 @@ +# Generated by Django 5.0.1 on 2024-03-03 10:29 + +import django.db.models.functions.text +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("articles", "0001_initial"), + ] + + operations = [ + migrations.AlterModelOptions( + name="source", + options={"ordering": [django.db.models.functions.text.Lower("title")]}, + ), + migrations.AlterField( + model_name="article", + name="headline", + field=models.CharField( + help_text="The headline of the article", max_length=512 + ), + ), + migrations.AlterField( + model_name="article", + name="slug", + field=models.SlugField( + help_text="The slug of the article for SEO-friendly urls", + max_length=1024, + ), + ), + migrations.AlterField( + model_name="article", + name="url", + field=models.URLField( + help_text="The link to the article", max_length=512, unique=True + ), + ), + migrations.AlterField( + model_name="source", + name="summary_search_params_find", + field=models.JSONField( + default=str, + help_text="Selectors for extracting the summary of articles", + ), + ), + migrations.AlterField( + model_name="source", + name="summary_search_params_remove", + field=models.JSONField( + default=list, + help_text="Selectors for HTML elements that need to be removed from the summary", + ), + ), + migrations.AlterField( + model_name="source", + name="url", + field=models.URLField( + help_text="The url of the source", max_length=512, unique=True + ), + ), + ] diff --git a/articles/models.py b/articles/models.py index 9c4c00a..3fb56da 100644 --- a/articles/models.py +++ b/articles/models.py @@ -1,5 +1,4 @@ import regex -from django.core.validators import URLValidator from django.db import models from django.db.models.functions import Lower from django.utils.translation import gettext_lazy as _ @@ -14,14 +13,13 @@ class Article(models.Model): Fields: headline (models.TextField): headline of the article slug (models.SlugField): slug of the article (generated from headline) + summary (models.TextField): short paragraph summarizing the article created_at (models.DateTimeField): date the article was added to the database. Mostly corresponds to actual publication date, though this can vary. Actual dates are not used because their format varies a lot, hence they are difficult to parse. 
language (models.CharField): the language of the article url (models.URLField): link to the article - body (models.TextField): either the actual body of the article, - or a short desriptive paragraph Relations: source (ForeignKey): the source of the article @@ -31,13 +29,17 @@ class Article(models.Model): """ headline = models.CharField( - max_length=200, + max_length=512, help_text=_("The headline of the article"), ) slug = models.SlugField( - max_length=255, + max_length=1024, help_text=_("The slug of the article for SEO-friendly urls"), ) + summary = models.TextField( + blank=True, + help_text=_("A summary of the article"), + ) created_at = models.DateTimeField() language = models.CharField( max_length=4, @@ -46,14 +48,10 @@ class Article(models.Model): help_text=_("The language of the article"), ) url = models.URLField( - max_length=255, + max_length=512, unique=True, help_text=_("The link to the article"), ) - summary = models.TextField( - blank=True, - help_text=_("A summary of the article"), - ) source = models.ForeignKey( to="Source", on_delete=models.CASCADE, @@ -65,7 +63,7 @@ class Meta: ordering = ("-created_at",) indexes = [models.Index(fields=["headline", "url"])] - def __str__(self): + def __str__(self) -> str: return f"{self.source}: {self.headline}" @@ -74,7 +72,7 @@ class Source(models.Model): Metadata about the source of articles Fields: - title (models.CharField): name of the source + title (models.CharField): name/title of the source slug (models.SlugField): slug of the source publication_type (models.CharField): the type of publication of the source (newspaper, journal, blog...) @@ -122,15 +120,15 @@ class Source(models.Model): ) url = models.URLField( unique=True, - max_length=255, + max_length=512, help_text=_("The url of the source"), ) # - # info related to scraping + # data related to scraping # paths = models.JSONField( help_text=_( - "A list of resource paths where the scraper will look for articles" + "List of resource paths where the scraper will look for articles" ), ) regex = models.CharField( @@ -147,18 +145,26 @@ class Source(models.Model): "of JavaScript" ), ) - headline_selectors = models.JSONField( + headline_search_params_find = models.JSONField( help_text=_( - "Information about the structure of the target page needed to extract " - "the headline of articles published by this source" + "Selectors for extracting the headline of articles" ), ) - summary_selectors = models.JSONField( - null=True, - blank=True, + headline_search_params_remove = models.JSONField( + help_text=_( + "Selectors for HTML elements that need to be removed from the headline" + ), + ) + summary_search_params_find = models.JSONField( + default=str, + help_text=_( + "Selectors for extracting the summary of articles" + ), + ) + summary_search_params_remove = models.JSONField( + default=list, help_text=_( - "Information about the structure of the target page needed to extract " - "the summary of articles published by this source" + "Selectors for HTML elements that need to be removed from the summary" ), ) @@ -167,7 +173,7 @@ class Meta: Lower("title"), ] - def __str__(self): + def __str__(self) -> str: return f"{self.title}" def to_dict(self): @@ -177,7 +183,15 @@ def to_dict(self): "language": self.language, "javascript_required": self.javascript_required, "filter": regex.compile(self.regex), - "headline_selectors": self.headline_selectors, - "summary_selectors": self.summary_selectors, + "search_params": { + "headline": { + "find": self.headline_search_params_find, + "remove": 
self.headline_search_params_remove, + }, + "summary": { + "find": self.summary_search_params_find, + "remove": self.summary_search_params_remove, + }, + }, } return sitemap diff --git a/articles/tasks.py b/articles/tasks.py index 49bcbd2..f37f874 100644 --- a/articles/tasks.py +++ b/articles/tasks.py @@ -1,6 +1,8 @@ import json +import logging from celery import group, shared_task +from django.db.utils import DatabaseError from django.utils import timezone import scraper @@ -8,6 +10,8 @@ from .models import Article, Source +logger = logging.getLogger("__name__") + @shared_task def get_articles_for_source(source_title: str) -> None: @@ -21,17 +25,34 @@ def get_articles_for_source(source_title: str) -> None: spider.run() articles = [json.loads(article) for article in spider.articles] - Article.objects.bulk_create([ - Article( - headline=article_data["headline"], - slug=article_data["slug"], - source=Source.objects.get(url=article_data["source_link"]), - summary=article_data["summary"], - language=article_data["language"], - url=article_data["url"], - created_at=timezone.now(), - ) for article_data in articles - ], ignore_conflicts=True) + # try bulk create, revert to individual db saves in case of error + try: + Article.objects.bulk_create([ + Article( + headline=article_data["headline"], + slug=article_data["slug"], + source=Source.objects.get(url=article_data["source_link"]), + summary=article_data["summary"], + language=article_data["language"], + url=article_data["url"], + created_at=timezone.now(), + ) for article_data in articles + ], ignore_conflicts=True) + except DatabaseError as exc: + logger.error("Bulk create failed", exc_info=exc) + for article_data in articles: + try: + Article.objects.create( + headline=article_data["headline"], + slug=article_data["slug"], + source=Source.objects.get(url=article_data["source_link"]), + summary=article_data["summary"], + language=article_data["language"], + url=article_data["url"], + created_at=timezone.now(), + ) + except DatabaseError as exc: + logger.error("DB save failed for %s", article_data["url"], exc_info=exc) @shared_task diff --git a/articles/tests/conftest.py b/articles/tests/conftest.py index c80344c..f03b77b 100644 --- a/articles/tests/conftest.py +++ b/articles/tests/conftest.py @@ -20,8 +20,10 @@ def source_values(): "paths": ["world/"], "javascript_required": False, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_selectors": {"tag": "h1", "attrs": {}}, - "summary_selectors": {"tag": "h2", "attrs": {}}, + "headline_search_params_find": "h1", + "headline_search_params_remove": [], + "summary_search_params_find": "", + "summary_search_params_remove": [] } @@ -41,8 +43,10 @@ def source_values_2(): "paths": ["world/"], "javascript_required": False, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}", - "headline_selectors": {"tag": "h1", "attrs": {}}, - "summary_selectors": {"tag": "h2", "attrs": {}}, + "headline_search_params_find": "h1", + "headline_search_params_remove": [], + "summary_search_params_find": "", + "summary_search_params_remove": [] } diff --git a/articles/tests/integration/__init__.py b/articles/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/articles/tests/integration/test_tasks.py b/articles/tests/integration/test_tasks.py new file mode 100644 index 0000000..8ab1c1a --- /dev/null +++ b/articles/tests/integration/test_tasks.py @@ -0,0 +1,57 @@ +import pytest +from django.conf import settings +from django.core.management import call_command + +from articles.models import Article +from 
articles.tasks import get_articles_for_source +from scraper.tests import MockResponse, contents_aj, expected_aj + + +@pytest.fixture +def django_db_setup(django_db_blocker): + settings.DATABASES["default"] = { + "ENGINE": "django.db.backends.postgresql", + "HOST": "db.example.com", + "NAME": "test_db", + "ATOMIC_REQUESTS": True, + } + + with django_db_blocker.unblock(): + call_command("loaddata", "fixtures/sources_test.json") + + +@pytest.mark.usefixtures('celery_session_app') +@pytest.mark.usefixtures('celery_session_worker') +@pytest.mark.django_db +def test_get_articles_for_source(django_db_setup, contents_aj, expected_aj, mocker): + # + # setup + # + SOURCE_TITLE = "Al Jazeera" + + def return_value(*args, **kwargs): + for key, value in contents_aj.items(): + if args[0] == value["link"]: + return MockResponse(text=value["content"]) + + mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) + + # + # asserts + # + promise = get_articles_for_source.delay(SOURCE_TITLE) + promise.get() + + articles = Article.objects.all() + + assert len(articles) == 12 + + for expected_data in expected_aj.values(): + article = next( + (article for article in articles if article.headline == expected_data["headline"]) + ) + assert article.slug == expected_data["slug"] + assert article.summary == expected_data["summary"] + assert article.language == expected_data["language"] + assert article.url == expected_data["url"] + assert article.source.title == SOURCE_TITLE diff --git a/articles/tests/unit/__init__.py b/articles/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/articles/tests/test_models.py b/articles/tests/unit/test_models.py similarity index 72% rename from articles/tests/test_models.py rename to articles/tests/unit/test_models.py index 40563cf..3566282 100644 --- a/articles/tests/test_models.py +++ b/articles/tests/unit/test_models.py @@ -1,10 +1,10 @@ import regex # type: ignore -from ..models import Article, Source +from articles.models import Article, Source # -# Test Source +# Source # def test_create_source(source_values) -> None: source = Source(**source_values) @@ -18,8 +18,6 @@ def test_source_to_dict(source_values) -> None: sitemap = source.to_dict() for attr_name in [ - "summary_selectors", - "headline_selectors", "javascript_required", "language", "paths", @@ -28,6 +26,10 @@ def test_source_to_dict(source_values) -> None: assert source.url == sitemap["base_url"] assert regex.compile(source.regex) == sitemap["filter"] + assert sitemap["search_params"]["headline"]["find"] == source.headline_search_params_find + assert sitemap["search_params"]["headline"]["remove"] == source.headline_search_params_remove + assert sitemap["search_params"]["summary"]["find"] == source.summary_search_params_find + assert sitemap["search_params"]["summary"]["remove"] == source.summary_search_params_remove def test_source_str_representation(source_values) -> None: @@ -37,7 +39,7 @@ def test_source_str_representation(source_values) -> None: # -# Test Article +# Article # def test_create_article(article_values_m) -> None: article = Article(**article_values_m) diff --git a/articles/tests/test_views.py b/articles/tests/unit/test_views.py similarity index 100% rename from articles/tests/test_views.py rename to articles/tests/unit/test_views.py diff --git a/fixtures/languages.json b/fixtures/languages.json deleted file mode 100644 index 14d1ce8..0000000 --- a/fixtures/languages.json +++ /dev/null @@ -1,9 +0,0 @@ -[ -{ - "model": "articles.language", - "pk": 1, - "fields": { - 
"name": "en" - } -} -] diff --git a/fixtures/pub_types.json b/fixtures/pub_types.json deleted file mode 100644 index 57b8cf5..0000000 --- a/fixtures/pub_types.json +++ /dev/null @@ -1,9 +0,0 @@ -[ -{ - "model": "articles.publicationtype", - "pk": 1, - "fields": { - "name": "newspaper/journal" - } -} -] diff --git a/fixtures/sources.json b/fixtures/sources.json index f7f22f9..004c096 100644 --- a/fixtures/sources.json +++ b/fixtures/sources.json @@ -12,17 +12,11 @@ "news/" ], "javascript_required": false, - "regex": "/[0-9]{4}/[0-9]+/[0-9]+/(?!.*terms-and-conditions/|.*community-rules-guidelines/|.*eu-eea-regulatory|.*code-of-ethics)", - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": { - "class": "article__subhead" - } - } + "regex": "(? .story-two"], + "summary_search_params_remove": ["aside", "div"] } }, { @@ -92,14 +72,10 @@ ], "javascript_required": false, "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": {} - } + "headline_search_params_find": ["h1"], + "headline_search_params_remove": [], + "summary_search_params_find": [".entry-content"], + "summary_search_params_remove": ["div"] } }, { @@ -118,18 +94,10 @@ ], "javascript_required": false, "regex": "[0-9]{4}/[0-9]{2}/", - "headline_selectors": { - "tag": "h1", - "attrs": { - "class": "title" - } - }, - "summary_selectors": { - "tag": "div", - "attrs": { - "class": "tagline" - } - } + "headline_search_params_find": [".title > span"], + "headline_search_params_remove": [], + "summary_search_params_find": [".details .tagline"], + "summary_search_params_remove": [] } }, { @@ -146,18 +114,10 @@ ], "javascript_required": false, "regex": "(? 
.story-two"], + "summary_search_params_remove": ["aside", "div"] } }, { "model": "articles.source", "pk": 4, "fields": { - "name": "Consortium News", - "slug": "", + "title": "Consortium News", + "url": "https://consortiumnews.com/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://consortiumnews.com/", "paths": [ "" ], - "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", "javascript_required": false, - "headline_selectors": { - "tag": "h1", - "attrs": {} - }, - "summary_selectors": { - "tag": "p", - "attrs": {} - } + "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}/(?!.*contact-us-form/|.*parry-awarded|.*robert-parrys-legacy)[a-z]+(?!.*policy)(?!.*live)(?!.*fund-drive)", + "headline_search_params_find": ["h1"], + "headline_search_params_remove": [], + "summary_search_params_find": [".entry-content"], + "summary_search_params_remove": ["div"] } }, { "model": "articles.source", "pk": 5, "fields": { - "name": "Current Affairs", - "slug": "", + "title": "Current Affairs", + "url": "https://www.currentaffairs.org/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://www.currentaffairs.org/", "paths": [ "category/politics/", "category/economics/", "category/interviews/", "category/history/" ], - "regex": "[0-9]{4}/[0-9]{2}/", "javascript_required": false, - "headline_selectors": { - "tag": "h1", - "attrs": { - "class": "title" - } - }, - "summary_selectors": { - "tag": "div", - "attrs": { - "class": "tagline" - } - } + "regex": "[0-9]{4}/[0-9]{2}/", + "headline_search_params_find": [".title > span"], + "headline_search_params_remove": [], + "summary_search_params_find": [".details .tagline"], + "summary_search_params_remove": [] } }, { "model": "articles.source", "pk": 6, "fields": { - "name": "New York Times", - "slug": "", + "title": "New York Times", + "url": "https://www.nytimes.com/", "publication_type": "newspaper/journal", "language": "en", - "link": "https://www.nytimes.com/", "paths": [ "section/world/", "section/business/" ], - "regex": "(? 
Generator[str, None, No yield urllib.parse.urljoin(sitemap["base_url"], link) -def find_headline(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]: - """Use `sitemap` to extract headline from article""" +def find_headline(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: + """ Use `doc` + `sitemap` to extract headline from article at `url` """ + + search_params = sitemap["search_params"]["headline"] + + if not search_params.get("find", ""): + logger.warning("No search params for headline of %s", url) + return None + + for param in search_params["find"]: + if headline_doc := doc.find(param): + break + + if not headline_doc: + return None + + for item in search_params.get("remove", []): + headline_doc(f"{item}").remove() try: - headline = soup.find( - sitemap["headline_selectors"]["tag"], - attrs=sitemap["headline_selectors"]["attrs"], - ) - except KeyError as e: - logger.error("KeyError (%s) for headline of %s", e, url) - raise KeyError from e # Abort job after logging error - if headline is None: + headline_text = headline_doc.text().strip() + except AttributeError: return None - headline_text = headline.get_text().strip() + return headline_text -def find_summary(soup: BeautifulSoup, sitemap: dict, url: str) -> Optional[str]: - """Use `parser` & `sitemap` to extract summary from article""" +def find_summary(doc: PyQuery, sitemap: dict, url: str) -> Optional[str]: + """ Use `doc` + `sitemap` to extract summary from article at `url` """ - if not sitemap["summary_selectors"]: + search_params = sitemap["search_params"]["summary"] + + if not search_params.get("find", ""): + logger.warning("No search params for summary of %s", url) return None + for param in search_params["find"]: + if summary_doc := doc.find(param): + break + + if not summary_doc: + return None + + for item in search_params.get("remove", []): + summary_doc(f"{item}").remove() + try: - summary = soup.find( - sitemap["summary_selectors"]["tag"], - attrs=sitemap["summary_selectors"]["attrs"], - ) - except KeyError as e: - logger.error("KeyError (%s) for summary of %s", e, url) - summary = None # Continue job, set `summary` to avoid UnboundLocalError - if summary is None: - logger.warning("Missing summary for %s", url) + summary_text = summary_doc.text().strip() + except AttributeError: return None - summary_text = summary.get_text().strip() - return summary_text + return summary_text[:1000] -def find_language(soup: BeautifulSoup, url: str) -> Optional[str]: - """Detect the language of the page at `url`.""" - if (body := soup.body) is None or (text := body.get_text()) is None: - return None +def find_language(summary: str, headline: str, doc: PyQuery, url: str) -> Optional[str]: + """ Detect the language of the page at `url` """ - language = langdetect.detect(text) - if language is None: - logger.warning("Missing language for %s", url) - return language + for item in (summary, headline, doc.text()): + if item: + try: + language = langdetect.detect(item) + except LangDetectException as exc: + logger.warning( + "Failed to detect language for %s", url, exc_info=exc + ) + else: + return language + return None def parse(html: str, sitemap: dict, url: str) -> Optional[str]: - """ - Parse the `html` at `url` with lxml, use html.parser as a fallback, - return data as JSON. If html.parser fails to return a headline, return `None` - (every article must have a headline); if the language does not match the one - specified in `sitemap`, return `None`. 
- """ - - # try first parser - parser = "lxml" - soup = BeautifulSoup(html, parser) - headline = find_headline(soup, sitemap, url) - # try second parser - if headline is None: - parser = "html.parser" - soup = BeautifulSoup(html, parser) - headline = find_headline(soup, sitemap, url) + doc = PyQuery(html) + + headline = find_headline(doc, sitemap=sitemap, url=url) if headline is None: logger.warning("No headline for %s", url) return None - language = find_language(soup, url) - if not language == sitemap["language"]: - logger.warning( - "Language of article (%s) + source (%s) do not match: %s", - language, - sitemap["language"], - url, - ) - return None - summary = find_summary(soup, sitemap, url) + + summary = find_summary(doc, sitemap=sitemap, url=url) + if summary is None: + logger.warning("Missing summary for %s", url) + summary = "No description" + + language = find_language(summary, headline, doc, url) + if language is None: + language = sitemap["language"] + article = { "headline": headline, "slug": slugify(headline), - "summary": summary if summary else "No description", + "summary": summary, "language": language, "url": url, "source_link": sitemap["base_url"], diff --git a/scraper/spiders.py b/scraper/spiders.py index dd36ed2..ccb766f 100644 --- a/scraper/spiders.py +++ b/scraper/spiders.py @@ -2,7 +2,7 @@ import logging import random -import aiohttp # pyre-ignore +import aiohttp from aiohttp import ClientSession from aiohttp.web_exceptions import HTTPError @@ -33,7 +33,7 @@ def __init__(self, starting_urls: list[str], sitemap: dict) -> None: self.links: set[str] = set() self.articles: set[str] = set() - async def connect(self, session: ClientSession, url: str) -> str | None: # pyre-ignore + async def connect(self, session: ClientSession, url: str) -> str | None: headers = random.choice(self.headers) try: @@ -44,7 +44,7 @@ async def connect(self, session: ClientSession, url: str) -> str | None: # pyre return None return html - async def get_links(self, session: ClientSession, url: str) -> list[str] | None: + async def get_links(self, session: ClientSession, url: str) -> None: html = await self.connect(session=session, url=url) if not html: return None @@ -52,7 +52,9 @@ async def get_links(self, session: ClientSession, url: str) -> list[str] | None: for link in parser.generate_filtered_links(html=html, sitemap=self.sitemap): self.links.add(link) - async def scrape(self, session: ClientSession, link: str) -> str | None: + return None + + async def scrape(self, session: ClientSession, link: str) -> None: html = await self.connect(session=session, url=link) if not html: return None @@ -63,6 +65,8 @@ async def scrape(self, session: ClientSession, link: str) -> str | None: self.articles.add(article) + return None + async def collect_links(self, session: ClientSession, starting_urls: list[str]) -> None: coros = (self.get_links(session, url) for url in starting_urls) await asyncio.gather(*coros) @@ -78,6 +82,10 @@ async def main(self) -> None: await self.collect_links(session, self.starting_urls) await self.collect_metadata(session, self.links) - def run(self): - loop = asyncio.get_event_loop() + def run(self) -> None: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) loop.run_until_complete(self.main()) diff --git a/scraper/tests/__init__.py b/scraper/tests/__init__.py index e69de29..d20bc93 100644 --- a/scraper/tests/__init__.py +++ b/scraper/tests/__init__.py @@ -0,0 +1,5 @@ +from .conftest import 
expected_aj, sitemap_aj +from .integration.test_spider import contents_aj +from .mocks import MockResponse + +__all__ = ["contents_aj", "expected_aj", "MockResponse", "sitemap_aj"] diff --git a/scraper/tests/conftest.py b/scraper/tests/conftest.py index eb13753..39d68bf 100644 --- a/scraper/tests/conftest.py +++ b/scraper/tests/conftest.py @@ -14,8 +14,16 @@ def sitemap_aj(): 'filter': regex.Regex( '(? Dict[str, Dict[str, str]]: "asian_cup": { 'headline': 'Asian Cup final brings FIFA World Cup frenzy back to Qatar’s Souq Waqif', 'slug': 'asian-cup-final-brings-fifa-world-cup-frenzy-back-to-qatars-souq-waqif', - 'summary': "Excitement for the Asian Cup football final is reaching fever pitch " - "as al-Annabi take on an-Nashama at Lusail Stadium.", + "summary": ( + "Excitement for the Asian Cup football final is reaching fever pitch as al-Annabi take on an-Nashama at Lusail Stadium. Save articles to read later and create your own reading list. Doha, Qatar – On Friday nights, Souq Waqif – Qatar’s old-style all-purpose market that also serves as the country’s central tourist attraction – brings together people from all walks of life, dozens of different nationalities and varying interests for a unique mix of colour and noise. But when the country plays host to a football tournament – be it the world’s biggest sporting event such as the FIFA World Cup or a regional championship – the excitement reaches a fever pitch. On the eve of the final of the ongoing AFC Asian Cup 2023, the famous marketplace in the heart of Doha was the marching ground of football fans of both teams vying for the continental crown in Saturday’s final at Lusail Stadium. Passionate supporters of an-Nashama – the gentlemen, as Jordan’s football team is lovingly known – gathered in" + ), 'language': 'en', - "url": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" - "jordan-ahead-of-historic-asian-cup-final", + "url": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", 'source_link': 'https://www.aljazeera.com/' }, "indonesia": { "headline": "Big election rallies in Indonesia on final day of campaign", "slug": "big-election-rallies-in-indonesia-on-final-day-of-campaign", "summary": ( - "Hundreds of thousands of supporters of " - "presidential contenders pack rallies in capital Jakarta and other cities." + "Tens of thousands of supporters of Indonesia’s presidential candidates have poured onto its streets as they hold final campaigns before heading to the polls in the world’s biggest single-day election. The contenders to lead the world’s third-largest democracy are popular former governors, Ganjar Pranowo and Anies Baswedan, and ex-special forces commander Prabowo Subianto, who has soared in opinion polls with the tacit backing of the president, and with the incumbent’s son as his running mate. The elections on Wednesday will elect a new president and vice president, in addition to parliamentary and local representatives. High-schooler Alfiatnan, 18, said she would vote for Subianto because this was his third attempt at the presidency. “I think there’s no harm [in] giving opportunity to someone who is trying. His optimistic spirit influenced me to choose him.” Also in the running is Baswedan, the former governor of Jakarta who is running as an independent candidate. 
The 54-year-old was e" ), "language": "en", "url": "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-" @@ -53,8 +60,7 @@ def expected_aj() -> Dict[str, Dict[str, str]]: "headline": "How Taiwan’s elections challenge the power of China’s Communist Party", "slug": "how-taiwans-elections-challenge-the-power-of-chinas-communist-party", "summary": ( - "Elections in Taiwan highlight dissatisfaction in China with a political " - "system that Beijing says works best for Chinese people." + "Elections in Taiwan highlight dissatisfaction in China with a political system that Beijing says works best for Chinese people. Save articles to read later and create your own reading list. If free and fair national elections are considered the hallmark of a democratic state, Taiwan has much to boast about. In January, the self-ruled island held its eighth presidential election concurrently with a parliamentary vote. Just 160km (100 miles) away on the other side of the narrow Taiwan Strait, the Communist Party of China (CPC) has ruled China since 1949, and though the party often claims that it governs a democratic state, there is no electoral process comparable with Taiwan’s. China’s President Xi Jinping has referred to “whole-process people’s democracy” to describe the Chinese political system where the “people are the masters” but the party-state apparatus runs the people’s affairs on their behalf. Ken Cai*, a 35-year-old entrepreneur from Shanghai, does not subscribe to Xi’s definit" ), "language": "en", "url": "https://www.aljazeera.com/news/2024/2/10/how-taiwans-elections-" diff --git a/scraper/tests/integration/test_spider.py b/scraper/tests/integration/test_spider.py index 3e3ca05..1036e06 100644 --- a/scraper/tests/integration/test_spider.py +++ b/scraper/tests/integration/test_spider.py @@ -29,25 +29,27 @@ def source(): paths=["news/"], regex="(? 
Dict[str, Dict[str, Any]]: +def contents_aj(): contents = { "_start": { "link": "https://www.aljazeera.com/news/", "content": read_file(directory=FILES_DIR, file_name="_start.html"), }, "asian_cup": { - "link": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" - "jordan-ahead-of-historic-asian-cup-final", + "link": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", "content": read_file(directory=FILES_DIR, file_name="asian_cup.html"), }, "football": { - "link": "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif", + "link": "https://www.aljazeera.com/news/2024/2/10/football-fever-hits-" + "jordan-ahead-of-historic-asian-cup-final", "content": read_file(directory=FILES_DIR, file_name="football.html"), }, "footprints": { @@ -108,14 +110,14 @@ def contents() -> Dict[str, Dict[str, Any]]: # tests # @pytest.mark.django_db -def test_run_spider(source, contents, expected_aj, mocker) -> None: +def test_run_spider(source, contents_aj, expected_aj, mocker) -> None: # # setup # def return_value(*args, **kwargs): - for k, v in contents.items(): - if args[0] == v["link"]: - return MockResponse(text=v["content"]) + for key, value in contents_aj.items(): + if args[0] == value["link"]: + return MockResponse(text=value["content"]) mocker.patch("aiohttp.ClientSession.get", side_effect=return_value) diff --git a/scraper/tests/mocks.py b/scraper/tests/mocks.py index 4ad6cd6..bc48fbf 100644 --- a/scraper/tests/mocks.py +++ b/scraper/tests/mocks.py @@ -1,6 +1,9 @@ +from typing import Optional + + class MockResponse: - def __init__(self, status_code: int | None = None, text: str = ""): - self.status_code = status_code or 200 + def __init__(self, status_code: Optional[int] = 200, text: str = ""): + self.status_code = status_code self._text = text async def text(self): @@ -9,5 +12,5 @@ async def text(self): async def __aenter__(self): return self - async def __aexit__(self, exc_type, exc, tb): + async def __aexit__(self, exc_type, exc_value, exc_traceback): pass diff --git a/scraper/tests/unit/test_parsers.py b/scraper/tests/unit/test_parsers.py index da7e008..a9805df 100644 --- a/scraper/tests/unit/test_parsers.py +++ b/scraper/tests/unit/test_parsers.py @@ -1,7 +1,8 @@ import json from pathlib import Path -from bs4 import BeautifulSoup +import pytest +from pyquery import PyQuery from scraper.parser import find_headline, find_language, find_summary, parse @@ -9,41 +10,61 @@ FILES_DIR: Path = Path(__file__).parent.parent.resolve() / "files" / "articles" / "aj" -PAGE = "indonesia" -URL = "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-indonesia-on-final-day-of-campaign" +URL_ASIAN_CUP = "https://www.aljazeera.com/sports/2024/2/10/football-fans-souq-waqif" +URL_INDONESIA = "https://www.aljazeera.com/news/2024/2/10/big-election-rallies-in-indonesia-on-final-day-of-campaign" +URL_TAIWAN = "https://www.aljazeera.com/news/2024/2/10/how-taiwans-elections-challenge-the-power-of-chinas-communist-party" -def test_find_headline(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - headline_text = find_headline(soup, sitemap_aj, url="dummy") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_headline(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert headline_text == expected_aj[f"{PAGE}"]["headline"] + headline_text = find_headline(doc, sitemap_aj, 
url="dummy") + assert headline_text == expected_aj[f"{page}"]["headline"] -def test_find_summary(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - summary = find_summary(soup, sitemap_aj, url="https://www.example.com") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_summary(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert summary == expected_aj[f"{PAGE}"]["summary"] + summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + assert summary + assert summary == expected_aj[f"{page}"]["summary"] -def test_find_language(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") - soup = BeautifulSoup(html, "lxml") - lang = find_language(soup, url="dummy") +@pytest.mark.parametrize("page", ["asian_cup", "indonesia", "taiwan"]) +def test_find_language(sitemap_aj, expected_aj, page) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + doc = PyQuery(html) - assert lang == expected_aj[f"{PAGE}"]["language"] + headline = find_headline(doc, sitemap_aj, url="dummy") + summary = find_summary(doc, sitemap_aj, url="https://www.example.com") + assert headline + assert summary -def test_parse(sitemap_aj, expected_aj) -> None: - html = read_file(directory=FILES_DIR, file_name=f"{PAGE}.html") + lang = find_language(summary, headline, doc, url="dummy") + + assert lang == expected_aj[f"{page}"]["language"] + + +@pytest.mark.parametrize( + "page, url", + [("asian_cup", URL_ASIAN_CUP), ("indonesia", URL_INDONESIA), ("taiwan", URL_TAIWAN)] +) +def test_parse(sitemap_aj, expected_aj, page, url) -> None: + html = read_file(directory=FILES_DIR, file_name=f"{page}.html") + + json_data = parse(html, sitemap_aj, url=url) + + assert json_data - json_data = parse(html, sitemap_aj, url=URL) data = json.loads(json_data) - assert data == expected_aj[f"{PAGE}"] + assert data + assert data == expected_aj[f"{page}"] diff --git a/scraper/tests/unit/test_spider.py b/scraper/tests/unit/test_spider.py index d407c91..d56df05 100644 --- a/scraper/tests/unit/test_spider.py +++ b/scraper/tests/unit/test_spider.py @@ -71,7 +71,8 @@ def return_value(*args, **kwargs): # # asserts # - await spider.collect_metadata(aiohttp.ClientSession(), spider.links) + async with aiohttp.ClientSession() as session: + await spider.collect_metadata(session, spider.links) articles = [json.loads(article) for article in spider.articles]