From 7b79d14b25492b2b50ed92ba554e9874865f8ac4 Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Sat, 13 Jan 2024 15:26:59 +0100 Subject: [PATCH] Clean up models and search view, update tests and fixtures --- Dockerfile.dev | 2 +- articles/management/commands/scrape.py | 14 +----- articles/migrations/0001_initial.py | 48 ++++++------------- ...y_remove_source_body_selectors_and_more.py | 40 ---------------- articles/models.py | 35 +++++--------- articles/scraper/parser.py | 1 - articles/scraper/spider.py | 4 +- .../templates/articles/search_results.html | 4 +- articles/tests/conftest.py | 4 +- articles/tests/test_models.py | 2 +- articles/views.py | 32 ++++++------- docker-compose.yml | 2 +- fixtures/sources.json | 24 +++++----- nous_aggregator/settings/local.py | 2 - pyproject.toml | 6 +++ 15 files changed, 71 insertions(+), 149 deletions(-) delete mode 100644 articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py create mode 100644 pyproject.toml diff --git a/Dockerfile.dev b/Dockerfile.dev index 527d8f7..c4a0220 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -19,7 +19,7 @@ RUN apt-get update \ COPY . /usr/src/app WORKDIR /usr/src/app -RUN python manage.py collectstatic --no-input +RUN python manage.py collectstatic --link --no-input # patch RUN ./patches/pyppeteer_patch.sh diff --git a/articles/management/commands/scrape.py b/articles/management/commands/scrape.py index b6b84ab..a329856 100644 --- a/articles/management/commands/scrape.py +++ b/articles/management/commands/scrape.py @@ -1,9 +1,3 @@ -""" -Scheduler for scraping - -Jobs are stored in a Django job store. Old jobs should be deleted when -the sources are changed; otherwise the scheduler will use the old information. -""" import json import logging from datetime import datetime @@ -23,11 +17,10 @@ logger = logging.getLogger(__name__) -INTERVAL = 480 # interval in minutes for scraping +SCRAPING_INTERVAL = 1 # minutes def scrape(sitemap: dict): - """Scrape newspapers/journals and store articles in database.""" Spider.crawl(sitemap) data = [json.loads(article) for article in Spider.articles] @@ -57,8 +50,6 @@ def delete_old_job_executions(max_age=604_800): class Command(BaseCommand): - """Create jobs.""" - def handle(self, *args, **options): scheduler = BlockingScheduler( timezone=settings.TIME_ZONE, @@ -66,7 +57,6 @@ def handle(self, *args, **options): ) scheduler.add_jobstore(DjangoJobStore(), "default") - # jobs for scraping newspapers/journals sources = Source.objects.all() for index, source in enumerate(sources): source_id = f"Scraping {index + 1}: {source.name}" @@ -77,7 +67,7 @@ def handle(self, *args, **options): scrape, args=[source.to_dict()], trigger="interval", - minutes=INTERVAL, + minutes=SCRAPING_INTERVAL, misfire_grace_time=600, id=source_id, max_instances=1, diff --git a/articles/migrations/0001_initial.py b/articles/migrations/0001_initial.py index b3bcf09..5abdce9 100644 --- a/articles/migrations/0001_initial.py +++ b/articles/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.5 on 2023-09-19 18:32 +# Generated by Django 4.2.5 on 2024-01-13 14:26 import django.core.validators import django.db.models.deletion @@ -27,10 +27,7 @@ class Migration(migrations.Migration): ( "name", models.CharField( - help_text="The name of the source", - max_length=128, - unique=True, - verbose_name="name", + help_text="The name of the source", max_length=128, unique=True ), ), ( @@ -38,7 +35,7 @@ class Migration(migrations.Migration): models.SlugField( blank=True, 
help_text="The slug of the source for SEO-friendly urls", - verbose_name="Slug", + max_length=255, ), ), ( @@ -47,7 +44,6 @@ class Migration(migrations.Migration): choices=[("newspaper/journal", "Newspaper or journal")], help_text="The type of publication of the source", max_length=24, - verbose_name="publication type", ), ), ( @@ -57,23 +53,21 @@ class Migration(migrations.Migration): choices=[("en", "English")], help_text="The language of the article", max_length=4, - verbose_name="language", ), ), ( "link", models.URLField( help_text="The link to the source", + max_length=255, unique=True, validators=[django.core.validators.URLValidator], - verbose_name="link", ), ), ( "paths", models.JSONField( - help_text="A list of resource paths where the scraper will look for articles", - verbose_name="paths", + help_text="A list of resource paths where the scraper will look for articles" ), ), ( @@ -82,31 +76,27 @@ class Migration(migrations.Migration): blank=True, help_text="Regular expression for filtering hyper-links found at the resource paths", max_length=255, - verbose_name="regex", ), ), ( - "javascript", + "javascript_required", models.BooleanField( default=False, help_text="Whether the parsing of articles by this source requires rendering of JavaScript", - verbose_name="render javascript", ), ), ( "headline_selectors", models.JSONField( - help_text="Information about the structure of the target page needed to extract the headline of articles published by this source", - verbose_name="headline selectors", + help_text="Information about the structure of the target page needed to extract the headline of articles published by this source" ), ), ( - "body_selectors", + "summary_selectors", models.JSONField( blank=True, - help_text="Information about the structure of the target page needed to extract the body of articles published by this source", + help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", null=True, - verbose_name="body selectors", ), ), ], @@ -128,15 +118,15 @@ class Migration(migrations.Migration): ), ( "headline", - models.TextField( - help_text="The headline of the article", verbose_name="headline" + models.CharField( + help_text="The headline of the article", max_length=200 ), ), ( "slug", models.SlugField( help_text="The slug of the article for SEO-friendly urls", - verbose_name="Slug", + max_length=255, ), ), ("created_at", models.DateTimeField()), @@ -146,25 +136,17 @@ class Migration(migrations.Migration): choices=[("en", "English")], help_text="The language of the article", max_length=4, - verbose_name="language", ), ), ( "link", models.URLField( - help_text="The link to the article", - unique=True, - verbose_name="link", + help_text="The link to the article", max_length=255, unique=True ), ), ( - "body", - models.TextField( - blank=True, - help_text="The body of the article", - null=True, - verbose_name="body", - ), + "summary", + models.TextField(blank=True, help_text="A summary of the article"), ), ( "source", diff --git a/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py b/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py deleted file mode 100644 index 88dd0e5..0000000 --- a/articles/migrations/0002_remove_article_body_remove_source_body_selectors_and_more.py +++ /dev/null @@ -1,40 +0,0 @@ -# Generated by Django 4.2.5 on 2023-09-20 18:56 - -from django.db import migrations, models - - -class 
Migration(migrations.Migration): - dependencies = [ - ("articles", "0001_initial"), - ] - - operations = [ - migrations.RemoveField( - model_name="article", - name="body", - ), - migrations.RemoveField( - model_name="source", - name="body_selectors", - ), - migrations.AddField( - model_name="article", - name="summary", - field=models.TextField( - blank=True, - help_text="A summary of the article", - null=True, - verbose_name="summary", - ), - ), - migrations.AddField( - model_name="source", - name="summary_selectors", - field=models.JSONField( - blank=True, - help_text="Information about the structure of the target page needed to extract the summary of articles published by this source", - null=True, - verbose_name="summary selectors", - ), - ), - ] diff --git a/articles/models.py b/articles/models.py index 34723b3..5e539f5 100644 --- a/articles/models.py +++ b/articles/models.py @@ -30,33 +30,32 @@ class Article(models.Model): __str__: string representation for the admin area """ - headline = models.TextField( - _("headline"), + headline = models.CharField( + max_length=200, help_text=_("The headline of the article"), ) slug = models.SlugField( - _("Slug"), + max_length=255, help_text=_("The slug of the article for SEO-friendly urls"), ) created_at = models.DateTimeField() language = models.CharField( - _("language"), max_length=4, choices=Language.choices, blank=False, help_text=_("The language of the article"), ) link = models.URLField( - _("link"), unique=True, help_text=_("The link to the article") + max_length=255, + unique=True, + help_text=_("The link to the article"), ) summary = models.TextField( - _("summary"), - null=True, blank=True, help_text=_("A summary of the article"), ) source = models.ForeignKey( - "Source", + to="Source", on_delete=models.CASCADE, related_name="articles", help_text=_("The source where the article is published"), @@ -85,8 +84,8 @@ class Source(models.Model): the base url to tell the scraper where to look for hyper-links ('https://example.com/path1/') regex (models.CharField): a regular expression for filtering links - javascript (models.BooleanField): True if JavaScript must be rendered before - data can be extracted from the webpage, False otherwise + javascript_required (models.BooleanField): True if JavaScript must be rendered + before data can be extracted from the webpage, False otherwise headline_selectors (models.JSONField): information about the CSS selectors needed to extract the headline of an article summary_selectors (models.JSONField): information about the CSS selectors @@ -99,34 +98,31 @@ class Source(models.Model): """ name = models.CharField( - _("name"), max_length=128, unique=True, blank=False, help_text=_("The name of the source"), ) slug = models.SlugField( - _("Slug"), + max_length=255, blank=True, help_text=_("The slug of the source for SEO-friendly urls"), ) publication_type = models.CharField( - _("publication type"), max_length=24, choices=PublicationType.choices, blank=False, help_text=_("The type of publication of the source"), ) language = models.CharField( - _("language"), max_length=4, choices=Language.choices, blank=True, help_text=_("The language of the article"), ) link = models.URLField( - _("link"), unique=True, + max_length=255, validators=[URLValidator], help_text=_("The link to the source"), ) @@ -134,21 +130,18 @@ class Source(models.Model): # info related to scraping # paths = models.JSONField( - _("paths"), help_text=_( "A list of resource paths where the scraper will look for articles" ), ) regex = 
models.CharField( - _("regex"), max_length=255, blank=True, help_text=( "Regular expression for filtering hyper-links found at the resource paths" ), ) - javascript = models.BooleanField( - _("render javascript"), + javascript_required = models.BooleanField( default=False, help_text=_( "Whether the parsing of articles by this source requires rendering " @@ -156,14 +149,12 @@ class Source(models.Model): ), ) headline_selectors = models.JSONField( - _("headline selectors"), help_text=_( "Information about the structure of the target page needed to extract " "the headline of articles published by this source" ), ) summary_selectors = models.JSONField( - _("summary selectors"), null=True, blank=True, help_text=_( @@ -185,7 +176,7 @@ def to_dict(self): "base_url": self.link, "paths": self.paths, "language": self.language, - "javascript": self.javascript, + "javascript_required": self.javascript_required, "filter": regex.compile(self.regex), "headline_selectors": self.headline_selectors, "summary_selectors": self.summary_selectors, diff --git a/articles/scraper/parser.py b/articles/scraper/parser.py index 9ee2b28..b872d23 100644 --- a/articles/scraper/parser.py +++ b/articles/scraper/parser.py @@ -1,4 +1,3 @@ -"""Provides access to the parsing function(s)""" import json import logging from typing import Optional diff --git a/articles/scraper/spider.py b/articles/scraper/spider.py index f26db6b..f643d8e 100644 --- a/articles/scraper/spider.py +++ b/articles/scraper/spider.py @@ -16,8 +16,6 @@ class Spider: """ - The Spider class is for extracting article metadata. - Class Attributes: headers (list): a collection of HTTP headers articles (set): a collection of JSON strings representing article @@ -67,7 +65,7 @@ async def get_links(self, asession: AsyncHTMLSession, url: str): if not response: return - if self.sitemap["javascript"]: + if self.sitemap["javascript_required"]: try: await response.html.arender(timeout=settings.REQUESTS_TIMEOUT_JS) except pyppeteer.errors.TimeoutError as e: diff --git a/articles/templates/articles/search_results.html b/articles/templates/articles/search_results.html index 9149abf..b22fa0f 100644 --- a/articles/templates/articles/search_results.html +++ b/articles/templates/articles/search_results.html @@ -6,7 +6,9 @@ {% block content %}
-<h2>Search Results for: "{{ query }}"</h2>
+{% if query %}
+<h2>Search Results for: "{{ query }}"</h2>
+{% endif %}
 {% for source in sources %}
diff --git a/articles/tests/conftest.py b/articles/tests/conftest.py
index a92a530..02504c2 100644
--- a/articles/tests/conftest.py
+++ b/articles/tests/conftest.py
@@ -18,7 +18,7 @@ def source_values():
         "publication_type": PublicationType.newspaper,
         "language": Language.en,
         "paths": ["world/"],
-        "javascript": False,
+        "javascript_required": False,
         "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
         "headline_selectors": {"tag": "h1", "attrs": {}},
         "summary_selectors": {"tag": "h2", "attrs": {}},
@@ -39,7 +39,7 @@ def source_values_2():
         "publication_type": PublicationType.newspaper,
         "language": Language.en,
         "paths": ["world/"],
-        "javascript": False,
+        "javascript_required": False,
         "regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
         "headline_selectors": {"tag": "h1", "attrs": {}},
         "summary_selectors": {"tag": "h2", "attrs": {}},
diff --git a/articles/tests/test_models.py b/articles/tests/test_models.py
index 68e4dae..fc9417a 100644
--- a/articles/tests/test_models.py
+++ b/articles/tests/test_models.py
@@ -20,7 +20,7 @@ def test_source_to_dict(source_values):
     for attr_name in [
         "summary_selectors",
         "headline_selectors",
-        "javascript",
+        "javascript_required",
         "language",
         "paths",
     ]:
diff --git a/articles/views.py b/articles/views.py
index 68b4932..a8425c4 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -7,7 +7,6 @@ def index(request):
-    """Display latest articles for all sources."""
     context = {
         "sources": Source.objects.only("name", "link", "publication_type"),
     }
@@ -15,32 +14,29 @@ class SearchResultsView(ListView):
-    """Display sources + articles matching query."""
-
     model = Article
     fields = ["headline", "link", "body"]
     template_name = "articles/search_results.html"
 
     def get_context_data(self, **kwargs):
-        """Filter sources by query."""
-        query = self.request.GET.get("q")
-        # precaution: empty strings are also excluded on the client-side
-        if query in ("", " ", "  "):
-            query = "Please don't attempt to hack my website, thanks!"
-        regex = r"(?
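
Note on the rename (illustrative, not part of the patch): `javascript` becomes
`javascript_required` in the model, in `Source.to_dict()`, in the spider's
sitemap lookup, and in the test fixtures, so a stale key now fails loudly
instead of silently skipping JavaScript rendering. Below is a minimal sketch of
that contract in Python, using the fixture values from conftest.py; the
`make_sitemap` helper is a hypothetical stand-in for `Source.to_dict()` and is
not part of the patch:

    import re

    def make_sitemap() -> dict:
        # Hypothetical stand-in for Source.to_dict(), mirroring the keys the
        # patch keeps after the javascript -> javascript_required rename.
        return {
            "base_url": "https://example.com",
            "paths": ["world/"],
            "language": "en",
            "javascript_required": False,  # renamed flag read by Spider.get_links
            "filter": re.compile(r"[0-9]{4}/[0-9]{2}/[0-9]{2}"),
            "headline_selectors": {"tag": "h1", "attrs": {}},
            "summary_selectors": {"tag": "h2", "attrs": {}},
        }

    sitemap = make_sitemap()

    # A leftover "javascript" key would surface as a KeyError in the spider;
    # these assertions pin down the renamed contract.
    assert "javascript" not in sitemap
    assert sitemap["javascript_required"] is False
    assert sitemap["filter"].search("2024/01/13") is not None

Under this contract, the spider's `if self.sitemap["javascript_required"]:`
branch (spider.py above) reads as an explicit requirement check rather than an
ambiguous flag.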