Skip to content

Commit

Permalink
Clean up models and search view, update tests and fixtures
Browse files (browse the repository at this point in the history)
  • Loading branch information
pi-sigma committed Jan 13, 2024
1 parent 3d480af commit 7b79d14
Show file tree
Hide file tree
Showing 15 changed files with 71 additions and 149 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ RUN apt-get update \
COPY . /usr/src/app
WORKDIR /usr/src/app

RUN python manage.py collectstatic --no-input
RUN python manage.py collectstatic --link --no-input

# patch
RUN ./patches/pyppeteer_patch.sh
Expand Down
14 changes: 2 additions & 12 deletions articles/management/commands/scrape.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
"""
Scheduler for scraping
Jobs are stored in a Django job store. Old jobs should be deleted when
the sources are changed; otherwise the scheduler will use the old information.
"""
import json
import logging
from datetime import datetime
Expand All @@ -23,11 +17,10 @@

logger = logging.getLogger(__name__)

INTERVAL = 480 # interval in minutes for scraping
SCRAPING_INTERVAL = 1 # minutes


def scrape(sitemap: dict):
"""Scrape newspapers/journals and store articles in database."""
Spider.crawl(sitemap)
data = [json.loads(article) for article in Spider.articles]

Expand Down Expand Up @@ -57,16 +50,13 @@ def delete_old_job_executions(max_age=604_800):


class Command(BaseCommand):
"""Create jobs."""

def handle(self, *args, **options):
scheduler = BlockingScheduler(
timezone=settings.TIME_ZONE,
executors={"default": ThreadPoolExecutor(1)},
)
scheduler.add_jobstore(DjangoJobStore(), "default")

# jobs for scraping newspapers/journals
sources = Source.objects.all()
for index, source in enumerate(sources):
source_id = f"Scraping {index + 1}: {source.name}"
Expand All @@ -77,7 +67,7 @@ def handle(self, *args, **options):
scrape,
args=[source.to_dict()],
trigger="interval",
minutes=INTERVAL,
minutes=SCRAPING_INTERVAL,
misfire_grace_time=600,
id=source_id,
max_instances=1,
Expand Down
48 changes: 15 additions & 33 deletions articles/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 4.2.5 on 2023-09-19 18:32
# Generated by Django 4.2.5 on 2024-01-13 14:26

import django.core.validators
import django.db.models.deletion
Expand Down Expand Up @@ -27,18 +27,15 @@ class Migration(migrations.Migration):
(
"name",
models.CharField(
help_text="The name of the source",
max_length=128,
unique=True,
verbose_name="name",
help_text="The name of the source", max_length=128, unique=True
),
),
(
"slug",
models.SlugField(
blank=True,
help_text="The slug of the source for SEO-friendly urls",
verbose_name="Slug",
max_length=255,
),
),
(
Expand All @@ -47,7 +44,6 @@ class Migration(migrations.Migration):
choices=[("newspaper/journal", "Newspaper or journal")],
help_text="The type of publication of the source",
max_length=24,
verbose_name="publication type",
),
),
(
Expand All @@ -57,23 +53,21 @@ class Migration(migrations.Migration):
choices=[("en", "English")],
help_text="The language of the article",
max_length=4,
verbose_name="language",
),
),
(
"link",
models.URLField(
help_text="The link to the source",
max_length=255,
unique=True,
validators=[django.core.validators.URLValidator],
verbose_name="link",
),
),
(
"paths",
models.JSONField(
help_text="A list of resource paths where the scraper will look for articles",
verbose_name="paths",
help_text="A list of resource paths where the scraper will look for articles"
),
),
(
Expand All @@ -82,31 +76,27 @@ class Migration(migrations.Migration):
blank=True,
help_text="Regular expression for filtering hyper-links found at the resource paths",
max_length=255,
verbose_name="regex",
),
),
(
"javascript",
"javascript_required",
models.BooleanField(
default=False,
help_text="Whether the parsing of articles by this source requires rendering of JavaScript",
verbose_name="render javascript",
),
),
(
"headline_selectors",
models.JSONField(
help_text="Information about the structure of the target page needed to extract the headline of articles published by this source",
verbose_name="headline selectors",
help_text="Information about the structure of the target page needed to extract the headline of articles published by this source"
),
),
(
"body_selectors",
"summary_selectors",
models.JSONField(
blank=True,
help_text="Information about the structure of the target page needed to extract the body of articles published by this source",
help_text="Information about the structure of the target page needed to extract the summary of articles published by this source",
null=True,
verbose_name="body selectors",
),
),
],
Expand All @@ -128,15 +118,15 @@ class Migration(migrations.Migration):
),
(
"headline",
models.TextField(
help_text="The headline of the article", verbose_name="headline"
models.CharField(
help_text="The headline of the article", max_length=200
),
),
(
"slug",
models.SlugField(
help_text="The slug of the article for SEO-friendly urls",
verbose_name="Slug",
max_length=255,
),
),
("created_at", models.DateTimeField()),
Expand All @@ -146,25 +136,17 @@ class Migration(migrations.Migration):
choices=[("en", "English")],
help_text="The language of the article",
max_length=4,
verbose_name="language",
),
),
(
"link",
models.URLField(
help_text="The link to the article",
unique=True,
verbose_name="link",
help_text="The link to the article", max_length=255, unique=True
),
),
(
"body",
models.TextField(
blank=True,
help_text="The body of the article",
null=True,
verbose_name="body",
),
"summary",
models.TextField(blank=True, help_text="A summary of the article"),
),
(
"source",
Expand Down

This file was deleted.

35 changes: 13 additions & 22 deletions articles/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,33 +30,32 @@ class Article(models.Model):
__str__: string representation for the admin area
"""

headline = models.TextField(
_("headline"),
headline = models.CharField(
max_length=200,
help_text=_("The headline of the article"),
)
slug = models.SlugField(
_("Slug"),
max_length=255,
help_text=_("The slug of the article for SEO-friendly urls"),
)
created_at = models.DateTimeField()
language = models.CharField(
_("language"),
max_length=4,
choices=Language.choices,
blank=False,
help_text=_("The language of the article"),
)
link = models.URLField(
_("link"), unique=True, help_text=_("The link to the article")
max_length=255,
unique=True,
help_text=_("The link to the article"),
)
summary = models.TextField(
_("summary"),
null=True,
blank=True,
help_text=_("A summary of the article"),
)
source = models.ForeignKey(
"Source",
to="Source",
on_delete=models.CASCADE,
related_name="articles",
help_text=_("The source where the article is published"),
Expand Down Expand Up @@ -85,8 +84,8 @@ class Source(models.Model):
the base url to tell the scraper where to look for hyper-links
('https://example.com/path1/')
regex (models.CharField): a regular expression for filtering links
javascript (models.BooleanField): True if JavaScript must be rendered before
data can be extracted from the webpage, False otherwise
javascript_required (models.BooleanField): True if JavaScript must be rendered
before data can be extracted from the webpage, False otherwise
headline_selectors (models.JSONField): information about the CSS selectors
needed to extract the headline of an article
summary_selectors (models.JSONField): information about the CSS selectors
Expand All @@ -99,71 +98,63 @@ class Source(models.Model):
"""

name = models.CharField(
_("name"),
max_length=128,
unique=True,
blank=False,
help_text=_("The name of the source"),
)
slug = models.SlugField(
_("Slug"),
max_length=255,
blank=True,
help_text=_("The slug of the source for SEO-friendly urls"),
)
publication_type = models.CharField(
_("publication type"),
max_length=24,
choices=PublicationType.choices,
blank=False,
help_text=_("The type of publication of the source"),
)
language = models.CharField(
_("language"),
max_length=4,
choices=Language.choices,
blank=True,
help_text=_("The language of the article"),
)
link = models.URLField(
_("link"),
unique=True,
max_length=255,
validators=[URLValidator],
help_text=_("The link to the source"),
)
#
# info related to scraping
#
paths = models.JSONField(
_("paths"),
help_text=_(
"A list of resource paths where the scraper will look for articles"
),
)
regex = models.CharField(
_("regex"),
max_length=255,
blank=True,
help_text=(
"Regular expression for filtering hyper-links found at the resource paths"
),
)
javascript = models.BooleanField(
_("render javascript"),
javascript_required = models.BooleanField(
default=False,
help_text=_(
"Whether the parsing of articles by this source requires rendering "
"of JavaScript"
),
)
headline_selectors = models.JSONField(
_("headline selectors"),
help_text=_(
"Information about the structure of the target page needed to extract "
"the headline of articles published by this source"
),
)
summary_selectors = models.JSONField(
_("summary selectors"),
null=True,
blank=True,
help_text=_(
Expand All @@ -185,7 +176,7 @@ def to_dict(self):
"base_url": self.link,
"paths": self.paths,
"language": self.language,
"javascript": self.javascript,
"javascript_required": self.javascript_required,
"filter": regex.compile(self.regex),
"headline_selectors": self.headline_selectors,
"summary_selectors": self.summary_selectors,
Expand Down
1 change: 0 additions & 1 deletion articles/scraper/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""Provides access to the parsing function(s)"""
import json
import logging
from typing import Optional
Expand Down
Loading

0 comments on commit 7b79d14

Please sign in to comment.