Skip to content

Commit

Permalink
Replace BeautifulSoup with PyQuery
Browse files Browse the repository at this point in the history
  • Loading branch information
pi-sigma committed Mar 9, 2024
1 parent 453576f commit 75d9b43
Show file tree
Hide file tree
Showing 31 changed files with 541 additions and 388 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/django.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ jobs:
SECRET_KEY: dummy
DJANGO_ENV: BASE
SECURE_SSL_REDIRECT: False
run: pytest articles/tests/
run: pytest

#
# Migrations
#
Expand Down
3 changes: 0 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ RUN pip install pip -U
COPY /requirements/* /app/requirements/
RUN pip install -r /app/requirements/dev.txt

# pyppeteer deps (https://stackoverflow.com/a/71935536)
# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt


#
# Final
Expand Down
30 changes: 21 additions & 9 deletions articles/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 5.0.1 on 2024-02-26 20:25
# Generated by Django 5.0.1 on 2024-03-01 20:14

import django.db.models.deletion
import django.db.models.functions.text
Expand Down Expand Up @@ -63,7 +63,7 @@ class Migration(migrations.Migration):
(
"paths",
models.JSONField(
help_text="A list of resource paths where the scraper will look for articles"
help_text="List of resource paths where the scraper will look for articles"
),
),
(
Expand All @@ -82,22 +82,34 @@ class Migration(migrations.Migration):
),
),
(
"headline_selectors",
"headline_search_params_find",
models.JSONField(
help_text="Information about the structure of the target page needed to extract the headline of articles published by this source"
help_text="Selectors for extracting the headline of articles"
),
),
(
"summary_selectors",
"headline_search_params_remove",
models.JSONField(
blank=True,
help_text="Information about the structure of the target page needed to extract the summary of articles published by this source",
null=True,
help_text="Selectors for HTML elements that need to be removed from the headline"
),
),
(
"summary_search_params_find",
models.JSONField(
default=[],
help_text="Selectors for extracting the summary of articles",
),
),
(
"summary_search_params_remove",
models.JSONField(
default=[],
help_text="Selectors for HTML elements that need to be removed from the summary",
),
),
],
options={
"ordering": [django.db.models.functions.text.Lower("title")],
"ordering": [django.db.models.functions.text.Lower("name")],
},
),
migrations.CreateModel(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Generated by Django 5.0.1 on 2024-03-03 10:29

import django.db.models.functions.text
from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("articles", "0001_initial"),
]

operations = [
migrations.AlterModelOptions(
name="source",
options={"ordering": [django.db.models.functions.text.Lower("title")]},
),
migrations.AlterField(
model_name="article",
name="headline",
field=models.CharField(
help_text="The headline of the article", max_length=512
),
),
migrations.AlterField(
model_name="article",
name="slug",
field=models.SlugField(
help_text="The slug of the article for SEO-friendly urls",
max_length=1024,
),
),
migrations.AlterField(
model_name="article",
name="url",
field=models.URLField(
help_text="The link to the article", max_length=512, unique=True
),
),
migrations.AlterField(
model_name="source",
name="summary_search_params_find",
field=models.JSONField(
default=str,
help_text="Selectors for extracting the summary of articles",
),
),
migrations.AlterField(
model_name="source",
name="summary_search_params_remove",
field=models.JSONField(
default=list,
help_text="Selectors for HTML elements that need to be removed from the summary",
),
),
migrations.AlterField(
model_name="source",
name="url",
field=models.URLField(
help_text="The url of the source", max_length=512, unique=True
),
),
]
66 changes: 40 additions & 26 deletions articles/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import regex
from django.core.validators import URLValidator
from django.db import models
from django.db.models.functions import Lower
from django.utils.translation import gettext_lazy as _
Expand All @@ -14,14 +13,13 @@ class Article(models.Model):
Fields:
headline (models.CharField): headline of the article
slug (models.SlugField): slug of the article (generated from headline)
summary (models.TextField): short paragraph summarizing the article
created_at (models.DateTimeField): date the article was added to the
database. Mostly corresponds to actual publication date, though
this can vary. Actual dates are not used because their format
varies a lot, hence they are difficult to parse.
language (models.CharField): the language of the article
url (models.URLField): link to the article
body (models.TextField): either the actual body of the article,
or a short descriptive paragraph
Relations:
source (ForeignKey): the source of the article
Expand All @@ -31,13 +29,17 @@ class Article(models.Model):
"""

headline = models.CharField(
max_length=200,
max_length=512,
help_text=_("The headline of the article"),
)
slug = models.SlugField(
max_length=255,
max_length=1024,
help_text=_("The slug of the article for SEO-friendly urls"),
)
summary = models.TextField(
blank=True,
help_text=_("A summary of the article"),
)
created_at = models.DateTimeField()
language = models.CharField(
max_length=4,
Expand All @@ -46,14 +48,10 @@ class Article(models.Model):
help_text=_("The language of the article"),
)
url = models.URLField(
max_length=255,
max_length=512,
unique=True,
help_text=_("The link to the article"),
)
summary = models.TextField(
blank=True,
help_text=_("A summary of the article"),
)
source = models.ForeignKey(
to="Source",
on_delete=models.CASCADE,
Expand All @@ -65,7 +63,7 @@ class Meta:
ordering = ("-created_at",)
indexes = [models.Index(fields=["headline", "url"])]

def __str__(self):
def __str__(self) -> str:
return f"{self.source}: {self.headline}"


Expand All @@ -74,7 +72,7 @@ class Source(models.Model):
Metadata about the source of articles
Fields:
title (models.CharField): name of the source
title (models.CharField): name/title of the source
slug (models.SlugField): slug of the source
publication_type (models.CharField): the type of publication of the
source (newspaper, journal, blog...)
Expand Down Expand Up @@ -122,15 +120,15 @@ class Source(models.Model):
)
url = models.URLField(
unique=True,
max_length=255,
max_length=512,
help_text=_("The url of the source"),
)
#
# info related to scraping
# data related to scraping
#
paths = models.JSONField(
help_text=_(
"A list of resource paths where the scraper will look for articles"
"List of resource paths where the scraper will look for articles"
),
)
regex = models.CharField(
Expand All @@ -147,18 +145,26 @@ class Source(models.Model):
"of JavaScript"
),
)
headline_selectors = models.JSONField(
headline_search_params_find = models.JSONField(
help_text=_(
"Information about the structure of the target page needed to extract "
"the headline of articles published by this source"
"Selectors for extracting the headline of articles"
),
)
summary_selectors = models.JSONField(
null=True,
blank=True,
headline_search_params_remove = models.JSONField(
help_text=_(
"Selectors for HTML elements that need to be removed from the headline"
),
)
summary_search_params_find = models.JSONField(
default=str,
help_text=_(
"Selectors for extracting the summary of articles"
),
)
summary_search_params_remove = models.JSONField(
default=list,
help_text=_(
"Information about the structure of the target page needed to extract "
"the summary of articles published by this source"
"Selectors for HTML elements that need to be removed from the summary"
),
)

Expand All @@ -167,7 +173,7 @@ class Meta:
Lower("title"),
]

def __str__(self):
def __str__(self) -> str:
return f"{self.title}"

def to_dict(self):
Expand All @@ -177,7 +183,15 @@ def to_dict(self):
"language": self.language,
"javascript_required": self.javascript_required,
"filter": regex.compile(self.regex),
"headline_selectors": self.headline_selectors,
"summary_selectors": self.summary_selectors,
"search_params": {
"headline": {
"find": self.headline_search_params_find,
"remove": self.headline_search_params_remove,
},
"summary": {
"find": self.summary_search_params_find,
"remove": self.summary_search_params_remove,
},
},
}
return sitemap
43 changes: 32 additions & 11 deletions articles/tasks.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import json
import logging

from celery import group, shared_task
from django.db.utils import DatabaseError
from django.utils import timezone

import scraper
from scraper.tasks import magazines

from .models import Article, Source

# Module-level logger. Use the module's ``__name__`` (not the literal string
# "__name__") so the logger is named after this module and picks up
# per-module configuration from the logging hierarchy.
logger = logging.getLogger(__name__)


@shared_task
def get_articles_for_source(source_title: str) -> None:
Expand All @@ -21,17 +25,34 @@ def get_articles_for_source(source_title: str) -> None:
spider.run()
articles = [json.loads(article) for article in spider.articles]

Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in articles
], ignore_conflicts=True)
# try bulk create, revert to individual db saves in case of error
try:
Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in articles
], ignore_conflicts=True)
except DatabaseError as exc:
logger.error("Bulk create failed", exc_info=exc)
for article_data in articles:
try:
Article.objects.create(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
)
except DatabaseError as exc:
logger.error("DB save failed for %s", article_data["url"], exc_info=exc)


@shared_task
Expand Down
12 changes: 8 additions & 4 deletions articles/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def source_values():
"paths": ["world/"],
"javascript_required": False,
"regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
"headline_selectors": {"tag": "h1", "attrs": {}},
"summary_selectors": {"tag": "h2", "attrs": {}},
"headline_search_params_find": "h1",
"headline_search_params_remove": [],
"summary_search_params_find": "",
"summary_search_params_remove": []
}


Expand All @@ -41,8 +43,10 @@ def source_values_2():
"paths": ["world/"],
"javascript_required": False,
"regex": "[0-9]{4}/[0-9]{2}/[0-9]{2}",
"headline_selectors": {"tag": "h1", "attrs": {}},
"summary_selectors": {"tag": "h2", "attrs": {}},
"headline_search_params_find": "h1",
"headline_search_params_remove": [],
"summary_search_params_find": "",
"summary_search_params_remove": []
}


Expand Down
Empty file.
Loading

0 comments on commit 75d9b43

Please sign in to comment.