Skip to content

Commit

Permalink
Refactor scraper, add tests, fix dockerfile
Browse files Browse the repository at this point in the history
    * set event_loop in __init__ on spider instance and pass it to
      session
    * update pyppeteer dependencies for Debian bookworm
    * add unit + integration tests for scraper
    * add development scripts for better iteration
  • Loading branch information
pi-sigma committed Feb 18, 2024
1 parent 4c52eae commit 7940e1d
Show file tree
Hide file tree
Showing 38 changed files with 12,980 additions and 112 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ media
staticfiles/
dev/
logs/*.log*
static/

### celery etc.
celerybeat-schedule
Expand Down
13 changes: 6 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# Backend build
# Base
#
FROM python:3.11-slim-bookworm AS backend
FROM python:3.11-slim-bookworm AS base

# build deps
RUN apt-get update && apt-get upgrade && apt-get install -y --no-install-recommends \
Expand All @@ -13,27 +13,26 @@ RUN pip install pip -U

# install requirements
COPY /requirements/* /app/requirements/
RUN pip install -r /app/requirements/base.txt
RUN pip install -r /app/requirements/dev.txt

# pyppeteer deps (https://stackoverflow.com/a/71935536)
RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt


#
# Final build
# Final
#
FROM python:3.11-slim-bookworm AS final

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
ENV DJANGO_ENV "BASE"

RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
postgresql-client

# copy backend deps
COPY --from=backend /usr/local/lib/python3.11 /usr/local/lib/python3.11
COPY --from=backend /usr/local/bin/ /usr/local/bin/
COPY --from=base /usr/local/lib/python3.11 /usr/local/lib/python3.11
COPY --from=base /usr/local/bin/ /usr/local/bin/

COPY . /app
WORKDIR /app
Expand Down
16 changes: 6 additions & 10 deletions articles/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Generated by Django 4.2.5 on 2024-01-13 14:26
# Generated by Django 5.0.1 on 2024-02-18 12:41

import django.core.validators
import django.db.models.deletion
import django.db.models.functions.text
from django.db import migrations, models
Expand Down Expand Up @@ -56,12 +55,9 @@ class Migration(migrations.Migration):
),
),
(
"link",
"url",
models.URLField(
help_text="The link to the source",
max_length=255,
unique=True,
validators=[django.core.validators.URLValidator],
help_text="The url of the source", max_length=255, unique=True
),
),
(
Expand Down Expand Up @@ -139,7 +135,7 @@ class Migration(migrations.Migration):
),
),
(
"link",
"url",
models.URLField(
help_text="The link to the article", max_length=255, unique=True
),
Expand All @@ -162,8 +158,8 @@ class Migration(migrations.Migration):
"ordering": ("-created_at",),
"indexes": [
models.Index(
fields=["headline", "link"],
name="articles_ar_headlin_b3b21f_idx",
fields=["headline", "url"],
name="articles_ar_headlin_4f6c91_idx",
)
],
},
Expand Down
15 changes: 7 additions & 8 deletions articles/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Article(models.Model):
this can vary. Actual dates are not used because their format
varies a lot, hence they are difficult to parse.
language (models.CharField): the language of the article
link (models.URLField): link to the article
url (models.URLField): link to the article
body (models.TextField): either the actual body of the article,
or a short descriptive paragraph
Expand All @@ -45,7 +45,7 @@ class Article(models.Model):
blank=False,
help_text=_("The language of the article"),
)
link = models.URLField(
url = models.URLField(
max_length=255,
unique=True,
help_text=_("The link to the article"),
Expand All @@ -63,7 +63,7 @@ class Article(models.Model):

class Meta:
ordering = ("-created_at",)
indexes = [models.Index(fields=["headline", "link"])]
indexes = [models.Index(fields=["headline", "url"])]

def __str__(self):
return f"{self.source}: {self.headline}"
Expand All @@ -79,7 +79,7 @@ class Source(models.Model):
publication_type (models.CharField): the type of publication of the
source (newspaper, journal, blog...)
language (models.CharField): the language of the source
link (models.URLField): the base url of the source
url (models.URLField): the base url of the source
paths (models.JSONField): a list of paths, each of which is appended to
the base url to tell the scraper where to look for hyper-links
('https://example.com/path1/')
Expand Down Expand Up @@ -120,11 +120,10 @@ class Source(models.Model):
blank=True,
help_text=_("The language of the article"),
)
link = models.URLField(
url = models.URLField(
unique=True,
max_length=255,
validators=[URLValidator],
help_text=_("The link to the source"),
help_text=_("The url of the source"),
)
#
# info related to scraping
Expand Down Expand Up @@ -173,7 +172,7 @@ def __str__(self):

def to_dict(self):
sitemap = {
"base_url": self.link,
"base_url": self.url,
"paths": self.paths,
"language": self.language,
"javascript_required": self.javascript_required,
Expand Down
4 changes: 2 additions & 2 deletions articles/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_articles_for_source(source_title: str):
]

spider = scraper.Spider(starting_urls, sitemap)
scraper.run(spider)
spider.run()
data = [json.loads(article) for article in spider.articles]

Article.objects.bulk_create([
Expand All @@ -28,7 +28,7 @@ def get_articles_for_source(source_title: str):
source=Source.objects.get(link=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
link=article_data["link"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in data
], ignore_conflicts=True)
Expand Down
4 changes: 2 additions & 2 deletions articles/templates/articles/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
<div class="row">
{% for source in sources %}
<div class="col-md-12 col-lg-6 col-xl-4 container-articles-outer">
<div class="source"><a class="source-link" href="{{ source.link }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div class="source"><a class="source-link" href="{{ source.url }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div class="container-articles-inner">
<ul class="article-list">
{% for article in source.articles.all|slice:":10" %}
<li>
<div class="article-headline">
<a class="article-link main-text"
title="{{article.headline}}&#10;&#10;{{ article.summary|slice:":200"}}...&#10;&#10;{{article.created_at|naturaltime}}"
href="{{ article.link }}">{{ article.headline}}
href="{{ article.url }}">{{ article.headline}}
</a>
</div>
</li>
Expand Down
4 changes: 2 additions & 2 deletions articles/templates/articles/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ <h3>Search Results for: "{{ query }}"</h3>
<div class="row">
{% for source in sources %}
<div class="col-md-12 col-lg-6 col-xl-4 container-articles-outer">
<div><a class="source-link" href="{{ source.link }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div><a class="source-link" href="{{ source.url }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div class="container-articles-inner container-scrolling">
<ul>
{% for article in article_list %}
Expand All @@ -21,7 +21,7 @@ <h3>Search Results for: "{{ query }}"</h3>
<div class="article-headline">
<a class="article-link main-text"
title="{{ article.headline }}&#10;&#10;{{ article.summary|slice:":200" }}...&#10;&#10;{{ article.created_at|naturaltime }}"
href="{{ article.link }}">{{ article.headline}}
href="{{ article.url }}">{{ article.headline}}
</a>
</div>
</li>
Expand Down
10 changes: 5 additions & 5 deletions articles/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def source_values():
return {
"name": "Fake News",
"slug": "fake-news",
"link": "https://www.hocusbogus.com/",
"url": "https://www.hocusbogus.com/",
"publication_type": PublicationType.newspaper,
"language": Language.en,
"paths": ["world/"],
Expand All @@ -35,7 +35,7 @@ def source_values_2():
return {
"name": "Alternative Facts",
"slug": "alternative-facts",
"link": "https://www.nonsensical.org/",
"url": "https://www.nonsensical.org/",
"publication_type": PublicationType.newspaper,
"language": Language.en,
"paths": ["world/"],
Expand All @@ -57,7 +57,7 @@ def article_values(source_instance):
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
"summary": "Lorem dolor sit amet...",
"link": "https://www.hocusbogus.com/2022/05/08/foobar",
"url": "https://www.hocusbogus.com/2022/05/08/foobar",
"source": source_instance,
"created_at": timezone.localtime(),
}
Expand All @@ -69,7 +69,7 @@ def article_values_m(source_values):
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
"summary": "Lorem dolor sit amet...",
"link": "https://www.hocusbogus.com/2022/05/08/foobar",
"url": "https://www.hocusbogus.com/2022/05/08/foobar",
"source": Source(**source_values),
"created_at": timezone.localtime(),
}
Expand All @@ -86,7 +86,7 @@ def article_values_2(source_instance):
"headline": "The moon is made of cheese",
"slug": "the-moon-is-made-of-cheese",
"summary": "Consectetur adipiscing elit, sed do eiusmod tempor incididunt...",
"link": "https://www.nonsensical.org/2022/05/08/baz",
"url": "https://www.nonsensical.org/2022/05/08/baz",
"source": source_instance,
"created_at": timezone.localtime(),
}
Expand Down
2 changes: 1 addition & 1 deletion articles/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_source_to_dict(source_values):
]:
assert getattr(source, attr_name) == sitemap.get(attr_name)

assert source.link == sitemap["base_url"]
assert source.url == sitemap["base_url"]
assert regex.compile(source.regex) == sitemap["filter"]


Expand Down
24 changes: 0 additions & 24 deletions articles/tests/test_parsers.py

This file was deleted.

21 changes: 11 additions & 10 deletions articles/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def test_index_view(client, source_instance, article_instance):
source_link = doc.find(".source-link")
source_link_href = source_link.attr("href")


assert source_name == source_instance.name
assert source_link.is_("a")
assert source_link_href == source_instance.link
assert source_link_href == source_instance.url

# assert that details of article are present in response content
article_headline = doc.find(".article-headline").text()
Expand All @@ -34,7 +35,7 @@ def test_index_view(client, source_instance, article_instance):

assert article_headline == article_instance.headline
assert article_link.is_("a")
assert article_link_href == article_instance.link
assert article_link_href == article_instance.url
assert article_instance.headline in article_link_title


Expand Down Expand Up @@ -64,7 +65,7 @@ def test_search_results_view(

assert source_name == source_instance.name
assert source_link.is_("a")
assert source_link_href == source_instance.link
assert source_link_href == source_instance.url

# assert that details of article matching query are present in response content
article_headline = doc.find(".article-headline").text()
Expand All @@ -74,17 +75,17 @@ def test_search_results_view(

assert article_headline == article_instance.headline
assert article_link.is_("a")
assert article_link_href == article_instance.link
assert article_link_href == article_instance.url
assert article_instance.headline in article_link_title
assert article_instance.summary in article_link_title

# assert that details of non-matching source are not found
assert source_instance_2.name not in html
assert source_instance_2.link not in html
assert source_instance_2.url not in html

# assert that details of non-matching article are not found
assert article_instance_2.headline not in html
assert article_instance_2.link not in html
assert article_instance_2.url not in html
assert article_instance_2.summary not in html


Expand All @@ -104,11 +105,11 @@ def test_search_result_not_found(

# assert that details of non-matching source are not found
assert source_instance.name not in html
assert source_instance.link not in html
assert source_instance.url not in html

# assert that details of non-matching article are not found
assert article_instance.headline not in html
assert article_instance.link not in html
assert article_instance.url not in html
assert article_instance.summary not in html


Expand All @@ -127,9 +128,9 @@ def test_search_result_substring(

# assert that details of non-matching source are not found
assert source_instance.name not in html
assert source_instance.link not in html
assert source_instance.url not in html

# assert that details of non-matching article are not found
assert article_instance.headline not in html
assert article_instance.link not in html
assert article_instance.url not in html
assert article_instance.summary not in html
6 changes: 3 additions & 3 deletions articles/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@

def index(request):
context = {
"sources": Source.objects.only("name", "link", "publication_type"),
"sources": Source.objects.only("name", "url", "publication_type"),
}
return render(request, "articles/index.html", context)


class SearchResultsView(ListView):
model = Article
fields = ["headline", "link", "body"]
fields = ["headline", "url", "body"]
template_name = "articles/search_results.html"

def get_context_data(self, **kwargs):
Expand All @@ -28,7 +28,7 @@ def get_context_data(self, **kwargs):
regex = r"(?<![a-zA-Z])" + re.escape(query) + r"(?![a-rA-Rt-zT-Z])"
context.update(
{
"sources": Source.objects.only("name", "link", "publication_type")
"sources": Source.objects.only("name", "url", "publication_type")
.filter(articles__headline__iregex=regex)
.distinct(),
"query": query,
Expand Down
2 changes: 1 addition & 1 deletion nous_aggregator/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@
# https://docs.djangoproject.com/en/4.0/howto/static-files/

STATIC_URL = "static/"
STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles')
STATIC_ROOT = os.path.join(BASE_DIR, 'static')


# Default primary key field type
Expand Down
Loading

0 comments on commit 7940e1d

Please sign in to comment.