Skip to content

Commit

Permalink
Refactor scraper
Browse files (browse the repository at this point in the history)
    * replace requests/requests-html with aiohttp
    * use custom requests mocker for async tests
  • Loading branch information
pi-sigma committed Mar 1, 2024
1 parent 7940e1d commit 453576f
Show file tree
Hide file tree
Showing 34 changed files with 374 additions and 314 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ COPY /requirements/* /app/requirements/
RUN pip install -r /app/requirements/dev.txt

# pyppeteer deps (https://stackoverflow.com/a/71935536)
RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt
# RUN xargs apt-get install -y --no-install-recommends < /app/requirements/pyppeteer_deps.txt


#
Expand Down
6 changes: 3 additions & 3 deletions articles/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 5.0.1 on 2024-02-18 12:41
# Generated by Django 5.0.1 on 2024-02-26 20:25

import django.db.models.deletion
import django.db.models.functions.text
Expand All @@ -24,7 +24,7 @@ class Migration(migrations.Migration):
),
),
(
"name",
"title",
models.CharField(
help_text="The name of the source", max_length=128, unique=True
),
Expand Down Expand Up @@ -97,7 +97,7 @@ class Migration(migrations.Migration):
),
],
options={
"ordering": [django.db.models.functions.text.Lower("name")],
"ordering": [django.db.models.functions.text.Lower("title")],
},
),
migrations.CreateModel(
Expand Down
8 changes: 4 additions & 4 deletions articles/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class Source(models.Model):
Metadata about the source of articles
Fields:
name (models.CharField): name of the source
title (models.CharField): name of the source
slug (models.SlugField): slug of the source
publication_type (models.CharField): the type of publication of the
source (newspaper, journal, blog...)
Expand All @@ -97,7 +97,7 @@ class Source(models.Model):
scraper
"""

name = models.CharField(
title = models.CharField(
max_length=128,
unique=True,
blank=False,
Expand Down Expand Up @@ -164,11 +164,11 @@ class Source(models.Model):

class Meta:
ordering = [
Lower("name"),
Lower("title"),
]

def __str__(self):
return f"{self.name}"
return f"{self.title}"

def to_dict(self):
sitemap = {
Expand Down
10 changes: 5 additions & 5 deletions articles/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,27 @@


@shared_task
def get_articles_for_source(source_title: str):
source: Source = Source.objects.get(name=source_title)
def get_articles_for_source(source_title: str) -> None:
source: Source = Source.objects.get(title=source_title)
sitemap = source.to_dict()
starting_urls = [
sitemap["base_url"] + path for path in sitemap["paths"]
]

spider = scraper.Spider(starting_urls, sitemap)
spider.run()
data = [json.loads(article) for article in spider.articles]
articles = [json.loads(article) for article in spider.articles]

Article.objects.bulk_create([
Article(
headline=article_data["headline"],
slug=article_data["slug"],
source=Source.objects.get(link=article_data["source_link"]),
source=Source.objects.get(url=article_data["source_link"]),
summary=article_data["summary"],
language=article_data["language"],
url=article_data["url"],
created_at=timezone.now(),
) for article_data in data
) for article_data in articles
], ignore_conflicts=True)


Expand Down
2 changes: 1 addition & 1 deletion articles/templates/articles/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<div class="row">
{% for source in sources %}
<div class="col-md-12 col-lg-6 col-xl-4 container-articles-outer">
<div class="source"><a class="source-link" href="{{ source.url }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div class="source"><a class="source-link" href="{{ source.url }}"><h5 class="source-title">{{ source.title }}</h5></a></div>
<div class="container-articles-inner">
<ul class="article-list">
{% for article in source.articles.all|slice:":10" %}
Expand Down
4 changes: 2 additions & 2 deletions articles/templates/articles/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ <h3>Search Results for: "{{ query }}"</h3>
<div class="row">
{% for source in sources %}
<div class="col-md-12 col-lg-6 col-xl-4 container-articles-outer">
<div><a class="source-link" href="{{ source.url }}"><h5 class="source-name">{{ source.name }}</h5></a></div>
<div><a class="source-link" href="{{ source.url }}"><h5 class="source-title">{{ source.title }}</h5></a></div>
<div class="container-articles-inner container-scrolling">
<ul>
{% for article in article_list %}
{% if article.source.name == source.name %}
{% if article.source.title == source.title %}
<li>
<div class="article-headline">
<a class="article-link main-text"
Expand Down
18 changes: 9 additions & 9 deletions articles/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from datetime import datetime
from typing import Dict, Union

import pytest
from django.test import Client
from django.utils import timezone
Expand All @@ -6,13 +9,10 @@
from ..models import Article, Source


#
# Test fixtures
#
@pytest.fixture
def source_values():
return {
"name": "Fake News",
"title": "Fake News",
"slug": "fake-news",
"url": "https://www.hocusbogus.com/",
"publication_type": PublicationType.newspaper,
Expand All @@ -33,7 +33,7 @@ def source_instance(source_values):
@pytest.fixture
def source_values_2():
return {
"name": "Alternative Facts",
"title": "Alternative Facts",
"slug": "alternative-facts",
"url": "https://www.nonsensical.org/",
"publication_type": PublicationType.newspaper,
Expand All @@ -52,7 +52,7 @@ def source_instance_2(source_values_2):


@pytest.fixture
def article_values(source_instance):
def article_values(source_instance) -> Dict[str, Union[datetime, str]]:
return {
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
Expand All @@ -64,7 +64,7 @@ def article_values(source_instance):


@pytest.fixture
def article_values_m(source_values):
def article_values_m(source_values) -> Dict[str, Union[Source, datetime, str]]:
return {
"headline": "A cow jumps over the moon",
"slug": "a-cow-jumps-over-the-moon",
Expand All @@ -81,7 +81,7 @@ def article_instance(article_values):


@pytest.fixture
def article_values_2(source_instance):
def article_values_2(source_instance) -> Dict[str, Union[datetime, str]]:
return {
"headline": "The moon is made of cheese",
"slug": "the-moon-is-made-of-cheese",
Expand All @@ -98,5 +98,5 @@ def article_instance_2(article_values_2):


@pytest.fixture
def client():
def client() -> Client:
return Client()
10 changes: 5 additions & 5 deletions articles/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
#
# Test Source
#
def test_create_source(source_values):
def test_create_source(source_values) -> None:
source = Source(**source_values)

for attr_name in source_values:
assert getattr(source, attr_name) == source_values.get(attr_name)


def test_source_to_dict(source_values):
def test_source_to_dict(source_values) -> None:
source = Source(**source_values)
sitemap = source.to_dict()

Expand All @@ -30,7 +30,7 @@ def test_source_to_dict(source_values):
assert regex.compile(source.regex) == sitemap["filter"]


def test_source_str_representation(source_values):
def test_source_str_representation(source_values) -> None:
source = Source(**source_values)

assert str(source) == "Fake News"
Expand All @@ -39,14 +39,14 @@ def test_source_str_representation(source_values):
#
# Test Article
#
def test_create_article(article_values_m):
def test_create_article(article_values_m) -> None:
article = Article(**article_values_m)

for attr_name in article_values_m:
assert getattr(article, attr_name) == article_values_m.get(attr_name)


def test_article_representation(article_values_m):
def test_article_representation(article_values_m) -> None:
article = Article(**article_values_m)

assert str(article) == (f"{article.source}: {article.headline}")
22 changes: 11 additions & 11 deletions articles/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Test IndexView
#
@pytest.mark.django_db
def test_index_view(client, source_instance, article_instance):
def test_index_view(client, source_instance, article_instance) -> None:
response = client.get(reverse("index"))

assert response.status_code == 200
Expand All @@ -18,12 +18,12 @@ def test_index_view(client, source_instance, article_instance):
doc = pq(html)

# assert that details of source are present in response content
source_name = doc.find(".source-name").text()
source_title = doc.find(".source-title").text()
source_link = doc.find(".source-link")
source_link_href = source_link.attr("href")


assert source_name == source_instance.name
assert source_title == source_instance.title
assert source_link.is_("a")
assert source_link_href == source_instance.url

Expand All @@ -50,7 +50,7 @@ def test_search_results_view(
article_values,
article_instance,
article_instance_2,
):
) -> None:
query_params = {"q": article_values["headline"][:5]}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")
Expand All @@ -59,11 +59,11 @@ def test_search_results_view(
assert response.status_code == 200

# assert that details of source matching query are present in response content
source_name = doc.find(".source-name").text()
source_title = doc.find(".source-title").text()
source_link = doc.find(".source-link")
source_link_href = source_link.attr("href")

assert source_name == source_instance.name
assert source_title == source_instance.title
assert source_link.is_("a")
assert source_link_href == source_instance.url

Expand All @@ -80,7 +80,7 @@ def test_search_results_view(
assert article_instance.summary in article_link_title

# assert that details of non-matching source are not found
assert source_instance_2.name not in html
assert source_instance_2.title not in html
assert source_instance_2.url not in html

# assert that details of non-matching article are not found
Expand All @@ -96,15 +96,15 @@ def test_search_result_not_found(
source_instance_2,
article_instance,
article_instance_2,
):
) -> None:
query_params = {"q": "test"}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")

assert response.status_code == 200

# assert that details of non-matching source are not found
assert source_instance.name not in html
assert source_instance.title not in html
assert source_instance.url not in html

# assert that details of non-matching article are not found
Expand All @@ -119,15 +119,15 @@ def test_search_result_substring(
source_instance,
article_instance,
article_values,
):
) -> None:
query_params = {"q": article_values["headline"][2:7]}
response = client.get(reverse("search"), query_params)
html = response.content.decode("utf-8")

assert response.status_code == 200

# assert that details of non-matching source are not found
assert source_instance.name not in html
assert source_instance.title not in html
assert source_instance.url not in html

# assert that details of non-matching article are not found
Expand Down
5 changes: 4 additions & 1 deletion articles/urls.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from typing import List

from django.urls import path
from django.urls.resolvers import URLPattern

from . import views

urlpatterns = [
urlpatterns: List[URLPattern] = [
path("", views.index, name="index"),
path("search", views.SearchResultsView.as_view(), name="search"),
]
11 changes: 7 additions & 4 deletions articles/views.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import re
from typing import Any, Dict, Optional

from django.http.request import HttpRequest
from django.http.response import HttpResponse
from django.shortcuts import render
from django.views.generic import ListView

from .models import Article, Source


def index(request):
def index(request: Optional[HttpRequest]) -> HttpResponse:
context = {
"sources": Source.objects.only("name", "url", "publication_type"),
"sources": Source.objects.only("title", "url", "publication_type"),
}
return render(request, "articles/index.html", context)

Expand All @@ -18,7 +21,7 @@ class SearchResultsView(ListView):
fields = ["headline", "url", "body"]
template_name = "articles/search_results.html"

def get_context_data(self, **kwargs):
def get_context_data(self, **kwargs) -> Dict[str, Any]:
"""Pre-filter sources on the basis of article headlines and queries"""

context = super().get_context_data(**kwargs)
Expand All @@ -28,7 +31,7 @@ def get_context_data(self, **kwargs):
regex = r"(?<![a-zA-Z])" + re.escape(query) + r"(?![a-rA-Rt-zT-Z])"
context.update(
{
"sources": Source.objects.only("name", "url", "publication_type")
"sources": Source.objects.only("title", "url", "publication_type")
.filter(articles__headline__iregex=regex)
.distinct(),
"query": query,
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ services:
build: &web_build
context: .
dockerfile: Dockerfile
restart: always
environment: &web_env
- DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
- SECRET_KEY=${SECRET_KEY}
Expand Down Expand Up @@ -43,7 +44,6 @@ services:
python manage.py runserver 0.0.0.0:8000"
ports:
- "8000:8000"
restart: always
depends_on:
- db
- redis
Expand Down
Loading

0 comments on commit 453576f

Please sign in to comment.