Skip to content

Commit

Permalink
(refactor) Replace APScheduler with Redis + Celery
Browse files Browse the repository at this point in the history
  * introduce redis + celery for running background tasks
  * remove apscheduler and pyppeteer patch
  * refactor scraper
  • Loading branch information
pi-sigma committed Feb 4, 2024
1 parent b8ced94 commit db61145
Show file tree
Hide file tree
Showing 25 changed files with 390 additions and 670 deletions.
13 changes: 6 additions & 7 deletions .github/workflows/django.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
name: Django CI

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
on: [push]

jobs:
#
Expand Down Expand Up @@ -58,7 +54,9 @@ jobs:
pip install -r requirements/ci.txt
- name: Run tests
env:
DJANGO_ENV: "CI"
SECRET_KEY: dummy
DJANGO_ENV: BASE
SECURE_SSL_REDIRECT: False
run: pytest articles/tests/
#
# Migrations
Expand Down Expand Up @@ -91,4 +89,5 @@ jobs:
run: |
python manage.py makemigrations --check --dry-run
env:
DJANGO_ENV: "CI"
SECRET_KEY: dummy
DJANGO_ENV: BASE
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ staticfiles/
dev/
logs/*.log*

### celery etc.
celerybeat-schedule

### Unit test / coverage reports
htmlcov/
.tox/
Expand Down
16 changes: 16 additions & 0 deletions articles/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Names of the news sources to scrape, grouped by language code.
# The names are used as look-up keys for ``Source.name`` by the scraping
# tasks, so they have to match the stored source names exactly.
sources = {
    "en": [
        "Al Jazeera",
        "Associated Press",
        "Christian Science Monitor",
        "Consortium News",
        "Current Affairs",
        "New York Times",
        "NPR",
        "Reuters",
        "The Atlantic",
        "The Intercept",
        "UPI",
        "Wall Street Journal",
    ],
}
97 changes: 0 additions & 97 deletions articles/management/commands/scrape.py

This file was deleted.

Empty file removed articles/scraper/__init__.py
Empty file.
44 changes: 44 additions & 0 deletions articles/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json

from celery import group, shared_task
from django.utils import timezone

import scraper

from .data import sources
from .models import Article, Source


@shared_task
def get_articles_for_source(source_title: str) -> None:
    """Scrape a single news source and store the articles that were found.

    The ``Source`` named ``source_title`` is loaded, its sitemap is turned
    into a list of start URLs, the spider is run over them and every scraped
    item is inserted as an ``Article`` in one bulk query.

    Args:
        source_title: Value of ``Source.name`` identifying the source.

    Raises:
        Source.DoesNotExist: If no source has that name, or if a scraped
            article carries a ``source_link`` that matches no source.
        KeyError: If the sitemap or a scraped article lacks an expected key.
    """
    source: Source = Source.objects.get(name=source_title)
    sitemap = source.to_dict()
    starting_urls = [sitemap["base_url"] + path for path in sitemap["paths"]]

    spider = scraper.Spider(starting_urls, sitemap)
    scraper.run(spider)
    # The spider collects its articles as JSON strings.
    data = [json.loads(article) for article in spider.articles]

    # Resolve every distinct source link once instead of issuing one
    # ``Source`` query per scraped article.
    sources_by_link = {}
    for article_data in data:
        link = article_data["source_link"]
        if link not in sources_by_link:
            sources_by_link[link] = Source.objects.get(link=link)

    Article.objects.bulk_create(
        [
            Article(
                headline=article_data["headline"],
                slug=article_data["slug"],
                source=sources_by_link[article_data["source_link"]],
                summary=article_data["summary"],
                language=article_data["language"],
                link=article_data["link"],
                created_at=timezone.now(),
            )
            for article_data in data
        ],
        # Articles that are already stored (conflicting rows) are skipped
        # rather than aborting the whole insert.
        ignore_conflicts=True,
    )


@shared_task
def get_articles(language: str) -> None:
    """Queue one ``get_articles_for_source`` task per source of a language.

    The subtasks are dispatched as a Celery group and this task returns
    immediately; it does not wait for, or collect, their results. Each
    subtask persists its own articles, so there is nothing to gather here.

    Args:
        language: Language code used as key into ``sources`` (e.g. ``"en"``).

    Raises:
        KeyError: If ``language`` has no entry in ``sources``.
    """
    task_group = group(
        get_articles_for_source.s(source_title=title) for title in sources[language]
    )
    # Fire and forget: polling ``ready()`` right after dispatch is a race that
    # is practically never true, and calling ``get()`` on subtask results from
    # inside a task is rejected by Celery because it can deadlock the worker.
    task_group.apply_async()
3 changes: 2 additions & 1 deletion articles/tests/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json

from scraper.parser import parse

from ..constants import Language
from ..models import Source
from ..scraper.parser import parse


def test_parse(source_values):
Expand Down
3 changes: 3 additions & 0 deletions nous_aggregator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Import the Celery app together with the project package and expose it as
# ``celery_app``.
# NOTE(review): presumably this ensures the app is loaded when Django starts so
# that ``@shared_task`` functions bind to it — confirm against the Celery docs.
from .celery import app as celery_app

__all__ = ("celery_app",)
11 changes: 11 additions & 0 deletions nous_aggregator/celery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os

from celery import Celery

# Fall back to the local settings module when the environment does not name
# one; ``setdefault`` leaves an already exported DJANGO_SETTINGS_MODULE alone.
# NOTE(review): the fallback is the *local* settings module — confirm that
# deployed workers export their own DJANGO_SETTINGS_MODULE.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nous_aggregator.settings.local")

# The project's Celery application.
app = Celery("nous_aggregator")

# Take the Celery configuration from the Django settings object; with the
# "CELERY" namespace the relevant settings are the ``CELERY_``-prefixed ones
# (e.g. CELERY_BROKER_URL, CELERY_BEAT_SCHEDULE).
app.config_from_object("django.conf:settings", namespace="CELERY")

# Discover task modules automatically instead of registering them by hand.
app.autodiscover_tasks()
6 changes: 2 additions & 4 deletions nous_aggregator/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@
DJANGO_ENV = config('DJANGO_ENV', default="")

match DJANGO_ENV:
case "CI":
from .ci import *
case "BASE":
from .base import *
case "LOCAL":
from .local import *
case "STAGING":
from .staging import *
case "":
from .production import *
55 changes: 47 additions & 8 deletions nous_aggregator/settings/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,39 @@
import os
from pathlib import Path

from decouple import Csv, config

# Build paths inside the project like this: BASE_DIR / 'subdir'.
# (modified because settings files are nested one level deeper)
BASE_DIR = Path(__file__).resolve().parent.parent.parent

# Core settings are read through decouple's ``config`` and fall back to the
# given default when the value is not provided.
# NOTE(review): SECRET_KEY falls back to an empty string — confirm every
# deployment sets it (the CI workflow passes ``SECRET_KEY: dummy``).
SECRET_KEY = config("SECRET_KEY", default="")

DEBUG = config("DEBUG", default=False, cast=bool)

# Comma-separated lists, parsed by the ``Csv`` cast.
ALLOWED_HOSTS = config("ALLOWED_HOSTS", default="", cast=Csv())

CSRF_TRUSTED_ORIGINS = config("CSRF_TRUSTED_ORIGINS", default="", cast=Csv())

# The HTTPS-related flags default to the strict value (True) and have to be
# switched off explicitly, as the CI workflow does for SECURE_SSL_REDIRECT.
SESSION_COOKIE_SECURE = config("SESSION_COOKIE_SECURE", default=True, cast=bool)

CSRF_COOKIE_SECURE = config("CSRF_COOKIE_SECURE", default=True, cast=bool)

SECURE_SSL_REDIRECT = config("SECURE_SSL_REDIRECT", default=True, cast=bool)

# NOTE(review): presumably the app runs behind a proxy that sets
# X-Forwarded-Proto — confirm, since Django trusts this header as given.
SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https')

# Single default database; every connection parameter can be overridden
# individually and otherwise points at a local PostgreSQL instance.
DATABASES = {
"default": {
"ENGINE": config("DATABASE_ENGINE", default="django.db.backends.postgresql"),
"NAME": config("DATABASE_NAME", default="postgres"),
"USER": config("DATABASE_USER", default="postgres"),
"PASSWORD": config("DATABASE_PASSWORD", default="postgres"),
"HOST": config("DATABASE_HOST", default="localhost"),
"PORT": config("DATABASE_PORT", default=5432, cast=int),
},
}


# Application definition

Expand All @@ -17,10 +45,8 @@
"django.contrib.messages",
"django.contrib.staticfiles",
"django.contrib.humanize",
# My Apps
# nous_aggregator apps
"articles.apps.ArticlesConfig",
# Third Party Apps
"django_apscheduler",
]

MIDDLEWARE = [
Expand All @@ -39,11 +65,6 @@
"127.0.0.1",
]

# Timeouts (connection timeout, read timeout) in seconds for requests
# made with the requests library
REQUESTS_TIMEOUT = (6, 18)
REQUESTS_TIMEOUT_JS = (6, 60)

TEMPLATES = [
{
"BACKEND": "django.template.backends.django.DjangoTemplates",
Expand Down Expand Up @@ -154,3 +175,21 @@
# https://docs.djangoproject.com/en/4.0/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"


# Timeouts (connection timeout, read timeout) in seconds for requests
REQUESTS_TIMEOUT = (30, 60)

# Celery
CELERY_BROKER_URL = config("CELERY_BROKER_URL", "")
CELERY_RESULT_BACKEND = config("CELERY_RESULT_BACKEND", "")
# TODO: retrieve schedule from env var or module
# Periodic tasks started by Celery beat, keyed by schedule entry name.
CELERY_BEAT_SCHEDULE = {
    "get_articles_en": {
        "task": "articles.tasks.get_articles",
        # A plain number is an interval in seconds.
        "schedule": 60,
        "kwargs": {"language": "en"},
    },
}
28 changes: 14 additions & 14 deletions nous_aggregator/settings/ci.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from .base import *
# from .base import *

SECRET_KEY = "hush-hush"
# SECRET_KEY = "hush-hush"

DEBUG = False
# DEBUG = False

ALLOWED_HOSTS = ['*']
# ALLOWED_HOSTS = ['*']

DATABASES = {
"default": {
"ENGINE": "django.db.backends.postgresql",
"NAME": "nous_aggregator",
"USER": "postgres",
"PASSWORD": "postgres",
"HOST": "localhost",
"PORT": 5432,
},
}
# DATABASES = {
# "default": {
# "ENGINE": "django.db.backends.postgresql",
# "NAME": "nous_aggregator",
# "USER": "postgres",
# "PASSWORD": "postgres",
# "HOST": "localhost",
# "PORT": 5432,
# },
# }
7 changes: 7 additions & 0 deletions nous_aggregator/settings/local.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import socket

from .base import *

SECRET_KEY = "hush-hush"
Expand All @@ -13,8 +15,13 @@

INSTALLED_APPS += [
"debug_toolbar",
"django_extensions",
]

MIDDLEWARE += [
"debug_toolbar.middleware.DebugToolbarMiddleware",
]

# for django_debug_toolbar
hostname, _, ips = socket.gethostbyname_ex(socket.gethostname())
INTERNAL_IPS = [ip[: ip.rfind(".")] + ".1" for ip in ips] + ["127.0.0.1", "10.0.2.2"]
2 changes: 1 addition & 1 deletion nous_aggregator/settings/production.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration

from .staging import *
from .base import *

sentry_sdk.init(
dsn="https://[email protected]/6748377",
Expand Down
Loading

0 comments on commit db61145

Please sign in to comment.