
Replace APScheduler with Redis + Celery
  * use Redis + Celery for running background tasks
  * remove APScheduler and the pyppeteer patch
  * refactor scraper
  * update docker-compose
pi-sigma committed Feb 6, 2024
1 parent b8ced94 commit 6f50e67
Showing 28 changed files with 410 additions and 723 deletions.
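In outline: the scraping job no longer runs inside the web process under APScheduler via `python manage.py scrape`; it is now a Celery task executed by a dedicated worker, with Redis as broker and result backend and a celery-beat service for periodic runs. The beat schedule itself is not visible in the hunks below; a minimal sketch of what such an entry could look like, assuming the task path from articles/tasks.py and an hourly cadence (both the schedule location and the interval are assumptions, not part of this diff):

    # Hypothetical sketch only -- the actual schedule is not in the visible hunks.
    from celery.schedules import crontab

    CELERY_BEAT_SCHEDULE = {
        "get-articles-en": {
            "task": "articles.tasks.get_articles",
            "schedule": crontab(minute=0),   # hourly; real cadence unknown
            "kwargs": {"language": "en"},    # assumed key into the magazines mapping
        },
    }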
13 changes: 6 additions & 7 deletions .github/workflows/django.yml
@@ -1,10 +1,6 @@
 name: Django CI
 
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
+on: [ push, pull_request ]
 
 jobs:
   #
@@ -58,7 +54,9 @@ jobs:
         pip install -r requirements/ci.txt
     - name: Run tests
       env:
-        DJANGO_ENV: "CI"
+        SECRET_KEY: dummy
+        DJANGO_ENV: BASE
+        SECURE_SSL_REDIRECT: False
       run: pytest articles/tests/
   #
   # Migrations
@@ -91,4 +89,5 @@ jobs:
       run: |
         python manage.py makemigrations --check --dry-run
       env:
-        DJANGO_ENV: "CI"
+        SECRET_KEY: dummy
+        DJANGO_ENV: BASE
3 changes: 3 additions & 0 deletions .gitignore
@@ -12,6 +12,9 @@ staticfiles/
 dev/
 logs/*.log*
 
+### celery etc.
+celerybeat-schedule
+
 ### Unit test / coverage reports
 htmlcov/
 .tox/
7 changes: 2 additions & 5 deletions Dockerfile
@@ -13,15 +13,12 @@ RUN apt-get update \
   && pip install psycopg2 \
   # pyppeteer deps (cf. https://stackoverflow.com/a/71935536)
   && xargs apt-get install -y --no-install-recommends < requirements/pyppeteer_deps.txt \
-  && pip install -r requirements/base.txt
+  && pip install -r requirements/production.txt
 
 COPY . /usr/src/app
 WORKDIR /usr/src/app
 
-RUN python manage.py collectstatic --no-input
-
-# patch
-RUN ./patches/pyppeteer_patch.sh
+RUN python manage.py collectstatic --link --no-input
 
 RUN useradd -m myuser
 USER myuser
13 changes: 5 additions & 8 deletions Dockerfile.dev
@@ -1,9 +1,9 @@
-FROM python:3.10-slim
+FROM python:3.11-slim-bookworm
 
 ENV PIP_DISABLE_PIP_VERSION_CHECK 1
 ENV PYTHONDONTWRITEBYTECODE 1
 ENV PYTHONUNBUFFERED 1
-ENV DJANGO_ENV "STAGING"
+ENV DJANGO_ENV "LOCAL"
 
 COPY requirements/* requirements/

@@ -14,15 +14,12 @@ RUN apt-get update \
   && pip install psycopg2 \
   # pyppeteer deps (cf. https://stackoverflow.com/a/71935536)
   && xargs apt-get install -y --no-install-recommends < requirements/pyppeteer_deps.txt \
-  && pip install -r requirements/base.txt
+  && pip install -r requirements/dev.txt
 
 COPY . /usr/src/app
 WORKDIR /usr/src/app
 
-RUN python manage.py collectstatic --link --no-input
-
-# patch
-RUN ./patches/pyppeteer_patch.sh
-
-RUN useradd -m myuser
-USER myuser
+# RUN useradd -m myuser
+# USER myuser
97 changes: 0 additions & 97 deletions articles/management/commands/scrape.py

This file was deleted.

Empty file removed articles/scraper/__init__.py
44 changes: 44 additions & 0 deletions articles/tasks.py
@@ -0,0 +1,44 @@
+import json
+
+from celery import group, shared_task
+from django.utils import timezone
+
+import scraper
+from scraper.tasks import magazines
+
+from .models import Article, Source
+
+
+@shared_task
+def get_articles_for_source(source_title: str):
+    source: Source = Source.objects.get(name=source_title)
+    sitemap = source.to_dict()
+    starting_urls = [
+        sitemap["base_url"] + path for path in sitemap["paths"]
+    ]
+
+    spider = scraper.Spider(starting_urls, sitemap)
+    scraper.run(spider)
+    data = [json.loads(article) for article in spider.articles]
+
+    Article.objects.bulk_create([
+        Article(
+            headline=article_data["headline"],
+            slug=article_data["slug"],
+            source=Source.objects.get(link=article_data["source_link"]),
+            summary=article_data["summary"],
+            language=article_data["language"],
+            link=article_data["link"],
+            created_at=timezone.now(),
+        ) for article_data in data
+    ], ignore_conflicts=True)
+
+
+@shared_task
+def get_articles(language: str):
+    task_group = group(
+        get_articles_for_source.s(source_title=title) for title in magazines[language]["titles"]
+    )
+    promise = task_group.apply_async()
+    if promise.ready():
+        return promise.get()
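A usage sketch for these tasks (task names are real, the invocation is assumed): beat or a shell enqueues `get_articles`, which fans out one `get_articles_for_source` per magazine title via a Celery group.

    # Sketch: enqueue the fan-out task on the Redis broker.
    from articles.tasks import get_articles

    get_articles.delay(language="en")  # "en" is an assumed key of the magazines mapping

As a design note, `promise.ready()` is checked immediately after `apply_async`, so it will rarely be true; `get_articles` normally returns None while the group continues in the background on the worker.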
3 changes: 2 additions & 1 deletion articles/tests/test_parsers.py
@@ -1,8 +1,9 @@
 import json
 
+from scraper.parser import parse
+
 from ..constants import Language
 from ..models import Source
-from ..scraper.parser import parse
 
 
 def test_parse(source_values):
62 changes: 47 additions & 15 deletions docker-compose.yml
@@ -6,38 +6,70 @@ services:
     volumes:
       - postgres_data:/var/lib/postgresql/data:rw
     environment:
-      - POSTGRES_HOST_AUTH_METHOD=trust
+      - POSTGRES_HOST_AUTH_METHOD=${POSTGRES_HOST_AUTH_METHOD}
     ports:
       - ${PG_DOCKER_MAPPING}
 
   web:
-    image: nous_aggregator
-    build:
+    build: &web_build
       context: .
       dockerfile: Dockerfile.dev
+    environment: &web_env
+      - DJANGO_SETTINGS_MODULE=nous_aggregator.settings.base
+      - SECRET_KEY=${SECRET_KEY}
+      # Network
+      - ALLOWED_HOSTS=${ALLOWED_HOSTS}
+      - CSRF_TRUSTED_ORIGINS=${CSRF_TRUSTED_ORIGINS}
+      - SESSION_COOKIE_SECURE=${SESSION_COOKIE_SECURE}
+      - CSRF_COOKIE_SECURE=${CSRF_COOKIE_SECURE}
+      - SECURE_SSL_REDIRECT=${SECURE_SSL_REDIRECT}
+      # Database
+      - DATABASE_ENGINE=${DATABASE_ENGINE}
+      - DATABASE_NAME=${DATABASE_NAME}
+      - DATABASE_USER=${DATABASE_USER}
+      - DATABASE_PASSWORD=${DATABASE_PASSWORD}
+      - DATABASE_HOST=${DATABASE_HOST}
+      - DATABASE_PORT=${DATABASE_PORT}
+      # Redis + Celery
+      - REDIS_PORT=${REDIS_PORT}
+      - CELERY_BROKER_URL=${CELERY_BROKER_URL_DOCKER}
+      - CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND_DOCKER}
+      - CELERY_LOGLEVEL=${CELERY_LOGLEVEL}
+    volumes:
+      - .:/app
     command: >
       sh -c "python manage.py migrate &&
              python manage.py runserver 0.0.0.0:8000"
-    volumes:
-      - .:/usr/src/app
     ports:
       - "8000:8000"
     restart: always
     env_file:
       - ./.env
     depends_on:
       - db
+      - redis
 
-  scheduler:
-    image: nous_aggregator
-    environment:
-      DATABASE_HOST: "${DATABASE_HOST}"
-      DATABASE_PORT: "${DATABASE_PORT}"
-    command: >
-      sh -c "python manage.py scrape"
+  redis:
+    image: redis
+
+  celery-worker:
+    build: *web_build
+    environment: *web_env
+    command: celery -A nous_aggregator worker
+    volumes:
+      - .:/app
+    depends_on:
+      - db
+      - redis
+
+  celery-beat:
+    build: *web_build
+    environment: *web_env
+    command: celery -A nous_aggregator beat
     volumes:
-      - .:/usr/src/app
+      - .:/app
     depends_on:
-      - web
+      - db
+      - redis
 
 volumes:
   postgres_data:
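The YAML anchors carry the deduplication here: `&web_build` and `&web_env` are defined once on the web service, and `*web_build`/`*web_env` give celery-worker and celery-beat the same image and environment. The `CELERY_BROKER_URL_DOCKER` and `CELERY_RESULT_BACKEND_DOCKER` values come from the untracked .env file; inside the compose network they would presumably point at the redis service, e.g. redis://redis:6379/0, though the actual values are not part of this diff.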
3 changes: 3 additions & 0 deletions nous_aggregator/__init__.py
@@ -0,0 +1,3 @@
+from .celery import app as celery_app
+
+__all__ = ("celery_app",)
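Importing the app in the package __init__ is the standard Celery-with-Django pattern: it ensures the Celery app is loaded whenever Django starts, so that @shared_task decorators bind to it.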
11 changes: 11 additions & 0 deletions nous_aggregator/celery.py
@@ -0,0 +1,11 @@
+import os
+
+from celery import Celery
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nous_aggregator.settings.local")
+
+app = Celery("nous_aggregator")
+
+app.config_from_object("django.conf:settings", namespace="CELERY")
+
+app.autodiscover_tasks()
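Because of namespace="CELERY", `config_from_object` picks up any Django setting prefixed with CELERY_ (CELERY_BROKER_URL becomes broker_url, and so on). Those settings are not shown in this diff; presumably they read the variables docker-compose injects, along the lines of this sketch (assuming python-decouple, as used in settings/__init__.py; the defaults are illustrative, not from the diff):

    from decouple import config

    CELERY_BROKER_URL = config("CELERY_BROKER_URL", default="redis://localhost:6379/0")
    CELERY_RESULT_BACKEND = config("CELERY_RESULT_BACKEND", default="redis://localhost:6379/0")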
6 changes: 2 additions & 4 deletions nous_aggregator/settings/__init__.py
@@ -10,11 +10,9 @@
 DJANGO_ENV = config('DJANGO_ENV', default="")
 
 match DJANGO_ENV:
-    case "CI":
-        from .ci import *
+    case "BASE":
+        from .base import *
     case "LOCAL":
         from .local import *
-    case "STAGING":
-        from .staging import *
     case "":
         from .production import *