From 6f50e67056e35eda0e7a520facc688081c7e403a Mon Sep 17 00:00:00 2001 From: Paul Schilling Date: Mon, 29 Jan 2024 21:53:06 +0100 Subject: [PATCH] Replace AppScheduler with Redis + Celery * use Redis + Celery for running background tasks * remove Appscheduler and pyppeteer patch * refactor scraper * update docker-compose --- .github/workflows/django.yml | 13 +- .gitignore | 3 + Dockerfile | 7 +- Dockerfile.dev | 13 +- articles/management/commands/scrape.py | 97 ----- articles/scraper/__init__.py | 0 articles/tasks.py | 44 ++ articles/tests/test_parsers.py | 3 +- docker-compose.yml | 62 ++- nous_aggregator/__init__.py | 3 + nous_aggregator/celery.py | 11 + nous_aggregator/settings/__init__.py | 6 +- nous_aggregator/settings/base.py | 55 ++- nous_aggregator/settings/ci.py | 18 - nous_aggregator/settings/local.py | 7 + nous_aggregator/settings/production.py | 5 +- nous_aggregator/settings/staging.py | 46 -- patches/pyppeteer_patch.py | 404 ------------------ patches/pyppeteer_patch.sh | 14 - requirements/base.in | 16 +- requirements/base.txt | 48 ++- requirements/ci.txt | 85 +++- requirements/dev.txt | 92 +++- scraper/__init__.py | 1 + {articles/scraper => scraper}/headers.py | 0 {articles/scraper => scraper}/parser.py | 0 .../scraper/spider.py => scraper/spiders.py | 61 +-- scraper/tasks.py | 19 + 28 files changed, 410 insertions(+), 723 deletions(-) delete mode 100644 articles/management/commands/scrape.py delete mode 100644 articles/scraper/__init__.py create mode 100644 articles/tasks.py create mode 100644 nous_aggregator/celery.py delete mode 100644 nous_aggregator/settings/ci.py delete mode 100644 nous_aggregator/settings/staging.py delete mode 100644 patches/pyppeteer_patch.py delete mode 100755 patches/pyppeteer_patch.sh create mode 100644 scraper/__init__.py rename {articles/scraper => scraper}/headers.py (100%) rename {articles/scraper => scraper}/parser.py (100%) rename articles/scraper/spider.py => scraper/spiders.py (65%) create mode 100644 scraper/tasks.py diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml index b4d01f2..ba38169 100644 --- a/.github/workflows/django.yml +++ b/.github/workflows/django.yml @@ -1,10 +1,6 @@ name: Django CI -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] +on: [ push, pull_request ] jobs: # @@ -58,7 +54,9 @@ jobs: pip install -r requirements/ci.txt - name: Run tests env: - DJANGO_ENV: "CI" + SECRET_KEY: dummy + DJANGO_ENV: BASE + SECURE_SSL_REDIRECT: False run: pytest articles/tests/ # # Migrations @@ -91,4 +89,5 @@ jobs: run: | python manage.py makemigrations --check --dry-run env: - DJANGO_ENV: "CI" + SECRET_KEY: dummy + DJANGO_ENV: BASE diff --git a/.gitignore b/.gitignore index b5a5d28..99dd356 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ staticfiles/ dev/ logs/*.log* +### celery etc. +celerybeat-schedule + ### Unit test / coverage reports htmlcov/ .tox/ diff --git a/Dockerfile b/Dockerfile index a220223..2cba62f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,15 +13,12 @@ RUN apt-get update \ && pip install psycopg2 \ # pyppeteer deps (cf. https://stackoverflow.com/a/71935536) && xargs apt-get install -y --no-install-recommends < requirements/pyppeteer_deps.txt \ - && pip install -r requirements/base.txt + && pip install -r requirements/production.txt COPY . /usr/src/app WORKDIR /usr/src/app -RUN python manage.py collectstatic --no-input - -# patch -RUN ./patches/pyppeteer_patch.sh +RUN python manage.py collectstatic --link --no-input RUN useradd -m myuser USER myuser diff --git a/Dockerfile.dev b/Dockerfile.dev index c4a0220..a19fa5d 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -1,9 +1,9 @@ -FROM python:3.10-slim +FROM python:3.11-slim-bookworm ENV PIP_DISABLE_PIP_VERSION_CHECK 1 ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 -ENV DJANGO_ENV "STAGING" +ENV DJANGO_ENV "LOCAL" COPY requirements/* requirements/ @@ -14,15 +14,12 @@ RUN apt-get update \ && pip install psycopg2 \ # pyppeteer deps (cf. https://stackoverflow.com/a/71935536) && xargs apt-get install -y --no-install-recommends < requirements/pyppeteer_deps.txt \ - && pip install -r requirements/base.txt + && pip install -r requirements/dev.txt COPY . /usr/src/app WORKDIR /usr/src/app RUN python manage.py collectstatic --link --no-input -# patch -RUN ./patches/pyppeteer_patch.sh - -RUN useradd -m myuser -USER myuser +# RUN useradd -m myuser +# USER myuser diff --git a/articles/management/commands/scrape.py b/articles/management/commands/scrape.py deleted file mode 100644 index f0405e7..0000000 --- a/articles/management/commands/scrape.py +++ /dev/null @@ -1,97 +0,0 @@ -import json -import logging -from datetime import datetime - -from apscheduler.executors.pool import ThreadPoolExecutor -from apscheduler.schedulers.blocking import BlockingScheduler -from apscheduler.triggers.cron import CronTrigger -from django.conf import settings -from django.core.management.base import BaseCommand -from django.db import IntegrityError -from django.utils.timezone import make_aware -from django_apscheduler.jobstores import DjangoJobStore -from django_apscheduler.models import DjangoJob, DjangoJobExecution - -from articles.models import Article, Source -from articles.scraper.spider import Spider - -logger = logging.getLogger(__name__) - -SCRAPING_INTERVAL = 180 # minutes - - -def scrape(sitemap: dict): - Spider.crawl(sitemap) - data = [json.loads(article) for article in Spider.articles] - - for article_data in data: - article = Article( - headline=article_data["headline"], - slug=article_data["slug"], - source=Source.objects.get(link=article_data["source_link"]), - summary=article_data["summary"], - language=article_data["language"], - link=article_data["link"], - created_at=make_aware(datetime.now()), - ) - try: - article.save() - except IntegrityError as e: - logger.info( - "Article (%s) already exists in database (%s)", - article_data["headline"], - e, - ) - - -def delete_old_job_executions(max_age=604_800): - """Deletes all apscheduler job execution logs older than `max_age`.""" - DjangoJobExecution.objects.delete_old_job_executions(max_age) - - -class Command(BaseCommand): - def handle(self, *args, **options): - scheduler = BlockingScheduler( - timezone=settings.TIME_ZONE, - executors={"default": ThreadPoolExecutor(1)}, - ) - scheduler.add_jobstore(DjangoJobStore(), "default") - - sources = Source.objects.all() - for index, source in enumerate(sources): - source_id = f"Scraping {index + 1}: {source.name}" - try: - DjangoJob.objects.get(pk=source_id) - except DjangoJob.DoesNotExist: - scheduler.add_job( - scrape, - args=[source.to_dict()], - trigger="interval", - minutes=SCRAPING_INTERVAL, - misfire_grace_time=600, - id=source_id, - max_instances=1, - replace_existing=True, - ) - logger.info("Added daily job: %s.", source_id) - - # delete old job executions - try: - DjangoJob.objects.get(pk="Delete Old Job Executions") - except DjangoJob.DoesNotExist: - scheduler.add_job( - delete_old_job_executions, - # Monday midnight - trigger=CronTrigger(day_of_week="mon", hour="00", minute="00"), - id="Delete Old Job Executions", - max_instances=1, - replace_existing=True, - ) - logger.info("Added weekly job: delete old executions.") - - try: - scheduler.start() - except (KeyboardInterrupt, SystemExit): - logger.info("Manual shutdown of scheduler...") - scheduler.shutdown() - logger.info("Scheduler shut down successfully.\n") diff --git a/articles/scraper/__init__.py b/articles/scraper/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/articles/tasks.py b/articles/tasks.py new file mode 100644 index 0000000..7662e80 --- /dev/null +++ b/articles/tasks.py @@ -0,0 +1,44 @@ +import json + +from celery import group, shared_task +from django.utils import timezone + +import scraper +from scraper.tasks import magazines + +from .models import Article, Source + + +@shared_task +def get_articles_for_source(source_title: str): + source: Source = Source.objects.get(name=source_title) + sitemap = source.to_dict() + starting_urls = [ + sitemap["base_url"] + path for path in sitemap["paths"] + ] + + spider = scraper.Spider(starting_urls, sitemap) + scraper.run(spider) + data = [json.loads(article) for article in spider.articles] + + Article.objects.bulk_create([ + Article( + headline=article_data["headline"], + slug=article_data["slug"], + source=Source.objects.get(link=article_data["source_link"]), + summary=article_data["summary"], + language=article_data["language"], + link=article_data["link"], + created_at=timezone.now(), + ) for article_data in data + ], ignore_conflicts=True) + + +@shared_task +def get_articles(language: str): + task_group = group( + get_articles_for_source.s(source_title=title) for title in magazines[language]["titles"] + ) + promise = task_group.apply_async() + if promise.ready(): + return promise.get() diff --git a/articles/tests/test_parsers.py b/articles/tests/test_parsers.py index 8945db6..083a8da 100644 --- a/articles/tests/test_parsers.py +++ b/articles/tests/test_parsers.py @@ -1,8 +1,9 @@ import json +from scraper.parser import parse + from ..constants import Language from ..models import Source -from ..scraper.parser import parse def test_parse(source_values): diff --git a/docker-compose.yml b/docker-compose.yml index 18e9716..16e7b23 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,38 +6,70 @@ services: volumes: - postgres_data:/var/lib/postgresql/data:rw environment: - - POSTGRES_HOST_AUTH_METHOD=trust + - POSTGRES_HOST_AUTH_METHOD=${POSTGRES_HOST_AUTH_METHOD} + ports: + - ${PG_DOCKER_MAPPING} web: image: nous_aggregator - build: + build: &web_build context: . dockerfile: Dockerfile.dev + environment: &web_env + - DJANGO_SETTINGS_MODULE=nous_aggregator.settings.base + - SECRET_KEY=${SECRET_KEY} + # Network + - ALLOWED_HOSTS=${ALLOWED_HOSTS} + - CSRF_TRUSTED_ORIGINS=${CSRF_TRUSTED_ORIGINS} + - SESSION_COOKIE_SECURE=${SESSION_COOKIE_SECURE} + - CSRF_COOKIE_SECURE=${CSRF_COOKIE_SECURE} + - SECURE_SSL_REDIRECT=${SECURE_SSL_REDIRECT} + # Database + - DATABASE_ENGINE=${DATABASE_ENGINE} + - DATABASE_NAME=${DATABASE_NAME} + - DATABASE_USER=${DATABASE_USER} + - DATABASE_PASSWORD=${DATABASE_PASSWORD} + - DATABASE_HOST=${DATABASE_HOST} + - DATABASE_PORT=${DATABASE_PORT} + # Redis + Celery + - REDIS_PORT=${REDIS_PORT} + - CELERY_BROKER_URL=${CELERY_BROKER_URL_DOCKER} + - CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND_DOCKER} + - CELERY_LOGLEVEL=${CELERY_LOGLEVEL} + volumes: + - .:/app command: > sh -c "python manage.py migrate && python manage.py runserver 0.0.0.0:8000" - volumes: - - .:/usr/src/app ports: - "8000:8000" restart: always - env_file: - - ./.env depends_on: - db + - redis - scheduler: - image: nous_aggregator - environment: - DATABASE_HOST: "${DATABASE_HOST}" - DATABASE_PORT: "${DATABASE_PORT}" - command: > - sh -c "python manage.py scrape" + redis: + image: redis + + celery-worker: + build: *web_build + environment: *web_env + command: celery -A nous_aggregator worker + volumes: + - .:/app + depends_on: + - db + - redis + + celery-beat: + build: *web_build + environment: *web_env + command: celery -A nous_aggregator beat volumes: - - .:/usr/src/app + - .:/app depends_on: - - web - db + - redis volumes: postgres_data: diff --git a/nous_aggregator/__init__.py b/nous_aggregator/__init__.py index e69de29..53f4ccb 100644 --- a/nous_aggregator/__init__.py +++ b/nous_aggregator/__init__.py @@ -0,0 +1,3 @@ +from .celery import app as celery_app + +__all__ = ("celery_app",) diff --git a/nous_aggregator/celery.py b/nous_aggregator/celery.py new file mode 100644 index 0000000..fd5d514 --- /dev/null +++ b/nous_aggregator/celery.py @@ -0,0 +1,11 @@ +import os + +from celery import Celery + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "nous_aggregator.settings.local") + +app = Celery("nous_aggregator") + +app.config_from_object("django.conf:settings", namespace="CELERY") + +app.autodiscover_tasks() diff --git a/nous_aggregator/settings/__init__.py b/nous_aggregator/settings/__init__.py index c8f6c28..df4bafe 100644 --- a/nous_aggregator/settings/__init__.py +++ b/nous_aggregator/settings/__init__.py @@ -10,11 +10,9 @@ DJANGO_ENV = config('DJANGO_ENV', default="") match DJANGO_ENV: - case "CI": - from .ci import * + case "BASE": + from .base import * case "LOCAL": from .local import * - case "STAGING": - from .staging import * case "": from .production import * diff --git a/nous_aggregator/settings/base.py b/nous_aggregator/settings/base.py index 69164a2..e525b6a 100644 --- a/nous_aggregator/settings/base.py +++ b/nous_aggregator/settings/base.py @@ -1,11 +1,41 @@ import os from pathlib import Path +from decouple import Csv, config + +from scraper import tasks as scraper_tasks # Build paths inside the project like this: BASE_DIR / 'subdir'. # (modified because settings files are nested one level deeper) BASE_DIR = Path(__file__).resolve().parent.parent.parent +SECRET_KEY = config("SECRET_KEY", default="") + +DEBUG = config("DEBUG", default=False, cast=bool) + +ALLOWED_HOSTS = config("ALLOWED_HOSTS", default="", cast=Csv()) + +CSRF_TRUSTED_ORIGINS = config("CSRF_TRUSTED_ORIGINS", default="", cast=Csv()) + +SESSION_COOKIE_SECURE = config("SESSION_COOKIE_SECURE", default=True, cast=bool) + +CSRF_COOKIE_SECURE = config("CSRF_COOKIE_SECURE", default=True, cast=bool) + +SECURE_SSL_REDIRECT = config("SECURE_SSL_REDIRECT", default=True, cast=bool) + +SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') + +DATABASES = { + "default": { + "ENGINE": config("DATABASE_ENGINE", default="django.db.backends.postgresql"), + "NAME": config("DATABASE_NAME", default="postgres"), + "USER": config("DATABASE_USER", default="postgres"), + "PASSWORD": config("DATABASE_PASSWORD", default="postgres"), + "HOST": config("DATABASE_HOST", default="localhost"), + "PORT": config("DATABASE_PORT", default=5432, cast=int), + }, +} + # Application definition @@ -17,10 +47,8 @@ "django.contrib.messages", "django.contrib.staticfiles", "django.contrib.humanize", - # My Apps + # nous_aggregator apps "articles.apps.ArticlesConfig", - # Third Party Apps - "django_apscheduler", ] MIDDLEWARE = [ @@ -39,11 +67,6 @@ "127.0.0.1", ] -# Timeouts (connection timeout, read timeout) in seconds for requests -# made with the requests library -REQUESTS_TIMEOUT = (6, 18) -REQUESTS_TIMEOUT_JS = (6, 60) - TEMPLATES = [ { "BACKEND": "django.template.backends.django.DjangoTemplates", @@ -154,3 +177,19 @@ # https://docs.djangoproject.com/en/4.0/ref/settings/#default-auto-field DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + +# Timeouts (connection timeout, read timeout) in seconds for requests +REQUESTS_TIMEOUT = (30, 60) + +# Celery +CELERY_BROKER_URL = config("CELERY_BROKER_URL", "redis://localhost:6379") +CELERY_RESULT_BACKEND = config("CELERY_RESULT_BACKEND", "redis://localhost:6379") +CELERY_BEAT_SCHEDULE = { + "get_articles_en": { + "task": "articles.tasks.get_articles", + "schedule": scraper_tasks.magazines["en"]["schedule"], + "kwargs": { + "language": "en", + } + } +} diff --git a/nous_aggregator/settings/ci.py b/nous_aggregator/settings/ci.py deleted file mode 100644 index e0f5834..0000000 --- a/nous_aggregator/settings/ci.py +++ /dev/null @@ -1,18 +0,0 @@ -from .base import * - -SECRET_KEY = "hush-hush" - -DEBUG = False - -ALLOWED_HOSTS = ['*'] - -DATABASES = { - "default": { - "ENGINE": "django.db.backends.postgresql", - "NAME": "nous_aggregator", - "USER": "postgres", - "PASSWORD": "postgres", - "HOST": "localhost", - "PORT": 5432, - }, -} diff --git a/nous_aggregator/settings/local.py b/nous_aggregator/settings/local.py index a2078f5..c3a5c96 100644 --- a/nous_aggregator/settings/local.py +++ b/nous_aggregator/settings/local.py @@ -1,3 +1,5 @@ +import socket + from .base import * SECRET_KEY = "hush-hush" @@ -13,8 +15,13 @@ INSTALLED_APPS += [ "debug_toolbar", + "django_extensions", ] MIDDLEWARE += [ "debug_toolbar.middleware.DebugToolbarMiddleware", ] + +# for django_debug_toolbar +hostname, _, ips = socket.gethostbyname_ex(socket.gethostname()) +INTERNAL_IPS = [ip[: ip.rfind(".")] + ".1" for ip in ips] + ["127.0.0.1", "10.0.2.2"] diff --git a/nous_aggregator/settings/production.py b/nous_aggregator/settings/production.py index c3e4f9d..e013ae6 100644 --- a/nous_aggregator/settings/production.py +++ b/nous_aggregator/settings/production.py @@ -1,8 +1,7 @@ -import django_on_heroku import sentry_sdk from sentry_sdk.integrations.django import DjangoIntegration -from .staging import * +from .base import * sentry_sdk.init( dsn="https://242fe72f1a234cecae5a3b1fad7bb4c0@o1410776.ingest.sentry.io/6748377", @@ -21,5 +20,3 @@ environment="production", ) - -django_on_heroku.settings(locals()) diff --git a/nous_aggregator/settings/staging.py b/nous_aggregator/settings/staging.py deleted file mode 100644 index dfde42d..0000000 --- a/nous_aggregator/settings/staging.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -The settings emulate the production environment - -Defaults are chosen with security in mind: DEBUG is False by default, SECRET_KEY -has an empty default in order to make the app crash if it's not set and DEBUG is off etc. -""" - -from .base import * - -import socket - -from decouple import config, Csv - - -SECRET_KEY = config("SECRET_KEY", default="") - -DEBUG = config("DEBUG", default=False, cast=bool) - -ALLOWED_HOSTS = config("ALLOWED_HOSTS", default="", cast=Csv()) - -CSRF_TRUSTED_ORIGINS = config("CSRF_TRUSTED_ORIGINS", default="", cast=Csv()) - -SESSION_COOKIE_SECURE = config("SESSION_COOKIE_SECURE", default=True, cast=bool) - -CSRF_COOKIE_SECURE = config("CSRF_COOKIE_SECURE", default=True, cast=bool) - -SECURE_SSL_REDIRECT = config("SECURE_SSL_REDIRECT", default=True, cast=bool) - -SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') - -DATABASES = { - "default": { - "ENGINE": config("DATABASE_ENGINE", default="django.db.backends.postgresql"), - "NAME": config("DATABASE_NAME", default="postgres"), - "USER": config("DATABASE_USER", default="postgres"), - "PASSWORD": config("DATABASE_PASSWORD", default="postgres"), - "HOST": config("DATABASE_HOST", default="localhost"), - # "HOST": "host.docker.internal", - # "HOST": "localhost", - "PORT": config("DATABASE_PORT", default=5432, cast=int), - }, -} - -# for django_debug_toolbar -hostname, _, ips = socket.gethostbyname_ex(socket.gethostname()) -INTERNAL_IPS = [ip[: ip.rfind(".")] + ".1" for ip in ips] + ["127.0.0.1", "10.0.2.2"] diff --git a/patches/pyppeteer_patch.py b/patches/pyppeteer_patch.py deleted file mode 100644 index e1414d5..0000000 --- a/patches/pyppeteer_patch.py +++ /dev/null @@ -1,404 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Chromium process launcher module.""" -import asyncio -import atexit -import json -import logging -import os.path -import shutil -import signal -import subprocess -import sys -import tempfile -import time -from copy import copy -from http.client import HTTPException -from pathlib import Path -from typing import Any -from typing import Dict -from typing import List -from typing import TYPE_CHECKING -from urllib.error import URLError -from urllib.request import urlopen - -from pyppeteer import __pyppeteer_home__ -from pyppeteer.browser import Browser -from pyppeteer.chromium_downloader import current_platform -from pyppeteer.connection import Connection -from pyppeteer.errors import BrowserError -from pyppeteer.helper import addEventListener -from pyppeteer.helper import debugError -from pyppeteer.helper import removeEventListeners -from pyppeteer.target import Target -from pyppeteer.util import check_chromium -from pyppeteer.util import chromium_executable -from pyppeteer.util import download_chromium -from pyppeteer.util import get_free_port -from pyppeteer.util import merge_dict - -if TYPE_CHECKING: - from typing import Optional # noqa: F401 - -logger = logging.getLogger(__name__) - -pyppeteer_home = Path(__pyppeteer_home__) -CHROME_PROFILE_PATH = pyppeteer_home / '.dev_profile' - -DEFAULT_ARGS = [ - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-breakpad', - '--disable-browser-side-navigation', - '--disable-client-side-phishing-detection', - '--disable-default-apps', - '--disable-dev-shm-usage', - '--disable-extensions', - '--disable-features=site-per-process', - '--disable-hang-monitor', - '--disable-popup-blocking', - '--disable-prompt-on-repost', - '--disable-sync', - '--disable-translate', - '--metrics-recording-only', - '--no-first-run', - '--safebrowsing-disable-auto-update', - '--enable-automation', - '--password-store=basic', - '--use-mock-keychain', -] - - -class Launcher(object): - """Chrome process launcher class.""" - - def __init__(self, options: Dict[str, Any] = None, # noqa: C901 - **kwargs: Any) -> None: - """Make new launcher.""" - options = merge_dict(options, kwargs) - - self.port = get_free_port() - self.url = f'http://127.0.0.1:{self.port}' - self._loop = options.get('loop', asyncio.get_event_loop()) - self.chromeClosed = True - - ignoreDefaultArgs = options.get('ignoreDefaultArgs', False) - args: List[str] = options.get('args', list()) - self.dumpio = options.get('dumpio', False) - executablePath = options.get('executablePath') - self.env = options.get('env') - self.handleSIGINT = options.get('handleSIGINT', True) - self.handleSIGTERM = options.get('handleSIGTERM', True) - self.handleSIGHUP = options.get('handleSIGHUP', True) - self.ignoreHTTPSErrors = options.get('ignoreHTTPSErrors', False) - self.defaultViewport = options.get('defaultViewport', {'width': 800, 'height': 600}) # noqa: E501 - self.slowMo = options.get('slowMo', 0) - self.timeout = options.get('timeout', 30000) - self.autoClose = options.get('autoClose', True) - - logLevel = options.get('logLevel') - if logLevel: - logging.getLogger('pyppeteer').setLevel(logLevel) - - self.chromeArguments: List[str] = list() - if not ignoreDefaultArgs: - self.chromeArguments.extend(defaultArgs(options)) - elif isinstance(ignoreDefaultArgs, list): - self.chromeArguments.extend(filter(lambda arg: arg not in ignoreDefaultArgs, defaultArgs(options), )) - else: - self.chromeArguments.extend(args) - - self.temporaryUserDataDir: Optional[str] = None - - if not any(arg for arg in self.chromeArguments if arg.startswith('--remote-debugging-')): - self.chromeArguments.append(f'--remote-debugging-port={self.port}') - - if not any(arg for arg in self.chromeArguments if arg.startswith('--user-data-dir')): - if not CHROME_PROFILE_PATH.exists(): - CHROME_PROFILE_PATH.mkdir(parents=True) - self.temporaryUserDataDir = tempfile.mkdtemp(dir=str(CHROME_PROFILE_PATH)) # noqa: E501 - self.chromeArguments.append(f'--user-data-dir={self.temporaryUserDataDir}') # noqa: E501 - - self.chromeExecutable = executablePath - if not self.chromeExecutable: - if not check_chromium(): - download_chromium() - self.chromeExecutable = str(chromium_executable()) - - self.cmd = [self.chromeExecutable] + self.chromeArguments - - def _cleanup_tmp_user_data_dir(self) -> None: - for retry in range(100): - if self.temporaryUserDataDir and os.path.exists(self.temporaryUserDataDir): - shutil.rmtree(self.temporaryUserDataDir, ignore_errors=True) - if os.path.exists(self.temporaryUserDataDir): - time.sleep(0.01) - else: - break - else: - raise IOError('Unable to remove Temporary User Data') - - async def launch(self) -> Browser: # noqa: C901 - """Start chrome process and return `Browser` object.""" - self.chromeClosed = False - self.connection: Optional[Connection] = None - - options = dict() - options['env'] = self.env - if not self.dumpio: - # discard stdout, it's never read in any case. - options['stdout'] = subprocess.DEVNULL - options['stderr'] = subprocess.STDOUT - - self.proc = subprocess.Popen( # type: ignore - self.cmd, **options, ) - - def _close_process(*args: Any, **kwargs: Any) -> None: - if not self.chromeClosed: - self._loop.run_until_complete(self.killChrome()) - - # don't forget to close browser process - if self.autoClose: - atexit.register(_close_process) - if self.handleSIGINT: - signal.signal(signal.SIGINT, _close_process) - if self.handleSIGTERM: - signal.signal(signal.SIGTERM, _close_process) - if not sys.platform.startswith('win'): - # SIGHUP is not defined on windows - if self.handleSIGHUP: - signal.signal(signal.SIGHUP, _close_process) - - connectionDelay = self.slowMo - self.browserWSEndpoint = get_ws_endpoint(self.url) - logger.info(f'Browser listening on: {self.browserWSEndpoint}') - self.connection = Connection(self.browserWSEndpoint, self._loop, connectionDelay, ) - browser = await Browser.create(self.connection, [], self.ignoreHTTPSErrors, self.defaultViewport, self.proc, - self.killChrome) - await self.ensureInitialPage(browser) - return browser - - async def ensureInitialPage(self, browser: Browser) -> None: - """Wait for initial page target to be created.""" - for target in browser.targets(): - if target.type == 'page': - return - - initialPagePromise = self._loop.create_future() - - def initialPageCallback() -> None: - initialPagePromise.set_result(True) - - def check_target(target: Target) -> None: - if target.type == 'page': - initialPageCallback() - - listeners = [addEventListener(browser, 'targetcreated', check_target)] - await initialPagePromise - removeEventListeners(listeners) - - def waitForChromeToClose(self) -> None: - """Terminate chrome.""" - if self.proc.poll() is None and not self.chromeClosed: - self.chromeClosed = True - try: - self.proc.terminate() - self.proc.wait() - except Exception: - # browser process may be already closed - pass - - async def killChrome(self) -> None: - """Terminate chromium process.""" - logger.info('terminate chrome process...') - if self.connection and self.connection._connected: - try: - await self.connection.send('Browser.close') - await self.connection.dispose() - except Exception as e: - # ignore errors on browser termination process - debugError(logger, e) - if self.temporaryUserDataDir and os.path.exists(self.temporaryUserDataDir): # noqa: E501 - # Force kill chrome only when using temporary userDataDir - self.waitForChromeToClose() - self._cleanup_tmp_user_data_dir() - - -def get_ws_endpoint(url) -> str: - url = url + '/json/version' - timeout = time.time() + 30 - while (True): - if time.time() > timeout: - raise BrowserError('Browser closed unexpectedly:\n') - try: - with urlopen(url) as f: - data = json.loads(f.read().decode()) - break - except (URLError, HTTPException): - pass - time.sleep(0.1) - - return data['webSocketDebuggerUrl'] - - -async def launch(options: dict = None, **kwargs: Any) -> Browser: - """Start chrome process and return :class:`~pyppeteer.browser.Browser`. - This function is a shortcut to :meth:`Launcher(options, **kwargs).launch`. - Available options are: - * ``ignoreHTTPSErrors`` (bool): Whether to ignore HTTPS errors. Defaults to - ``False``. - * ``headless`` (bool): Whether to run browser in headless mode. Defaults to - ``True`` unless ``appMode`` or ``devtools`` options is ``True``. - * ``executablePath`` (str): Path to a Chromium or Chrome executable to run - instead of default bundled Chromium. - * ``slowMo`` (int|float): Slow down pyppeteer operations by the specified - amount of milliseconds. - * ``defaultViewport`` (dict): Set a consistent viewport for each page. - Defaults to an 800x600 viewport. ``None`` disables default viewport. - * ``width`` (int): page width in pixels. - * ``height`` (int): page height in pixels. - * ``deviceScaleFactor`` (int|float): Specify device scale factor (can be - thought as dpr). Defaults to ``1``. - * ``isMobile`` (bool): Whether the ``meta viewport`` tag is taken into - account. Defaults to ``False``. - * ``hasTouch`` (bool): Specify if viewport supports touch events. - Defaults to ``False``. - * ``isLandscape`` (bool): Specify if viewport is in landscape mode. - Defaults to ``False``. - * ``args`` (List[str]): Additional arguments (flags) to pass to the browser - process. - * ``ignoreDefaultArgs`` (bool or List[str]): If ``True``, do not use - :func:`~pyppeteer.defaultArgs`. If list is given, then filter out given - default arguments. Dangerous option; use with care. Defaults to - ``False``. - * ``handleSIGINT`` (bool): Close the browser process on Ctrl+C. Defaults to - ``True``. - * ``handleSIGTERM`` (bool): Close the browser process on SIGTERM. Defaults - to ``True``. - * ``handleSIGHUP`` (bool): Close the browser process on SIGHUP. Defaults to - ``True``. - * ``dumpio`` (bool): Whether to pipe the browser process stdout and stderr - into ``process.stdout`` and ``process.stderr``. Defaults to ``False``. - * ``userDataDir`` (str): Path to a user data directory. - * ``env`` (dict): Specify environment variables that will be visible to the - browser. Defaults to same as python process. - * ``devtools`` (bool): Whether to auto-open a DevTools panel for each tab. - If this option is ``True``, the ``headless`` option will be set - ``False``. - * ``logLevel`` (int|str): Log level to print logs. Defaults to same as the - root logger. - * ``autoClose`` (bool): Automatically close browser process when script - completed. Defaults to ``True``. - * ``loop`` (asyncio.AbstractEventLoop): Event loop (**experimental**). - * ``appMode`` (bool): Deprecated. - This function combines 3 steps: - 1. Infer a set of flags to launch chromium with using - :func:`~pyppeteer.defaultArgs`. - 2. Launch browser and start managing its process according to the - ``executablePath``, ``handleSIGINT``, ``dumpio``, and other options. - 3. Create an instance of :class:`~pyppeteer.browser.Browser` class and - initialize it with ``defaultViewport``, ``slowMo``, and - ``ignoreHTTPSErrors``. - ``ignoreDefaultArgs`` option can be used to customize behavior on the (1) - step. For example, to filter out ``--mute-audio`` from default arguments: - .. code:: - browser = await launch(ignoreDefaultArgs=['--mute-audio']) - .. note:: - Pyppeteer can also be used to control the Chrome browser, but it works - best with the version of Chromium it is bundled with. There is no - guarantee it will work with any other version. Use ``executablePath`` - option with extreme caution. - """ - options = {"handleSIGINT": False, "handleSIGTERM": False, "handleSIGHUP": False, "args": ["--no-sandbox"]} - return await Launcher(options, **kwargs).launch() - - -async def connect(options: dict = None, **kwargs: Any) -> Browser: - """Connect to the existing chrome. - ``browserWSEndpoint`` or ``browserURL`` option is necessary to connect to - the chrome. The format of ``browserWSEndpoint`` is - ``ws://${host}:${port}/devtools/browser/`` and format of ``browserURL`` - is ``http://127.0.0.1:9222```. - The value of ``browserWSEndpoint`` can get by :attr:`~pyppeteer.browser.Browser.wsEndpoint`. - Available options are: - * ``browserWSEndpoint`` (str): A browser websocket endpoint to connect to. - * ``browserURL`` (str): A browser URL to connect to. - * ``ignoreHTTPSErrors`` (bool): Whether to ignore HTTPS errors. Defaults to - ``False``. - * ``defaultViewport`` (dict): Set a consistent viewport for each page. - Defaults to an 800x600 viewport. ``None`` disables default viewport. - * ``width`` (int): page width in pixels. - * ``height`` (int): page height in pixels. - * ``deviceScaleFactor`` (int|float): Specify device scale factor (can be - thought as dpr). Defaults to ``1``. - * ``isMobile`` (bool): Whether the ``meta viewport`` tag is taken into - account. Defaults to ``False``. - * ``hasTouch`` (bool): Specify if viewport supports touch events. - Defaults to ``False``. - * ``isLandscape`` (bool): Specify if viewport is in landscape mode. - Defaults to ``False``. - * ``slowMo`` (int|float): Slow down pyppeteer's by the specified amount of - milliseconds. - * ``logLevel`` (int|str): Log level to print logs. Defaults to same as the - root logger. - * ``loop`` (asyncio.AbstractEventLoop): Event loop (**experimental**). - """ - options = merge_dict(options, kwargs) - logLevel = options.get('logLevel') - if logLevel: - logging.getLogger('pyppeteer').setLevel(logLevel) - - browserWSEndpoint = options.get('browserWSEndpoint') - if not browserWSEndpoint: - browserURL = options.get('browserURL') - if not browserURL: - raise BrowserError('Need `browserWSEndpoint` or `browserURL` option.') - browserWSEndpoint = get_ws_endpoint(browserURL) - connectionDelay = options.get('slowMo', 0) - connection = Connection(browserWSEndpoint, options.get('loop', asyncio.get_event_loop()), connectionDelay) - browserContextIds = (await connection.send('Target.getBrowserContexts')).get('browserContextIds', []) - ignoreHTTPSErrors = bool(options.get('ignoreHTTPSErrors', False)) - defaultViewport = options.get('defaultViewport', {'width': 800, 'height': 600}) - return await Browser.create(connection, browserContextIds, ignoreHTTPSErrors, defaultViewport, None, - lambda: connection.send('Browser.close')) - - -def executablePath() -> str: - """Get executable path of default chromium.""" - return str(chromium_executable()) - - -def defaultArgs(options: Dict = None, **kwargs: Any) -> List[str]: # noqa: C901,E501 - """Get the default flags the chromium will be launched with. - ``options`` or keyword arguments are set of configurable options to set on - the browser. Can have the following fields: - * ``headless`` (bool): Whether to run browser in headless mode. Defaults to - ``True`` unless the ``devtools`` option is ``True``. - * ``args`` (List[str]): Additional arguments to pass to the browser - instance. The list of chromium flags can be found - `here `__. - * ``userDataDir`` (str): Path to a User Data Directory. - * ``devtools`` (bool): Whether to auto-open DevTools panel for each tab. If - this option is ``True``, the ``headless`` option will be set ``False``. - """ - options = merge_dict(options, kwargs) - devtools = options.get('devtools', False) - headless = options.get('headless', not devtools) - args = options.get('args', list()) - userDataDir = options.get('userDataDir') - chromeArguments = copy(DEFAULT_ARGS) - - if userDataDir: - chromeArguments.append(f'--user-data-dir={userDataDir}') - if devtools: - chromeArguments.append('--auto-open-devtools-for-tabs') - if headless: - chromeArguments.extend(('--headless', '--hide-scrollbars', '--mute-audio',)) - if current_platform().startswith('win'): - chromeArguments.append('--disable-gpu') - - if all(map(lambda arg: arg.startswith('-'), args)): # type: ignore - chromeArguments.append('about:blank') - chromeArguments.extend(args) - - return chromeArguments diff --git a/patches/pyppeteer_patch.sh b/patches/pyppeteer_patch.sh deleted file mode 100755 index 6ac05d0..0000000 --- a/patches/pyppeteer_patch.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# The patch disables pyppeteer launcher's signal handling (line 312 in -# pyppeteer_patch.py) since it causes an error when jobs are run in -# a thread created by Apscheduler. In order to patch the launcher -# file in a virtual environment, the location of the file will be: -# -# /venv/lib/python3.10/site-packages/pyppeteer/launcher.py -# -# Background and discussion: -# https://docs.python.org/3/library/signal.html#signals-and-threads -# https://bugs.python.org/issue38904 (comment by Eric Snow) - -cp patches/pyppeteer_patch.py /usr/local/lib/python3.10/site-packages/pyppeteer/launcher.py diff --git a/requirements/base.in b/requirements/base.in index 53597e0..36c6922 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,10 +1,3 @@ -gunicorn - -# Django -django -django-apscheduler -django-on-heroku - # Python beautifulsoup4 langdetect @@ -16,3 +9,12 @@ psycopg2 pyppeteer sentry-sdk websockets + +# Django +django +django-on-heroku + +# Other +gunicorn +redis +celery diff --git a/requirements/base.txt b/requirements/base.txt index 77993f9..1042fdf 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,18 +4,22 @@ # # ./bin/compile_dependencies.sh # +amqp==5.2.0 + # via kombu appdirs==1.4.4 # via pyppeteer -apscheduler==3.10.4 - # via django-apscheduler asgiref==3.7.2 # via django beautifulsoup4==4.12.2 # via # -r requirements/base.in # bs4 +billiard==4.2.0 + # via celery bs4==0.0.1 # via requests-html +celery==5.3.6 + # via -r requirements/base.in certifi==2023.7.22 # via # pyppeteer @@ -23,6 +27,18 @@ certifi==2023.7.22 # sentry-sdk charset-normalizer==3.2.0 # via requests +click==8.1.7 + # via + # celery + # click-didyoumean + # click-plugins + # click-repl +click-didyoumean==0.3.0 + # via celery +click-plugins==1.1.1 + # via celery +click-repl==0.3.0 + # via celery cssselect==1.2.0 # via pyquery dj-database-url==2.1.0 @@ -31,10 +47,7 @@ django==5.0.1 # via # -r requirements/base.in # dj-database-url - # django-apscheduler # django-on-heroku -django-apscheduler==0.6.2 - # via -r requirements/base.in django-on-heroku==1.1.2 # via -r requirements/base.in fake-useragent==1.2.1 @@ -45,6 +58,8 @@ idna==3.4 # via requests importlib-metadata==6.8.0 # via pyppeteer +kombu==5.3.5 + # via celery langdetect==1.0.9 # via -r requirements/base.in lxml==4.9.3 @@ -53,7 +68,9 @@ packaging==23.2 # via gunicorn parse==1.19.1 # via requests-html -psycopg2==2.9.7 +prompt-toolkit==3.0.43 + # via click-repl +psycopg2==2.9.9 # via -r requirements/base.in psycopg2-binary==2.9.7 # via django-on-heroku @@ -65,10 +82,12 @@ pyppeteer==1.0.2 # requests-html pyquery==2.0.0 # via requests-html +python-dateutil==2.8.2 + # via celery python-decouple==3.8 # via -r requirements/base.in -pytz==2023.3.post1 - # via apscheduler +redis==5.0.1 + # via -r requirements/base.in regex==2023.8.8 # via -r requirements/base.in requests==2.31.0 @@ -81,8 +100,8 @@ sentry-sdk==1.39.2 # via -r requirements/base.in six==1.16.0 # via - # apscheduler # langdetect + # python-dateutil soupsieve==2.5 # via beautifulsoup4 sqlparse==0.4.4 @@ -91,15 +110,22 @@ tqdm==4.66.1 # via pyppeteer typing-extensions==4.7.1 # via dj-database-url -tzlocal==5.0.1 - # via apscheduler +tzdata==2023.4 + # via celery urllib3==1.26.18 # via # pyppeteer # requests # sentry-sdk +vine==5.1.0 + # via + # amqp + # celery + # kombu w3lib==2.1.2 # via requests-html +wcwidth==0.2.13 + # via prompt-toolkit websockets==10.4 # via # -r requirements/base.in diff --git a/requirements/ci.txt b/requirements/ci.txt index f461485..eafbd45 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -4,16 +4,16 @@ # # ./bin/compile_dependencies.sh # -appdirs==1.4.4 +amqp==5.2.0 # via # -c requirements/base.txt # -r requirements/base.txt - # pyppeteer -apscheduler==3.10.4 + # kombu +appdirs==1.4.4 # via # -c requirements/base.txt # -r requirements/base.txt - # django-apscheduler + # pyppeteer asgiref==3.7.2 # via # -c requirements/base.txt @@ -26,11 +26,20 @@ beautifulsoup4==4.12.2 # -c requirements/base.txt # -r requirements/base.txt # bs4 +billiard==4.2.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery bs4==0.0.1 # via # -c requirements/base.txt # -r requirements/base.txt # requests-html +celery==5.3.6 + # via + # -c requirements/base.txt + # -r requirements/base.txt certifi==2023.7.22 # via # -c requirements/base.txt @@ -43,6 +52,29 @@ charset-normalizer==3.2.0 # -c requirements/base.txt # -r requirements/base.txt # requests +click==8.1.7 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery + # click-didyoumean + # click-plugins + # click-repl +click-didyoumean==0.3.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery +click-plugins==1.1.1 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery +click-repl==0.3.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery cssselect==1.2.0 # via # -c requirements/base.txt @@ -58,12 +90,7 @@ django==5.0.1 # -c requirements/base.txt # -r requirements/base.txt # dj-database-url - # django-apscheduler # django-on-heroku -django-apscheduler==0.6.2 - # via - # -c requirements/base.txt - # -r requirements/base.txt django-on-heroku==1.1.2 # via # -c requirements/base.txt @@ -93,6 +120,11 @@ importlib-metadata==6.8.0 # pyppeteer iniconfig==2.0.0 # via pytest +kombu==5.3.5 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # celery langdetect==1.0.9 # via # -c requirements/base.txt @@ -121,7 +153,12 @@ pbr==5.11.1 # via stevedore pluggy==1.3.0 # via pytest -psycopg2==2.9.7 +prompt-toolkit==3.0.43 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # click-repl +psycopg2==2.9.9 # via # -c requirements/base.txt # -r requirements/base.txt @@ -154,17 +191,21 @@ pytest==7.4.2 # pytest-django pytest-django==4.5.2 # via -r requirements/ci.in -python-decouple==3.8 +python-dateutil==2.8.2 # via # -c requirements/base.txt # -r requirements/base.txt -pytz==2023.3.post1 + # celery +python-decouple==3.8 # via # -c requirements/base.txt # -r requirements/base.txt - # apscheduler pyyaml==6.0.1 # via bandit +redis==5.0.1 + # via + # -c requirements/base.txt + # -r requirements/base.txt regex==2023.8.8 # via # -c requirements/base.txt @@ -188,8 +229,8 @@ six==1.16.0 # via # -c requirements/base.txt # -r requirements/base.txt - # apscheduler # langdetect + # python-dateutil smmap==5.0.1 # via gitdb soupsieve==2.5 @@ -214,11 +255,11 @@ typing-extensions==4.7.1 # -c requirements/base.txt # -r requirements/base.txt # dj-database-url -tzlocal==5.0.1 +tzdata==2023.4 # via # -c requirements/base.txt # -r requirements/base.txt - # apscheduler + # celery urllib3==1.26.18 # via # -c requirements/base.txt @@ -226,11 +267,23 @@ urllib3==1.26.18 # pyppeteer # requests # sentry-sdk +vine==5.1.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # amqp + # celery + # kombu w3lib==2.1.2 # via # -c requirements/base.txt # -r requirements/base.txt # requests-html +wcwidth==0.2.13 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # prompt-toolkit websockets==10.4 # via # -c requirements/base.txt diff --git a/requirements/dev.txt b/requirements/dev.txt index b7be5c5..6f08f0f 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,22 +4,22 @@ # # ./bin/compile_dependencies.sh # -appdirs==1.4.4 +amqp==5.2.0 # via # -c requirements/ci.txt # -r requirements/ci.txt - # pyppeteer -apscheduler==3.10.4 + # kombu +appdirs==1.4.4 # via # -c requirements/ci.txt # -r requirements/ci.txt - # django-apscheduler + # pyppeteer asgiref==3.7.2 # via # -c requirements/ci.txt # -r requirements/ci.txt # django -attrs==23.1.0 +attrs==23.2.0 # via pytype bandit==1.7.5 # via @@ -31,6 +31,11 @@ beautifulsoup4==4.12.2 # -c requirements/ci.txt # -r requirements/ci.txt # bs4 +billiard==4.2.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # celery black==23.9.1 # via -r requirements/dev.in bs4==0.0.1 @@ -38,6 +43,10 @@ bs4==0.0.1 # -c requirements/ci.txt # -r requirements/ci.txt # requests-html +celery==5.3.6 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt certifi==2023.7.22 # via # -c requirements/ci.txt @@ -51,7 +60,29 @@ charset-normalizer==3.2.0 # -r requirements/ci.txt # requests click==8.1.7 - # via black + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # black + # celery + # click-didyoumean + # click-plugins + # click-repl +click-didyoumean==0.3.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # celery +click-plugins==1.1.1 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # celery +click-repl==0.3.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # celery cssselect==1.2.0 # via # -c requirements/ci.txt @@ -67,16 +98,11 @@ django==5.0.1 # -c requirements/ci.txt # -r requirements/ci.txt # dj-database-url - # django-apscheduler # django-debug-toolbar # django-extensions # django-on-heroku # django-stubs # django-stubs-ext -django-apscheduler==0.6.2 - # via - # -c requirements/ci.txt - # -r requirements/ci.txt django-debug-toolbar==4.2.0 # via -r requirements/dev.in django-extensions==3.2.3 @@ -131,6 +157,11 @@ isort==5.12.0 # via -r requirements/dev.in jinja2==3.1.3 # via pytype +kombu==5.3.5 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # celery langdetect==1.0.9 # via # -c requirements/ci.txt @@ -197,7 +228,12 @@ pluggy==1.3.0 # -c requirements/ci.txt # -r requirements/ci.txt # pytest -psycopg2==2.9.7 +prompt-toolkit==3.0.43 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # click-repl +psycopg2==2.9.9 # via # -c requirements/ci.txt # -r requirements/ci.txt @@ -245,23 +281,27 @@ pytest-django==4.5.2 # via # -c requirements/ci.txt # -r requirements/ci.txt -python-decouple==3.8 +python-dateutil==2.8.2 # via # -c requirements/ci.txt # -r requirements/ci.txt -pytype==2023.10.17 - # via -r requirements/dev.in -pytz==2023.3.post1 + # celery +python-decouple==3.8 # via # -c requirements/ci.txt # -r requirements/ci.txt - # apscheduler +pytype==2023.10.17 + # via -r requirements/dev.in pyyaml==6.0.1 # via # -c requirements/ci.txt # -r requirements/ci.txt # bandit # libcst +redis==5.0.1 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt regex==2023.8.8 # via # -c requirements/ci.txt @@ -288,8 +328,8 @@ six==1.16.0 # via # -c requirements/ci.txt # -r requirements/ci.txt - # apscheduler # langdetect + # python-dateutil smmap==5.0.1 # via # -c requirements/ci.txt @@ -341,11 +381,11 @@ typing-extensions==4.7.1 # typing-inspect typing-inspect==0.9.0 # via libcst -tzlocal==5.0.1 +tzdata==2023.4 # via # -c requirements/ci.txt # -r requirements/ci.txt - # apscheduler + # celery urllib3==1.26.18 # via # -c requirements/ci.txt @@ -353,11 +393,23 @@ urllib3==1.26.18 # pyppeteer # requests # sentry-sdk +vine==5.1.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # amqp + # celery + # kombu w3lib==2.1.2 # via # -c requirements/ci.txt # -r requirements/ci.txt # requests-html +wcwidth==0.2.13 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # prompt-toolkit websockets==10.4 # via # -c requirements/ci.txt diff --git a/scraper/__init__.py b/scraper/__init__.py new file mode 100644 index 0000000..38f837a --- /dev/null +++ b/scraper/__init__.py @@ -0,0 +1 @@ +from .spiders import Spider, run diff --git a/articles/scraper/headers.py b/scraper/headers.py similarity index 100% rename from articles/scraper/headers.py rename to scraper/headers.py diff --git a/articles/scraper/parser.py b/scraper/parser.py similarity index 100% rename from articles/scraper/parser.py rename to scraper/parser.py diff --git a/articles/scraper/spider.py b/scraper/spiders.py similarity index 65% rename from articles/scraper/spider.py rename to scraper/spiders.py index 25ec698..975d010 100644 --- a/articles/scraper/spider.py +++ b/scraper/spiders.py @@ -1,7 +1,7 @@ import asyncio import logging import random -from typing import Optional +from http.cookiejar import DefaultCookiePolicy import pyppeteer import requests @@ -18,33 +18,26 @@ class Spider: """ Class Attributes: headers (list): a collection of HTTP headers - articles (set): a collection of JSON strings representing article - metadata Instance Attributes: sitemap (dict): contains information about a particular page starting_urls (list): the urls where each Spider object searches for links links (set): urls of pages targeted for scraping - - Public Methods: - crawl(sitemap): the main method of the Spider class; called from the - scheduler which supplies the argument `sitemap`; creates a spider - object for `sitemap` and runs the event loop. + articles (set): a collection of JSON strings representing article + metadata """ headers = headers.headers - articles: set[str] = set() - def __init__(self, sitemap: dict): + def __init__(self, starting_urls: list, sitemap: dict): self.sitemap = sitemap - self.starting_urls = [ - self.sitemap["base_url"] + path for path in self.sitemap["paths"] - ] + self.starting_urls = starting_urls self.links: set[str] = set() + self.articles: set[str] = set() @staticmethod - async def connect(asession: AsyncHTMLSession, url: str) -> Optional[HTMLResponse]: + async def connect(asession: AsyncHTMLSession, url: str) -> HTMLResponse | None: """GET request wrapper""" try: response = await asession.get( @@ -58,16 +51,13 @@ async def connect(asession: AsyncHTMLSession, url: str) -> Optional[HTMLResponse return response async def get_links(self, asession: AsyncHTMLSession, url: str): - """ - Get all article links at `url` and filter them - """ response = await Spider.connect(asession, url) if not response: return if self.sitemap["javascript_required"]: try: - await response.html.arender(timeout=settings.REQUESTS_TIMEOUT_JS) + await response.html.arender(timeout=settings.REQUESTS_TIMEOUT) except pyppeteer.errors.TimeoutError as e: logger.error("Could not render JavaScript for %s (%s)", url, e) for link in response.html.absolute_links: @@ -75,9 +65,6 @@ async def get_links(self, asession: AsyncHTMLSession, url: str): self.links.add(link) async def scrape(self, asession: AsyncHTMLSession, url: str): - """ - Scrape the page at `url` and store data - """ response = await Spider.connect(asession, url) if not response: return @@ -85,7 +72,7 @@ async def scrape(self, asession: AsyncHTMLSession, url: str): html = response.text article = parser.parse(html, self.sitemap, url) if article: - Spider.articles.add(article) + self.articles.add(article) async def collect_links(self, asession: AsyncHTMLSession): """ @@ -101,19 +88,17 @@ async def collect_metadata(self, asession: AsyncHTMLSession): coros = [self.scrape(asession, link) for link in self.links] await asyncio.gather(*coros) - @staticmethod - def crawl(sitemap: dict): - """ - Create spider instance and run the event loop - """ - spider = Spider(sitemap) - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - asession = AsyncHTMLSession() - # reject cookies - # asession.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[])) - try: - loop.run_until_complete(spider.collect_links(asession)) - loop.run_until_complete(spider.collect_metadata(asession)) - except ConnectionClosedError: - logger.warning("Connection closed.") + +def run(spider: Spider): + """ + Run `spider` with async HTML session inside event loop + """ + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + asession = AsyncHTMLSession() + asession.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[])) + try: + loop.run_until_complete(spider.collect_links(asession)) + loop.run_until_complete(spider.collect_metadata(asession)) + except ConnectionClosedError as ex: + logger.warning("Connection closed", exc_info=ex) diff --git a/scraper/tasks.py b/scraper/tasks.py new file mode 100644 index 0000000..e56b34c --- /dev/null +++ b/scraper/tasks.py @@ -0,0 +1,19 @@ +magazines = { + "en": { + "schedule": 3600 * 4, # 4h + "titles": [ + "Al Jazeera", + "Associated Press", + "Christian Science Monitor", + "Consortium News", + "Current Affairs", + "New York Times", + "NPR", + "Reuters", + "The Atlantic", + "The Intercept", + "UPI", + "Wall Street Journal", + ] + } +}