diff --git a/.gitignore b/.gitignore
index 6fb8c4b..03b2039 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 /.venv
 /dist
 /build
+/config.yaml
diff --git a/Dockerfile b/Dockerfile
index 0bc6448..bd34b0d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,12 +19,14 @@ RUN poetry build --format wheel
 
 FROM python:3.12-slim
 
-VOLUME /app
+WORKDIR /app
 
-COPY --from=compiler /app/dist/*.whl /
+COPY --from=compiler /app/dist/*.whl .
 
 RUN pip3 install --no-cache-dir -- *.whl
 
+RUN rm *.whl
+
 RUN playwright install --with-deps firefox
 
 ENV SB__BROWSER__TYPE="firefox"
diff --git a/README.md b/README.md
index c3f7c4c..f344eb9 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![GitHub](https://img.shields.io/github/license/RobertoBochet/scraper-bot?style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
 [![GitHub Version](https://img.shields.io/github/v/tag/RobertoBochet/scraper-bot?label=version&style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
-[![PyPI - Version](https://img.shields.io/pypi/v/scraper-bot)](https://pypi.org/project/scraper-bot/)
+[![PyPI - Version](https://img.shields.io/pypi/v/scraper-bot?style=flat-square)](https://pypi.org/project/scraper-bot/)
 [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/RobertoBochet/scraper-bot/test-code.yml?label=test%20code&style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
 [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/RobertoBochet/scraper-bot/release.yml?label=publish%20release&style=flat-square)](https://github.com/RobertoBochet/scraper-bot/pkgs/container/scraper-bot)
 [![CodeFactor Grade](https://img.shields.io/codefactor/grade/github/RobertoBochet/scraper-bot?style=flat-square)](https://www.codefactor.io/repository/github/robertobochet/scraper-bot)
diff --git a/scraper_bot/__main__.py b/scraper_bot/__main__.py
index 221410c..63a479d 100644
--- a/scraper_bot/__main__.py
+++ b/scraper_bot/__main__.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 import asyncio
 import json
-import logging.config
 from argparse import ArgumentParser
 from asyncio import CancelledError, create_task
+from logging import DEBUG, getLogger
 from signal import SIGINT
 
 from pydantic import ValidationError
@@ -14,14 +14,11 @@
 
 
 def main() -> int:
-    # loads logger config
-    setup_default_logger()
-
-    LOGGER = logging.getLogger(__package__)
-
     # gets inline arguments
     parser = ArgumentParser(prog="bot_scraper")
 
+    parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Increase logging verbosity")
+
     parser.add_argument(
         "-c",
         "--config",
@@ -53,6 +50,11 @@ def main() -> int:
     # parses args
     args = vars(parser.parse_args())
 
+    # loads logger config
+    setup_default_logger(DEBUG if args.get("verbose", None) else None)
+
+    logger = getLogger(__package__)
+
     cli_override_settings = {}
 
     if args.get("show_config_schema"):
@@ -61,7 +63,7 @@ def main() -> int:
 
     if config_path := args.get("config_path"):
         Settings.set_settings_path(config_path)
-        LOGGER.info(f"Using config file '{config_path}'")
+        logger.info(f"Using config file '{config_path}'")
 
     if args.get("daemonize"):
         cli_override_settings["daemonize"] = True
@@ -69,20 +71,20 @@ def main() -> int:
     try:
         settings = Settings(**cli_override_settings)
     except ValidationError as e:
-        LOGGER.critical(f"Configuration issue: {e}")
+        logger.critical(f"Configuration issue: {e}")
         return 1
 
     # creates an instance of ScraperBot
     bot = ScraperBot(settings)
 
-    LOGGER.info("bot_scraper is ready to start")
+    logger.info("bot_scraper is ready to start")
 
     if not settings.daemonize:
         asyncio.run(bot.run_once())
         return 0
 
     async def daemonize():
-        LOGGER.info("Starting daemon")
+        logger.info("Starting daemon")
 
         task = create_task(bot.run())
         task.get_loop().add_signal_handler(SIGINT, task.cancel)
@@ -90,7 +92,7 @@ async def daemonize():
         try:
             await task
         except CancelledError:
-            LOGGER.info("Daemon has been stopped")
+            logger.info("Scraper bot has been stopped")
 
     # starts bot as daemon
     asyncio.run(daemonize())
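
Note on the __main__.py change above: logger setup now happens after argument
parsing so the new -v flag can raise verbosity. The definition of
setup_default_logger is not part of this diff; below is a minimal sketch of a
compatible signature, assuming the helper simply accepts an optional level
(the body is illustrative, not the project's actual implementation):

    from logging import INFO, StreamHandler, getLogger

    def setup_default_logger(level: int | None = None) -> None:
        # Assumed behavior: fall back to INFO when no explicit level is given
        # (i.e. the -v flag was not passed and None reaches this function).
        logger = getLogger("scraper_bot")
        logger.setLevel(INFO if level is None else level)
        logger.addHandler(StreamHandler())
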
start") if not settings.daemonize: asyncio.run(bot.run_once()) return 0 async def daemonize(): - LOGGER.info("Starting daemon") + logger.info("Starting daemon") task = create_task(bot.run()) task.get_loop().add_signal_handler(SIGINT, task.cancel) @@ -90,7 +92,7 @@ async def daemonize(): try: await task except CancelledError: - LOGGER.info("Daemon has been stopped") + logger.info("Scraper bot has been stopped") # starts bot as daemon asyncio.run(daemonize()) diff --git a/scraper_bot/logging/logging.py b/scraper_bot/logging/logging.py index be53c12..d98bf44 100644 --- a/scraper_bot/logging/logging.py +++ b/scraper_bot/logging/logging.py @@ -15,7 +15,7 @@ class CustomFormatter(Formatter): def get_format(self, level: int) -> str: match level: case logging.DEBUG: - return colored(f"{self._format} (%(filename)s:%(lineno)d)", "grey") + return colored(f"{self._format} (%(filename)s:%(lineno)d)", "dark_grey") case logging.INFO: return colored(self._format, "white") case logging.WARNING: diff --git a/scraper_bot/scraper/browser_manager.py b/scraper_bot/scraper/browser_manager.py index f9c1ab6..0e4355f 100644 --- a/scraper_bot/scraper/browser_manager.py +++ b/scraper_bot/scraper/browser_manager.py @@ -1,5 +1,6 @@ from contextlib import asynccontextmanager from logging import getLogger +from typing import AsyncIterator from playwright.async_api import Browser, Error, async_playwright @@ -13,12 +14,14 @@ def __init__(self, settings: BrowserSettings): self._settings = settings @asynccontextmanager - async def launch_browser(self) -> Browser: + async def launch_browser(self) -> AsyncIterator[Browser]: async with async_playwright() as pw: browser_types = [ next((b for b in [pw.firefox, pw.chromium, pw.webkit] if b.name == i)) for i in self._settings.type ] + _LOGGER.info(browser_types) + for browser_type in browser_types: try: browser = await browser_type.launch(headless=self._settings.headless) @@ -33,6 +36,7 @@ async def launch_browser(self) -> Browser: yield browser finally: await browser.close() + _LOGGER.debug("Close browser") break diff --git a/scraper_bot/scraper/exceptions.py b/scraper_bot/scraper/exceptions.py index f76c0f5..4c9c3ec 100644 --- a/scraper_bot/scraper/exceptions.py +++ b/scraper_bot/scraper/exceptions.py @@ -1,10 +1,6 @@ -class ScraperError(Exception): +class ScraperTaskError(Exception): pass -class NoTargetFound(ScraperError): - pass - - -class RequestError(ScraperError): +class TargetScriptError(ScraperTaskError): pass diff --git a/scraper_bot/scraper/scraper.py b/scraper_bot/scraper/scraper.py index 6628a55..6190394 100644 --- a/scraper_bot/scraper/scraper.py +++ b/scraper_bot/scraper/scraper.py @@ -5,6 +5,7 @@ from scraper_bot.settings.task import TaskSettings from .browser_manager import BrowserManager +from .exceptions import ScraperTaskError from .scraper_task import ScraperTask from .scraper_task_result import ScraperTaskResult @@ -27,5 +28,13 @@ def add_task(self, *tasks: TaskSettings) -> list[ScraperTask]: def tasks(self) -> list[ScraperTask]: return self._tasks + @staticmethod + async def _run_task(task: ScraperTask) -> ScraperTaskResult | None: + try: + return await task.run() + except ScraperTaskError: + _LOGGER.error(f"Task {task.name} failed") + return None + async def run(self) -> tuple[ScraperTaskResult, ...]: - return await gather(*(t.run() for t in self._tasks)) + return (r for r in (await gather(*(self._run_task(t) for t in self._tasks))) if r is not None) diff --git a/scraper_bot/scraper/scraper_task.py b/scraper_bot/scraper/scraper_task.py index 
diff --git a/scraper_bot/scraper/scraper_task.py b/scraper_bot/scraper/scraper_task.py
index c7cad55..a89835d 100644
--- a/scraper_bot/scraper/scraper_task.py
+++ b/scraper_bot/scraper/scraper_task.py
@@ -1,10 +1,12 @@
 from logging import Logger, getLogger
 
+from playwright.async_api import Error, Page
 from playwright_stealth import stealth_async
 
 from scraper_bot.settings.task import TaskSettings
 
 from .browser_manager import BrowserManager
+from .exceptions import TargetScriptError
 from .scraper_task_result import ScraperTaskResult
 
 
@@ -29,7 +31,7 @@ async def run(self) -> ScraperTaskResult:
         self._logger.info("Starting scraper task")
 
         async with self._browser_manager.launch_browser() as browser:
-            page = await browser.new_page()
+            page: Page = await browser.new_page()
 
             if self._browser_manager.stealth_enabled:
                 await stealth_async(page)
@@ -38,10 +40,21 @@ async def run(self) -> ScraperTaskResult:
 
             # TODO add support for waitingForTarget
 
-            data: str | list[str] | dict | list[dict] = await page.evaluate(self.settings.target)
+            self._logger.info("Starting target script evaluation")
+            try:
+                data: str | list[str] | dict | list[dict] = await page.evaluate(self.settings.target)
+            except Error as e:
+                self._logger.error("Target script error")
+                self._logger.debug(e)
+                raise TargetScriptError()
+
+            self._logger.info("Target script evaluated")
+            self._logger.debug(data)
 
             # TODO add support for nextPageTarget
 
+            self._logger.info("Completed scraping")
+
             if not isinstance(data, list):
                 data = list(data)
 
diff --git a/scraper_bot/scraper/scraper_task_result_entity.py b/scraper_bot/scraper/scraper_task_result_entity.py
index 5c4a8b2..f58ed6e 100644
--- a/scraper_bot/scraper/scraper_task_result_entity.py
+++ b/scraper_bot/scraper/scraper_task_result_entity.py
@@ -28,7 +28,8 @@ def __len__(self):
         return len(self._data) + 1
 
     def __str__(self) -> str:
-        return f"{self._task.name}#{"|".join(["=".join(v) for v in sorted(self._data.items(), key=lambda x: x[0])])}"
+        fields = "|".join([f"{k}={v}" for k, v in sorted(self._data.items(), key=lambda x: x[0]) if v is not None])
+        return f"{self._task.name}#{fields}"
 
     def __hash__(self) -> int:
         return int(sha256(str(self).encode()).hexdigest(), 16)
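
The scraper_task_result_entity.py change makes entity identity stable: fields
are sorted by key and None values are skipped before hashing, so the same
scraped item deduplicates regardless of field order. A standalone sketch
mirroring the new __str__/__hash__ logic (entity_key is a hypothetical helper,
not part of the codebase):

    from hashlib import sha256

    def entity_key(task_name: str, data: dict) -> str:
        # Mirrors ScraperTaskResultEntity.__str__: sorted keys, None dropped
        fields = "|".join(f"{k}={v}" for k, v in sorted(data.items()) if v is not None)
        return f"{task_name}#{fields}"

    # Field order and None-valued fields do not affect identity
    a = entity_key("deals", {"price": "10", "title": "x", "note": None})
    b = entity_key("deals", {"title": "x", "price": "10"})
    assert a == b
    assert int(sha256(a.encode()).hexdigest(), 16) == int(sha256(b.encode()).hexdigest(), 16)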