Fix scraper bug in JS evaluation
Squashed commit of the following:

commit 3350d7e
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:40:39 2024 +0200

    Update gitignore

commit 0b4b1ec
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:40:14 2024 +0200

    Add verbosity argument

commit 1513647
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:29:58 2024 +0200

    Improve log readability

commit bc3d862
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:29:36 2024 +0200

    Fix bug and improve logging

commit 11d31ba
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:21:28 2024 +0200

    Minor change in readme

commit a125031
Author: Roberto Bochet <[email protected]>
Date:   Fri Jun 28 19:21:06 2024 +0200

    Minor improvement in Docker build
RobertoBochet committed Jun 28, 2024
1 parent 5d76a5e commit f16ab1f
Showing 10 changed files with 54 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
 /.venv
 /dist
 /build
+/config.yaml
6 changes: 4 additions & 2 deletions Dockerfile
@@ -19,12 +19,14 @@ RUN poetry build --format wheel
 
 FROM python:3.12-slim
 
-VOLUME /app
+WORKDIR /app
 
-COPY --from=compiler /app/dist/*.whl /
+COPY --from=compiler /app/dist/*.whl .
 
 RUN pip3 install --no-cache-dir -- *.whl
 
+RUN rm *.whl
+
 RUN playwright install --with-deps firefox
 
 ENV SB__BROWSER__TYPE="firefox"
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@
 
 [![GitHub](https://img.shields.io/github/license/RobertoBochet/scraper-bot?style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
 [![GitHub Version](https://img.shields.io/github/v/tag/RobertoBochet/scraper-bot?label=version&style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
-[![PyPI - Version](https://img.shields.io/pypi/v/scraper-bot)](https://pypi.org/project/scraper-bot/)
+[![PyPI - Version](https://img.shields.io/pypi/v/scraper-bot?style=flat-square)](https://pypi.org/project/scraper-bot/)
 [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/RobertoBochet/scraper-bot/test-code.yml?label=test%20code&style=flat-square)](https://github.com/RobertoBochet/scraper-bot)
 [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/RobertoBochet/scraper-bot/release.yml?label=publish%20release&style=flat-square)](https://github.com/RobertoBochet/scraper-bot/pkgs/container/scraper-bot)
 [![CodeFactor Grade](https://img.shields.io/codefactor/grade/github/RobertoBochet/scraper-bot?style=flat-square)](https://www.codefactor.io/repository/github/robertobochet/scraper-bot)
24 changes: 13 additions & 11 deletions scraper_bot/__main__.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 import asyncio
 import json
-import logging.config
 from argparse import ArgumentParser
 from asyncio import CancelledError, create_task
+from logging import DEBUG, getLogger
 from signal import SIGINT
 
 from pydantic import ValidationError
@@ -14,14 +14,11 @@
 
 
 def main() -> int:
-    # loads logger config
-    setup_default_logger()
-
-    LOGGER = logging.getLogger(__package__)
-
     # gets inline arguments
     parser = ArgumentParser(prog="bot_scraper")
 
+    parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Increase logging verbosity")
+
     parser.add_argument(
         "-c",
         "--config",
@@ -53,6 +50,11 @@ def main() -> int:
     # parses args
     args = vars(parser.parse_args())
 
+    # loads logger config
+    setup_default_logger(DEBUG if args.get("verbose", None) else None)
+
+    logger = getLogger(__package__)
+
     cli_override_settings = {}
 
     if args.get("show_config_schema"):
@@ -61,36 +63,36 @@
 
     if config_path := args.get("config_path"):
         Settings.set_settings_path(config_path)
-        LOGGER.info(f"Using config file '{config_path}'")
+        logger.info(f"Using config file '{config_path}'")
 
     if args.get("daemonize"):
         cli_override_settings["daemonize"] = True
 
     try:
         settings = Settings(**cli_override_settings)
     except ValidationError as e:
-        LOGGER.critical(f"Configuration issue: {e}")
+        logger.critical(f"Configuration issue: {e}")
         return 1
 
     # creates an instance of ScraperBot
     bot = ScraperBot(settings)
 
-    LOGGER.info("bot_scraper is ready to start")
+    logger.info("bot_scraper is ready to start")
 
     if not settings.daemonize:
         asyncio.run(bot.run_once())
         return 0
 
     async def daemonize():
-        LOGGER.info("Starting daemon")
+        logger.info("Starting daemon")
         task = create_task(bot.run())
 
         task.get_loop().add_signal_handler(SIGINT, task.cancel)
 
         try:
             await task
         except CancelledError:
-            LOGGER.info("Daemon has been stopped")
+            logger.info("Scraper bot has been stopped")
 
     # starts bot as daemon
     asyncio.run(daemonize())
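A note on the new verbosity wiring: `setup_default_logger` itself is not part of this diff, so its signature here is inferred from the call site. A minimal sketch of a compatible helper (the name is real, the body is an assumption):

```python
import logging


def setup_default_logger(level: int | None = None) -> None:
    # Hypothetical sketch: the -v/--verbose flag passes DEBUG,
    # otherwise None falls back to a default INFO level.
    logging.basicConfig(
        level=level if level is not None else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
```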
2 changes: 1 addition & 1 deletion scraper_bot/logging/logging.py
@@ -15,7 +15,7 @@ class CustomFormatter(Formatter):
     def get_format(self, level: int) -> str:
         match level:
             case logging.DEBUG:
-                return colored(f"{self._format} (%(filename)s:%(lineno)d)", "grey")
+                return colored(f"{self._format} (%(filename)s:%(lineno)d)", "dark_grey")
             case logging.INFO:
                 return colored(self._format, "white")
             case logging.WARNING:
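Context for the color change, assuming the formatter uses termcolor (the `colored` call suggests it): termcolor maps "grey" to ANSI code 30, the same code as black, so debug lines could be invisible on dark terminals, while "dark_grey" uses the bright-black code 90:

```python
from termcolor import colored

# "grey" shares ANSI code 30 with black; "dark_grey" uses 90,
# which most terminals render as a readable dim grey.
print(colored("debug line", "grey"))       # black-on-black on dark themes
print(colored("debug line", "dark_grey"))  # visible dim grey
```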
6 changes: 5 additions & 1 deletion scraper_bot/scraper/browser_manager.py
@@ -1,5 +1,6 @@
 from contextlib import asynccontextmanager
 from logging import getLogger
+from typing import AsyncIterator
 
 from playwright.async_api import Browser, Error, async_playwright
 
@@ -13,12 +14,14 @@ def __init__(self, settings: BrowserSettings):
         self._settings = settings
 
     @asynccontextmanager
-    async def launch_browser(self) -> Browser:
+    async def launch_browser(self) -> AsyncIterator[Browser]:
         async with async_playwright() as pw:
             browser_types = [
                 next((b for b in [pw.firefox, pw.chromium, pw.webkit] if b.name == i)) for i in self._settings.type
             ]
 
+            _LOGGER.info(browser_types)
+
             for browser_type in browser_types:
                 try:
                     browser = await browser_type.launch(headless=self._settings.headless)
@@ -33,6 +36,7 @@ async def launch_browser(self) -> Browser:
                     yield browser
                 finally:
                     await browser.close()
+                    _LOGGER.debug("Close browser")
 
                 break
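The new annotation follows how `@asynccontextmanager` is typed: the decorated function is an async generator that yields the browser, so it is declared as returning `AsyncIterator[Browser]` even though callers receive a `Browser` via `async with`. A self-contained illustration of the pattern:

```python
from contextlib import asynccontextmanager
from typing import AsyncIterator


class Resource:
    async def close(self) -> None:
        print("closed")


@asynccontextmanager
async def open_resource() -> AsyncIterator[Resource]:
    # An async generator: annotate with AsyncIterator[Resource],
    # not Resource, because the function yields rather than returns.
    resource = Resource()
    try:
        yield resource
    finally:
        await resource.close()
```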
8 changes: 2 additions & 6 deletions scraper_bot/scraper/exceptions.py
@@ -1,10 +1,6 @@
-class ScraperError(Exception):
+class ScraperTaskError(Exception):
     pass
 
 
-class NoTargetFound(ScraperError):
-    pass
-
-
-class RequestError(ScraperError):
+class TargetScriptError(ScraperTaskError):
     pass
11 changes: 10 additions & 1 deletion scraper_bot/scraper/scraper.py
@@ -5,6 +5,7 @@
 from scraper_bot.settings.task import TaskSettings
 
 from .browser_manager import BrowserManager
+from .exceptions import ScraperTaskError
 from .scraper_task import ScraperTask
 from .scraper_task_result import ScraperTaskResult
 
@@ -27,5 +28,13 @@ def add_task(self, *tasks: TaskSettings) -> list[ScraperTask]:
     def tasks(self) -> list[ScraperTask]:
         return self._tasks
 
+    @staticmethod
+    async def _run_task(task: ScraperTask) -> ScraperTaskResult | None:
+        try:
+            return await task.run()
+        except ScraperTaskError:
+            _LOGGER.error(f"Task {task.name} failed")
+            return None
+
     async def run(self) -> tuple[ScraperTaskResult, ...]:
-        return await gather(*(t.run() for t in self._tasks))
+        return tuple(r for r in (await gather(*(self._run_task(t) for t in self._tasks))) if r is not None)
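The new `_run_task` wrapper keeps one failing task from taking down the whole `gather`: errors are logged and mapped to `None`, then filtered out of the final tuple. A standalone sketch of the same pattern (names here are illustrative, not the project's):

```python
import asyncio


async def flaky(i: int) -> int:
    if i == 2:
        raise RuntimeError("boom")
    return i * 10


async def shielded(i: int) -> int | None:
    # Map a failure to None instead of letting it propagate
    # out of gather() and discard the sibling results.
    try:
        return await flaky(i)
    except RuntimeError:
        return None


async def main() -> None:
    results = await asyncio.gather(*(shielded(i) for i in range(4)))
    print(tuple(r for r in results if r is not None))  # (0, 10, 30)


asyncio.run(main())
```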
17 changes: 15 additions & 2 deletions scraper_bot/scraper/scraper_task.py
@@ -1,10 +1,12 @@
 from logging import Logger, getLogger
 
 from playwright.async_api import Error, Page
 from playwright_stealth import stealth_async
 
 from scraper_bot.settings.task import TaskSettings
 
 from .browser_manager import BrowserManager
+from .exceptions import TargetScriptError
 from .scraper_task_result import ScraperTaskResult
 
@@ -29,7 +31,7 @@ async def run(self) -> ScraperTaskResult:
         self._logger.info("Starting scraper task")
 
         async with self._browser_manager.launch_browser() as browser:
-            page = await browser.new_page()
+            page: Page = await browser.new_page()
 
             if self._browser_manager.stealth_enabled:
                 await stealth_async(page)
 
@@ -38,10 +40,21 @@
 
             # TODO add support for waitingForTarget
 
-            data: str | list[str] | dict | list[dict] = await page.evaluate(self.settings.target)
+            self._logger.info("Starting target script evaluation")
+            try:
+                data: str | list[str] | dict | list[dict] = await page.evaluate(self.settings.target)
+            except Error as e:
+                self._logger.error("Target script error")
+                self._logger.debug(e)
+                raise TargetScriptError()
+
+            self._logger.info("Target script evaluated")
+            self._logger.debug(data)
 
             # TODO add support for nextPageTarget
 
+            self._logger.info("Completed scraping")
+
             if not isinstance(data, list):
                 data = [data]
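Since `page.evaluate` returns whatever JSON-serializable value the target script produces, `data` can arrive as a single value or a list; the trailing check normalizes it. A small illustration of that shape handling (sample values are made up):

```python
def normalize(data: str | list[str] | dict | list[dict]) -> list:
    # Wrap a single scraped value; pass a list through unchanged.
    return data if isinstance(data, list) else [data]


print(normalize("only-hit"))                        # ['only-hit']
print(normalize([{"title": "a"}, {"title": "b"}]))  # unchanged
```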
3 changes: 2 additions & 1 deletion scraper_bot/scraper/scraper_task_result_entity.py
@@ -28,7 +28,8 @@ def __len__(self):
         return len(self._data) + 1
 
     def __str__(self) -> str:
-        return f"{self._task.name}#{"|".join(["=".join(v) for v in sorted(self._data.items(), key=lambda x: x[0])])}"
+        fields = "|".join([f"{k}={v}" for k, v in sorted(self._data.items(), key=lambda x: x[0]) if v is not None])
+        return f"{self._task.name}#{fields}"
 
     def __hash__(self) -> int:
         return int(sha256(str(self).encode()).hexdigest(), 16)
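The entity's hash is derived from a canonical string: fields sorted by key with `None` values dropped, so logically equal results dedupe to the same identity. A standalone sketch of the idea:

```python
from hashlib import sha256


def entity_key(task_name: str, data: dict) -> int:
    # Sorting by key and skipping None values keeps the identity
    # stable regardless of field order or absent optional fields.
    fields = "|".join(f"{k}={v}" for k, v in sorted(data.items()) if v is not None)
    return int(sha256(f"{task_name}#{fields}".encode()).hexdigest(), 16)


assert entity_key("books", {"title": "Dune", "note": None}) == entity_key("books", {"title": "Dune"})
```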
