From 8051d40e7bbb4719ccac1ac5764b9f4045cfe3d6 Mon Sep 17 00:00:00 2001 From: gitstart-nimhdsst Date: Fri, 21 Jun 2024 15:18:57 +0000 Subject: [PATCH] fix: update tox envlist, add ruff for linting and formatting, set up pre-commit, and clean up code (docker, server) before XML conversion --- .github/workflows/unit-tests.yml | 41 +++++++++++++++++++++++++++ .pre-commit-config.yaml | 6 ++++ .ruff.toml | 3 ++ README.md | 4 ++- commands/converters/pdf_converter.py | 42 ++++++++++++++++++++++++++-- commands/file_processing.py | 10 ++++++- commands/utils/config.py | 10 +++++++ requirements.txt | 7 +++-- tests/test_file_processing.py | 41 +++++++++++++-------------- tox.ini | 22 +++++---------- 10 files changed, 143 insertions(+), 43 deletions(-) create mode 100644 .github/workflows/unit-tests.yml create mode 100644 .pre-commit-config.yaml create mode 100644 .ruff.toml create mode 100644 commands/utils/config.py diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 00000000..864ce7d1 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,41 @@ +name: Unit Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Start ScienceBeam Docker container + run: | + docker run -d --rm -p 8082:8070 elifesciences/sciencebeam-parser + + - name: Run tests + run: | + tox \ No newline at end of file diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml new file mode 100644 index 00000000..a4e07126 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.0.257 + hooks: + - id: ruff + args: ["--fix"] diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 00000000..26edd0c2 --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,3 @@ +#.ruff.toml +line-length = 88 +select = ["E", "F"] diff --git a/README.md b/README.md index 23a9173a..ffcfa641 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to Windows: `venv\Scripts\activate`
macOS and Linux: `source venv/bin/activate` -- Next, run `pip install -r requirements.txt` to install all the dependencies. +- Next, run `pip install -r requirements.txt && pre-commit install` to install all the dependencies. +- Open a new terminal tab, run the image with `docker run --rm -p 8082:8070 elifesciences/sciencebeam-parser`, and keep it running for the rest of the testing period. +- Go back to the terminal (root folder). - Finally, run `python -m osm.cli pdf-xml "path_to_file_name.pdf" file_id` # How to run tests of the application diff --git a/commands/converters/pdf_converter.py b/commands/converters/pdf_converter.py index 690716a9..f2f3ad71 100644 --- a/commands/converters/pdf_converter.py +++ b/commands/converters/pdf_converter.py @@ -1,13 +1,20 @@ +import socket + +import docker +from docker.errors import DockerException from pydantic import FilePath import requests from commands.converters.converter import Converter +from commands.utils.config import config class PDFConverter(Converter): - sciencebeam_url: str = 'http://localhost:8080/api/convert' + protocol: str = config.PROTOCAL + host: str = config.HOST + port: int = config.PORT - def convert(self, pdf_path: FilePath): + def convert(self, pdf_path: FilePath) -> str: """Convert a PDF file to XML using ScienceBeam Parser. Args: @@ -15,12 +22,41 @@ def convert(self, pdf_path: FilePath): Returns: XML content as a string """ + sciencebeam_url: str = f'{self.protocol}://{self.host}:{self.port}/api/convert' with open(pdf_path, 'rb') as pdf_file: files = {'file': pdf_file} headers = {'Accept': 'application/tei+xml'} - response = requests.post(self.sciencebeam_url, files=files, headers=headers) + response = requests.post( + sciencebeam_url, files=files, headers=headers) if response.status_code == 200: return response.text else: response.raise_for_status() + + def is_host_ready(self, timeout=3) -> bool: + """Check if the host is ready to accept requests.
+ Args: + timeout: Timeout in seconds + Returns: + True if the docker host is ready, False otherwise + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) + try: + sock.connect((self.host, self.port)) + except (socket.timeout, socket.error): + return False + return True + + def is_docker_running(self): + """Check if the docker image exists. + Returns: + True if the docker image exists, False otherwise + """ + try: + client = docker.from_env() + client.images.get('elifesciences/sciencebeam-parser') + return True + except DockerException: + return False diff --git a/commands/file_processing.py b/commands/file_processing.py index 22d1e501..f37d0866 100644 --- a/commands/file_processing.py +++ b/commands/file_processing.py @@ -21,9 +21,17 @@ def pdf_xml(file_path, file_id): """ try: converter = PDFConverter() + + if not converter.is_docker_running(): + raise click.ClickException('Please make sure the docker is running') + + if not converter.is_host_ready(): + raise click.ClickException('The converter server is offline') + xml_content = converter.convert(file_path) # Save the converted xml contents - with open(f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml', 'w', encoding='utf-8') as xml_file: + output_file: str = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml' + with open(output_file, 'w', encoding='utf-8') as xml_file: xml_file.write(xml_content) logger.info(f'Converted: {file_path} with ID: {file_id} to XML') diff --git a/commands/utils/config.py b/commands/utils/config.py new file mode 100644 index 00000000..ac7ace90 --- /dev/null +++ b/commands/utils/config.py @@ -0,0 +1,10 @@ +from pydantic.v1 import BaseSettings + + +class AppConfig(BaseSettings): + PORT:int = 8082 + HOST: str = 'localhost' + PROTOCAL: str = 'http' + + +config = AppConfig() diff --git a/requirements.txt b/requirements.txt index f60402f8..a5ecb53b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ click==8.1.7
-tox==4.15.0 +docker==7.1.0 +pre_commit==3.5.0 pydantic==2.7.3 pytest==8.2.1 -rich==13.7.1 requests==2.32.3 +rich==13.7.1 +ruff==0.4.9 +tox==4.15.0 diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py index b43cf112..30185079 100644 --- a/tests/test_file_processing.py +++ b/tests/test_file_processing.py @@ -1,33 +1,32 @@ -import unittest +import pytest from click.testing import CliRunner import os from osm.cli import cli -class TestFileProcessing(unittest.TestCase): - def setUp(self): - # Create a temporary PDF file for testing - self.pdfs_folder = 'docs/examples/pdf_inputs' - self.file = 'test_sample.pdf' - self.file_id = 'test_file_id' +@pytest.fixture +def setup_and_teardown(): + # Setup: Create a temporary PDF file for testing + pdfs_folder = 'docs/examples/pdf_inputs' + file = 'test_sample.pdf' + file_id = 'test_file_id' + output_file = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml' - self.output_file = f'docs/examples/sciencebeam_xml_outputs/{self.file_id}.xml' + yield pdfs_folder, file, file_id, output_file - def tearDown(self): - # Remove the generated XML file - if os.path.exists(self.output_file): - os.remove(self.output_file) + # Teardown: Remove the generated XML file + if os.path.exists(output_file): + os.remove(output_file) - def test_pdf_xml_command(self): - runner = CliRunner() - pdf_path = f'{self.pdfs_folder}/{self.file}' - result = runner.invoke(cli, ['pdf-xml', pdf_path, self.file_id]) - # Check that the command executed successfully - self.assertEqual(result.exit_code, 0) - self.assertTrue(os.path.exists(self.output_file)) +def test_pdf_xml_command(setup_and_teardown): + pdfs_folder, file, file_id, output_file = setup_and_teardown + runner = CliRunner() + pdf_path = f'{pdfs_folder}/{file}' + result = runner.invoke(cli, ['pdf-xml', pdf_path, file_id]) -if __name__ == '__main__': - unittest.main() + # Check that the command executed successfully + assert result.exit_code == 0 + assert os.path.exists(output_file) diff 
--git a/tox.ini b/tox.ini index 7754b88d..4d100d5f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -enlist = py38, py39, py310, lint, coverage +envlist = py39, py310, py311, py312, lint [testenv] deps = @@ -10,19 +10,11 @@ commands = python -m pytest --cov=commands --cov-report=term-missing --cov-report=html --cov-report=xml [testenv:lint] -description = Run linters and code style checks -deps = - flake8 - black - isort +description = Run ruff to lint the code commands = - flake8 commands tests - black --check commands tests - isort --check-only commands tests - -[testenv:coverage] -description = Generate code coverage report + ruff check . -[flake8] -max-line-length = 88 # Follow PEP 8 guidelines -exclude = .tox,*.pdf,*.xml,build,data +[testenv:format] +description = Run ruff to format the code +commands = + ruff check --fix .