diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/tests.yml similarity index 67% rename from .github/workflows/unit-tests.yml rename to .github/workflows/tests.yml index 864ce7d1..cf26fabd 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/tests.yml @@ -1,9 +1,15 @@ -name: Unit Tests +name: Tests -on: [push, pull_request] +on: push jobs: - test: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.1 + tox-tests: runs-on: ubuntu-latest strategy: @@ -11,6 +17,7 @@ jobs: python-version: [3.9] steps: + - name: Checkout repository uses: actions/checkout@v2 @@ -30,12 +37,16 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r requirements.txt + pip install -e .[dev] - name: Start ScienceBeam Docker container run: | - docker run -d --rm -p 8082:8070 elifesciences/sciencebeam-parser + docker run -d --rm -p 8070 elifesciences/sciencebeam-parser - name: Run tests run: | - tox \ No newline at end of file + tox + + - name: Test packaging + run: | + tox -e .package diff --git a/.gitignore b/.gitignore index 811c2ace..861b62ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ # Ignore the virtual environment directory +_version.py +node_modules venv/ *coverage* .idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4e07126..8fb49dff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,35 @@ repos: - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.257 + - repo: https://github.com/psf/black + rev: 24.4.2 + hooks: + - id: black + + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. 
+ rev: "v0.4.9" hooks: - id: ruff args: ["--fix"] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + files: ".*\\.py" + exclude: "examples|docs/examples" + - id: check-added-large-files + - id: check-toml + - id: end-of-file-fixer + exclude: "examples|docs/examples" + + # - repo: https://github.com/pre-commit/mirrors-prettier + # rev: v4.0.0-alpha.8 + # hooks: + # - id: prettier + + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + name: isort + args: ["--profile", "black"] diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..71eec60b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,35 @@ +# Use the official Debian Docker image as the base +FROM debian:latest + +# Set working directory +WORKDIR /app + +# Install system dependencies, Python, pip, networking, and debugging tools +RUN apt-get update && apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-venv \ + curl \ + iputils-ping \ + net-tools \ + && rm -rf /var/lib/apt/lists/* + +# Copy your project files +COPY . /app + +# Create and activate virtual environment +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Upgrade pip in the virtual environment +RUN pip install --upgrade pip + +# Install the package and its dependencies +RUN pip install -e . + +# Install pre-commit (optional, remove if not needed in the container) +RUN pip install pre-commit + +# Set the command to the osm command +CMD ["osm", "--help"] diff --git a/README.md b/README.md index ffcfa641..fda883be 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,35 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to transparency, data sharing, rigor, and open science on biomedical publications. 
# How to setup and run the application -- After cloning the repo, navigate into the project's root directory by running `cd osm_cli` +- After cloning the repo, navigate into the project's root directory by running `cd osm` - Run `python -m venv venv` to create a Virtual Environment - Depending on your system, run the approriate command to Activate the Virtual Environment Windows: `venv\Scripts\activate`
macOS and Linux: `source venv/bin/activate` -- Next, run `pip install -r requirements.txt && pre-commit install` to install all the dependencies. -- Open a terminal tab and run the image `docker run --rm -p 8082:8070 elifesciences/sciencebeam-parser` and keep it running for the rest of the testing period -- Go back to terminal(root folder) -- Finally, run `python -m osm.cli pdf-xml "path_to_file_name.pdf" file_id` +- Next, run `pip install -e .` to install the package with its dependencies. +- Open a terminal tab and run the image `docker run --rm -p 8070:8070 elifesciences/sciencebeam-parser` and keep it running + +**Note:** The ScienceBeam image is not supported by all Apple silicon chips. You may need to consider using an alternative system. +- Finally, run `osm pdf-xml "path_to_file_name.pdf" output_file_path` # How to run tests of the application Run `tox` + # How to run the unit tests -- Navigate to the project's root directory and run `python -m pytest` +- Navigate to the project's root directory and run `pytest` + +# Using pre-commit for commit checks + +Pre-commit will run all of its hooks on every commit you make. 
To install +pre-commit and its hooks, run the following commands: + +``` +pip install pre-commit +pre-commit install +``` +# How to build the Docker image and run the Docker container +- Navigate to the project's root directory and run `docker-compose up --build` +- When the image is built and the containers are running, open another terminal and start the osm container in interactive mode using the command `docker-compose run osm bash` +- You can do file conversions in the container using this command `osm pdf-xml "path_to_file_name.pdf" output_file_path` +- Or use the command `docker-compose run --rm osm osm pdf-xml "path_to_file_name.pdf" output_file_path` to convert files in non-interactive mode diff --git a/commands/converters/converter.py b/commands/converters/converter.py deleted file mode 100644 index 4bf98da7..00000000 --- a/commands/converters/converter.py +++ /dev/null @@ -1,9 +0,0 @@ -from abc import ABC, abstractmethod - -from pydantic import FilePath, BaseModel - - -class Converter(ABC, BaseModel): - @abstractmethod - def convert(self, pdf_path: FilePath) -> str: - pass diff --git a/commands/converters/pdf_converter.py b/commands/converters/pdf_converter.py deleted file mode 100644 index f2f3ad71..00000000 --- a/commands/converters/pdf_converter.py +++ /dev/null @@ -1,62 +0,0 @@ -import socket - -import docker -from docker.errors import DockerException -from pydantic import FilePath -import requests - -from commands.converters.converter import Converter -from commands.utils.config import config - - -class PDFConverter(Converter): - protocol: str = config.PROTOCAL - host: str = config.HOST - port: int = config.PORT - - def convert(self, pdf_path: FilePath) -> str: - """Convert a PDF file to XML using ScienceBeam Parser. 
- - Args: - pdf_path: Path to the PDF file - Returns: - XML content as a string - """ - sciencebeam_url: str = f'{self.protocol}://{self.host}:{self.port}/api/convert' - with open(pdf_path, 'rb') as pdf_file: - files = {'file': pdf_file} - headers = {'Accept': 'application/tei+xml'} - response = requests.post( - sciencebeam_url, files=files, headers=headers) - - if response.status_code == 200: - return response.text - else: - response.raise_for_status() - - def is_host_ready(self, timeout=3) -> bool: - """Check if the host is ready to accept requests. - Args: - timeout: Timeout in seconds - Returns: - True if the docker is host is ready, False otherwise - """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(timeout) - try: - sock.connect((self.host, self.port)) - except (socket.timeout, socket.error): - return False - return True - - def is_docker_running(self): - """Check if the docker image exists. - Returns: - True if the docker image exists, False otherwise - """ - try: - client = docker.from_env() - client.images.get('elifesciences/sciencebeam-parser') - return True - except DockerException: - return False diff --git a/commands/file_processing.py b/commands/file_processing.py deleted file mode 100644 index f37d0866..00000000 --- a/commands/file_processing.py +++ /dev/null @@ -1,42 +0,0 @@ -import click -import requests -from pydantic import ValidationError - -from commands.converters.pdf_converter import PDFConverter -from logs.logger import logger - - -@click.command() -@click.argument('file_path', type=click.Path(exists=True)) -@click.argument('file_id', type=str) -def pdf_xml(file_path, file_id): - """This function converts a file from PDF - to XML - - Args: - file_path (file path): First parameter - file_id (string): Second parameter - Returns: - Creates an XML file in the directory xmls_sciencebeam - """ - try: - converter = PDFConverter() - - if not converter.is_docker_running(): - raise click.ClickException('Please 
make sure the docker is running') - - if not converter.is_host_ready(): - raise click.ClickException('The converter server is offline') - - xml_content = converter.convert(file_path) - # Save the converted xml contents - output_file: str = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml' - with open(output_file, 'w', encoding='utf-8') as xml_file: - xml_file.write(xml_content) - logger.info(f'Converted: {file_path} with ID: {file_id} to XML') - - except ValidationError as error: - logger.error("Validation error:", error) - - except requests.RequestException as error: - logger.error("Request error:", error) diff --git a/commands/utils/config.py b/commands/utils/config.py deleted file mode 100644 index ac7ace90..00000000 --- a/commands/utils/config.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic.v1 import BaseSettings - - -class AppConfig(BaseSettings): - PORT:int = 8082 - HOST: str = 'localhost' - PROTOCAL: str = 'http' - - -config = AppConfig() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..c2a885a1 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,30 @@ +version: '3.8' + +services: + sciencebeam: + image: elifesciences/sciencebeam-parser + ports: + - "8070:8070" + networks: + - app-network + + osm: + build: + context: . + dockerfile: Dockerfile + ports: + - "8081:8070" + volumes: + - .:/app + depends_on: + - sciencebeam + environment: + - SCIENCEBEAM_URL=http://sciencebeam:8070 + tty: true + stdin_open: true + networks: + - app-network + +networks: + app-network: + driver: bridge diff --git a/osm/__init__.py b/osm/__init__.py index e69de29b..eb23a131 100644 --- a/osm/__init__.py +++ b/osm/__init__.py @@ -0,0 +1,3 @@ +from . 
import _version + +__version__ = _version.version diff --git a/osm/cli.py b/osm/cli.py deleted file mode 100644 index ea7045cd..00000000 --- a/osm/cli.py +++ /dev/null @@ -1,15 +0,0 @@ -import click -from commands.file_processing import pdf_xml - - -@click.group() -def cli(): - """Main command group.""" - pass - - -# Add commands to the main group -cli.add_command(pdf_xml) - -if __name__ == '__main__': - cli() diff --git a/osm/cli/main.py b/osm/cli/main.py new file mode 100644 index 00000000..183c5d57 --- /dev/null +++ b/osm/cli/main.py @@ -0,0 +1,27 @@ +import click + +from osm.converters.converter import convert_pdf_to_xml + + +@click.group() +def osm(): + """Main command for OSM""" + pass + + +@osm.command() +@click.argument("file_path", type=click.Path(exists=True)) +@click.argument("output_file", type=str) +def pdf_xml(file_path, output_file): + """This function converts a file from PDF to XML + Args: + file_path (string): an input file path + output_file (string): an output file path + Returns: + Creates an XML file and saves it in the output file path + """ + try: + convert_pdf_to_xml(file_path, output_file) + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + exit(1) diff --git a/commands/__init__.py b/osm/converters/__init__.py similarity index 100% rename from commands/__init__.py rename to osm/converters/__init__.py diff --git a/osm/converters/converter.py b/osm/converters/converter.py new file mode 100644 index 00000000..8a9842dd --- /dev/null +++ b/osm/converters/converter.py @@ -0,0 +1,87 @@ +import socket +from abc import ABC, abstractmethod +from pathlib import Path + +import requests + +from osm.logging.logger import logger +from osm.utils.config import config + + +class Converter(ABC): + @abstractmethod + def convert(self, pdf_path) -> str: + pass + + def handle_error(self, error): + if isinstance(error, requests.RequestException): + logger.error("Request error:", exc_info=error) + else: + logger.error("An error occurred:", 
exc_info=error) + + raise error + + +class PDFConverter(Converter): + protocol: str = config.PROTOCOL + host: str = config.HOST + port: int = config.PORT + + def convert(self, pdf_path) -> str: + """Convert a PDF file to XML using ScienceBeam Parser. + + Args: + pdf_path: Path to the PDF file + Returns: + XML content as a string + """ + sciencebeam_url: str = f"{self.protocol}://{self.host}:{self.port}/api/convert" + with Path(pdf_path).open("rb") as pdf_file: + files = {"file": pdf_file} + headers = {"Accept": "application/tei+xml"} + response = requests.post(sciencebeam_url, files=files, headers=headers) + + if response.status_code == 200: + return response.text + else: + response.raise_for_status() + + def is_host_ready(self, timeout=3) -> bool: + """Check if the host is ready to accept requests. + Args: + timeout: Timeout in seconds + Returns: + True if the docker is host is ready, False otherwise + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) + try: + sock.connect((self.host, self.port)) + except (socket.timeout, socket.error): + return False + return True + + +def convert_pdf_to_xml(file_path, output_file_path): + """Converts a PDF file to XML and saves the output. + + Args: + file_path (str): Path to the input PDF file. + output_file_path (str): Path to the output XML file. + """ + converter = PDFConverter() + try: + if not converter.is_host_ready(): + raise Exception("The converter server is offline") + + xml_content = converter.convert(file_path) + + # Save the converted xml contents + Path(output_file_path).write_text(xml_content) + logger.info(f"Converted: {file_path} to XML. 
Output file: {output_file_path}") + + except requests.RequestException as error: + converter.handle_error(error) + + except Exception as error: + converter.handle_error(error) diff --git a/commands/converters/__init__.py b/osm/logging/__init__.py similarity index 100% rename from commands/converters/__init__.py rename to osm/logging/__init__.py diff --git a/logs/logger.py b/osm/logging/logger.py similarity index 53% rename from logs/logger.py rename to osm/logging/logger.py index d5880999..bdfadba1 100644 --- a/logs/logger.py +++ b/osm/logging/logger.py @@ -1,11 +1,9 @@ -from rich.logging import RichHandler import logging +from rich.logging import RichHandler + logging.basicConfig( - level=logging.INFO, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler()] + level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()] ) logger = logging.getLogger("rich") diff --git a/logs/__init__.py b/osm/utils/__init__.py similarity index 100% rename from logs/__init__.py rename to osm/utils/__init__.py diff --git a/osm/utils/config.py b/osm/utils/config.py new file mode 100644 index 00000000..c658e687 --- /dev/null +++ b/osm/utils/config.py @@ -0,0 +1,7 @@ +class AppConfig: + PORT: int = 8070 + HOST: str = "sciencebeam" + PROTOCOL: str = "http" + + +config = AppConfig() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..1668aa35 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[project] +name = "osm" +description = "Open Science Metrics (OSM) client for tracking scientific transparency and reproducibility metrics" +readme = "README.md" +requires-python = ">=3.8" +keywords = [ + "open science", + "altmetrics", + "scientific transparency", + "reproducibility" +] +dynamic = ["version"] + +dependencies = [ + "click>=8.1.7", + "docker>=7.1.0", + "rich>=13.7.1", + "requests>=2.32.3", +] + +[project.optional-dependencies] +dev = [ + "tox>=4.15.0", + "pytest>=8.2.1", + "pytest-cov", + "ruff>=0.4.9", + "build", + 
"twine", + "pre-commit", + "pkginfo>=1.10" +] + +[project.urls] +homepage = "https://website" +source = "https://github.com/nimh-dsst/osm" +issues = "https://github.com/nimh-dsst/osm/issues" + +[build-system] +requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +write_to = "osm/_version.py" + +[tool.setuptools.packages.find] +where = ["."] + +[project.scripts] +osm = "osm.cli.main:osm" + +[tool.black] +line-length = 88 +target-version = ["py38", "py39", "py310"] + +[tool.ruff] +line-length = 88 +indent-width = 4 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1ff11364..00000000 --- a/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -click==8.1.7 -docker==7.1.0 -pre_commit==3.5.0 -pydantic==2.7.3 -pytest==8.2.1 -requests==2.32.3 -rich==13.7.1 -ruff==0.4.9 -tox==4.15.0 \ No newline at end of file diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py index 30185079..0ee2576e 100644 --- a/tests/test_file_processing.py +++ b/tests/test_file_processing.py @@ -1,32 +1,33 @@ +from pathlib import Path + import pytest from click.testing import CliRunner -import os -from osm.cli import cli +from osm.cli.main import pdf_xml @pytest.fixture def setup_and_teardown(): # Setup: Create a temporary PDF file for testing - pdfs_folder = 'docs/examples/pdf_inputs' - file = 'test_sample.pdf' - file_id = 'test_file_id' - output_file = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml' + pdfs_folder = "docs/examples/pdf_inputs" + file = "test_sample.pdf" + output_file = "test_output_file.xml" - yield pdfs_folder, file, file_id, output_file + yield pdfs_folder, file, output_file # Teardown: Remove the generated XML file - if os.path.exists(output_file): - os.remove(output_file) + output_file_path = Path(output_file) + if output_file_path.exists(): + output_file_path.unlink() def test_pdf_xml_command(setup_and_teardown): - pdfs_folder, file, file_id, 
output_file = setup_and_teardown + pdfs_folder, file, output_file = setup_and_teardown runner = CliRunner() - pdf_path = f'{pdfs_folder}/{file}' - result = runner.invoke(cli, ['pdf-xml', pdf_path, file_id]) + pdf_path = f"{pdfs_folder}/{file}" + result = runner.invoke(pdf_xml, [str(pdf_path), output_file]) # Check that the command executed successfully assert result.exit_code == 0 - assert os.path.exists(output_file) + assert Path(output_file).exists() diff --git a/tox.ini b/tox.ini index 2831f21c..4420e676 100644 --- a/tox.ini +++ b/tox.ini @@ -1,13 +1,11 @@ [tox] -envlist = py39, py310, py311, py312, lint +envlist = py39, py310, py311, py312, lint, format [testenv] deps = - pytest - pytest-cov - -r requirements.txt + .[dev] commands = - python -m pytest --cov=commands --cov-report=term-missing --cov-report=html --cov-report=xml + python -m pytest --cov=osm --cov-report=term-missing --cov-report=html --cov-report=xml [testenv:lint] description = Run ruff to lint the code @@ -15,6 +13,11 @@ commands = ruff check . [testenv:format] -description = Run ruff to format the code +description = Check that code is formatted with ruff commands = - ruff check --fix . \ No newline at end of file + ruff format --check + +[testenv:.package] +description = Generate distribution package +basepython = python3 +commands = python -m build --sdist --wheel --outdir packaged