From a5f8faceb59429f0840c774c2beba276e9fd9f66 Mon Sep 17 00:00:00 2001 From: gitstart-nimhdsst Date: Fri, 21 Jun 2024 15:18:57 +0000 Subject: [PATCH] fix: update tox envlist, add ruff for linting and formatting, set up pre-commit, and clean up code (docker, server) before XML conversion --- README.md | 2 ++ osm/cli/main.py | 10 +++++++- osm/converters/pdf_converter.py | 42 ++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f6e9ae5f..4d6fd2cd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ Windows: `venv\Scripts\activate`
macOS and Linux: `source venv/bin/activate` - Next, run `pip install -e .` to install the package with its dependencies. +- Open a terminal tab and run the image `docker run --rm -p 8082:8070 elifesciences/sciencebeam-parser` and keep it running for the rest of the testing period +- Go back to the terminal (root folder) - Finally, run `osm pdf-xml "path_to_file_name.pdf" file_id` # How to run tests of the application diff --git a/osm/cli/main.py b/osm/cli/main.py index 908e7409..01112701 100644 --- a/osm/cli/main.py +++ b/osm/cli/main.py @@ -23,9 +23,17 @@ def pdf_xml(file_path, file_id): """ try: converter = PDFConverter() + + if not converter.is_docker_running(): + raise click.ClickException('Please make sure the docker is running') + + if not converter.is_host_ready(): + raise click.ClickException('The converter server is offline') + xml_content = converter.convert(file_path) # Save the converted xml contents - with open(f'docs/examples/sci`encebeam_xml_outputs/{file_id}.xml', 'w', encoding='utf-8') as xml_file: + output_file: str = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml' + with open(output_file, 'w', encoding='utf-8') as xml_file: xml_file.write(xml_content) logger.info(f'Converted: {file_path} with ID: {file_id} to XML') diff --git a/osm/converters/pdf_converter.py b/osm/converters/pdf_converter.py index 7269fbf8..4bcabe70 100644 --- a/osm/converters/pdf_converter.py +++ b/osm/converters/pdf_converter.py @@ -1,13 +1,20 @@ +import socket + +import docker +from docker.errors import DockerException from pydantic import FilePath import requests from osm_cli.converters.converter import Converter +from osm_cli.utils.config import config class PDFConverter(Converter): - sciencebeam_url: str = 'http://localhost:8080/api/convert' + protocol: str = config.PROTOCAL + host: str = config.HOST + port: int = config.PORT - def convert(self, pdf_path: FilePath): + def convert(self, pdf_path: FilePath) -> str: """Convert a PDF file to XML using ScienceBeam Parser.
Args: @@ -15,12 +22,41 @@ def convert(self, pdf_path: FilePath): Returns: XML content as a string """ + sciencebeam_url: str = f'{self.protocol}://{self.host}:{self.port}/api/convert' with open(pdf_path, 'rb') as pdf_file: files = {'file': pdf_file} headers = {'Accept': 'application/tei+xml'} - response = requests.post(self.sciencebeam_url, files=files, headers=headers) + response = requests.post( + sciencebeam_url, files=files, headers=headers) if response.status_code == 200: return response.text else: response.raise_for_status() + + def is_host_ready(self, timeout=3) -> bool: + """Check if the host is ready to accept requests. + Args: + timeout: Timeout in seconds + Returns: + True if the host is ready, False otherwise + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) + try: + sock.connect((self.host, self.port)) + except (socket.timeout, socket.error): + return False + return True + + def is_docker_running(self): + """Check if the docker image exists. + Returns: + True if the docker image exists, False otherwise + """ + try: + client = docker.from_env() + client.images.get('elifesciences/sciencebeam-parser') + return True + except DockerException: + return False