Skip to content

Commit

Permalink
fix: update tox envlist, add ruff for linting and formatting, set up …
Browse files Browse the repository at this point in the history
…pre-commit, and clean up code (docker, server) before XML conversion
  • Loading branch information
gitstart-nimhdsst authored and leej3 committed Jun 25, 2024
1 parent 1d53809 commit a5f8fac
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Windows: `venv\Scripts\activate`<br>
macOS and Linux: `source venv/bin/activate`

- Next, run `pip install -e .` to install the package with its dependencies.
- Open a separate terminal tab, start the ScienceBeam parser container with `docker run --rm -p 8082:8070 elifesciences/sciencebeam-parser`, and keep it running for the rest of the testing period.
- Go back to the original terminal (in the project root folder).
- Finally, run `osm pdf-xml "path_to_file_name.pdf" file_id`

# How to run tests of the application
Expand Down
10 changes: 9 additions & 1 deletion osm/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,17 @@ def pdf_xml(file_path, file_id):
"""
try:
converter = PDFConverter()

if not converter.is_docker_running():
raise click.ClickException('Please make sure the docker is running')

if not converter.is_host_ready():
raise click.ClickException('The converter server is offline')

xml_content = converter.convert(file_path)
# Save the converted xml contents
with open(f'docs/examples/sci`encebeam_xml_outputs/{file_id}.xml', 'w', encoding='utf-8') as xml_file:
output_file: str = f'docs/examples/sciencebeam_xml_outputs/{file_id}.xml'
with open(output_file, 'w', encoding='utf-8') as xml_file:
xml_file.write(xml_content)
logger.info(f'Converted: {file_path} with ID: {file_id} to XML')

Expand Down
42 changes: 39 additions & 3 deletions osm/converters/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,62 @@
import socket

import docker
from docker.errors import DockerException
from pydantic import FilePath
import requests

from osm_cli.converters.converter import Converter
from osm_cli.utils.config import config


class PDFConverter(Converter):
sciencebeam_url: str = 'http://localhost:8080/api/convert'
protocol: str = config.PROTOCAL
host: str = config.HOST
port: int = config.PORT

def convert(self, pdf_path: FilePath) -> str:
    """Convert a PDF file to XML using ScienceBeam Parser.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        XML content as a string.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    sciencebeam_url: str = f'{self.protocol}://{self.host}:{self.port}/api/convert'
    headers = {'Accept': 'application/tei+xml'}
    with open(pdf_path, 'rb') as pdf_file:
        response = requests.post(
            sciencebeam_url, files={'file': pdf_file}, headers=headers)

    # Raise on error statuses first, then always return the body. The previous
    # "200 or raise" shape silently returned None for other success codes,
    # which contradicts the declared ``str`` return type.
    response.raise_for_status()
    return response.text

def is_host_ready(self, timeout=3) -> bool:
    """Check whether the converter host accepts TCP connections.

    Args:
        timeout: Maximum number of seconds to wait for the connection.

    Returns:
        True if a TCP connection to ``self.host`` on ``self.port`` could be
        opened, False otherwise.
    """
    try:
        # create_connection resolves the host name and tries every address it
        # returns (IPv4 and IPv6) instead of being limited to AF_INET.
        with socket.create_connection((self.host, self.port), timeout=timeout):
            return True
    except OSError:
        # Covers timeouts, refused connections and name-resolution failures
        # (socket.timeout and socket.error are both OSError).
        return False

def is_docker_running(self) -> bool:
    """Check that Docker is usable and the ScienceBeam parser image exists.

    Returns:
        True if a Docker client could be created from the environment and
        the ``elifesciences/sciencebeam-parser`` image was found, False if
        either step raised a ``DockerException``.
    """
    try:
        client = docker.from_env()
    except DockerException:
        return False

    try:
        client.images.get('elifesciences/sciencebeam-parser')
        return True
    except DockerException:
        return False
    finally:
        # Release the client's underlying HTTP session; the check only needs
        # it for this single lookup.
        client.close()

0 comments on commit a5f8fac

Please sign in to comment.