-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: resolve feedback and include docker file
- Loading branch information
1 parent
a85ecf9
commit 497c44a
Showing
13 changed files
with
231 additions
and
120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,5 @@ __pycache__/ | |
dist/ | ||
build/ | ||
.tox/ | ||
venv/ | ||
.vscode/settings.json | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Use the official Debian Docker image as the base
FROM debian:latest

# Set working directory
WORKDIR /app

# Install system dependencies, Python, pip, networking, and debugging tools.
# The apt package lists are removed in the same layer so they are not kept
# in the image.
RUN apt-get update && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-venv \
    curl \
    iputils-ping \
    net-tools \
    && rm -rf /var/lib/apt/lists/*

# Create the virtual environment and put it first on PATH so that every later
# `pip` call (and the final CMD) resolves to the venv's executables.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Upgrade pip in the virtual environment
RUN pip install --no-cache-dir --upgrade pip

# Copy your project files. This happens after the environment setup so that a
# source change does not invalidate the cached layers above.
COPY . /app

# Install the package and its dependencies
RUN pip install --no-cache-dir -e .

# Install pre-commit (optional, remove if not needed in the container)
RUN pip install --no-cache-dir pre-commit

# Set the command to the osm command
CMD ["osm", "--help"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Compose file describing two services that share one bridge network:
# the ScienceBeam parser and the osm application built from this repository.
version: '3.8'

services:
  # PDF parsing service, run from the published image.
  sciencebeam:
    image: elifesciences/sciencebeam-parser
    ports:
      # host port 8070 -> container port 8070
      - "8070:8070"
    networks:
      - app-network

  # The osm application, built from the Dockerfile in the repository root.
  osm:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # host port 8081 -> container port 8070
      - "8081:8070"
    volumes:
      # Bind-mount the working tree over /app inside the container.
      - .:/app
    depends_on:
      # Short form: controls start order only, not readiness of the service.
      - sciencebeam
    environment:
      # NOTE(review): the AppConfig shown in this commit hard-codes
      # host/port and does not appear to read this variable - confirm.
      - SCIENCEBEAM_URL=http://sciencebeam:8070
    # Keep an interactive terminal attached to the container.
    tty: true
    stdin_open: true
    networks:
      - app-network

networks:
  app-network:
    driver: bridge
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,87 @@ | ||
import socket | ||
from abc import ABC, abstractmethod | ||
from pathlib import Path | ||
|
||
from pydantic import BaseModel, FilePath | ||
import requests | ||
|
||
from osm.logging.logger import logger | ||
from osm.utils.config import config | ||
|
||
class Converter(ABC):
    """Abstract base class for converters that turn a PDF into a string."""

    @abstractmethod
    def convert(self, pdf_path) -> str:
        """Convert the file at ``pdf_path`` and return the result as a string.

        Subclasses must override this method.
        """

    def handle_error(self, error):
        """Log ``error`` together with its traceback, then re-raise it.

        Args:
            error: The exception instance to report.
        Raises:
            The very same ``error`` object, always.
        """
        # Only the log message differs between request failures and
        # everything else; both are logged at error level with the traceback.
        message = (
            "Request error:"
            if isinstance(error, requests.RequestException)
            else "An error occurred:"
        )
        logger.error(message, exc_info=error)
        raise error
|
||
|
||
class PDFConverter(Converter):
    """Converter that posts a PDF to a ScienceBeam Parser server.

    The server location (protocol, host, port) is read from the shared
    ``config`` object when the class is defined.
    """

    # Connection settings of the conversion server.
    protocol: str = config.PROTOCOL
    host: str = config.HOST
    port: int = config.PORT

    def convert(self, pdf_path) -> str:
        """Convert a PDF file to XML using ScienceBeam Parser.

        Args:
            pdf_path: Path to the PDF file
        Returns:
            XML content as a string
        Raises:
            OSError: If the PDF file cannot be opened.
            requests.RequestException: If the request fails or the server
                answers with any status other than 200.
        """
        sciencebeam_url: str = f"{self.protocol}://{self.host}:{self.port}/api/convert"
        headers = {"Accept": "application/tei+xml"}
        with Path(pdf_path).open("rb") as pdf_file:
            # NOTE(review): no timeout is passed, so this call can block
            # indefinitely if the server stops responding.
            response = requests.post(
                sciencebeam_url, files={"file": pdf_file}, headers=headers
            )

        # Raises requests.HTTPError for 4xx/5xx answers.
        response.raise_for_status()
        if response.status_code != 200:
            # A non-error status other than 200 (e.g. 204) carries no usable
            # document; fail loudly instead of silently returning None.
            raise requests.HTTPError(
                f"Unexpected status code {response.status_code} "
                f"from {sciencebeam_url}",
                response=response,
            )
        return response.text

    def is_host_ready(self, timeout=3) -> bool:
        """Check if the host is ready to accept requests.

        Args:
            timeout: Timeout in seconds
        Returns:
            True if the server host is ready, False otherwise
        """
        try:
            # create_connection resolves the host name and tries every
            # returned address (IPv4 and IPv6) until one accepts.
            with socket.create_connection((self.host, self.port), timeout=timeout):
                return True
        except OSError:
            # Covers timeouts, refused connections and name-resolution errors.
            return False
|
||
|
||
def convert_pdf_to_xml(file_path, output_file_path):
    """Converts a PDF file to XML and saves the output.

    Any failure is handed to ``converter.handle_error``, which logs it and
    re-raises the same exception.

    Args:
        file_path (str): Path to the input PDF file.
        output_file_path (str): Path to the output XML file.
    Raises:
        ConnectionError: If the converter server cannot be reached.
        Exception: Any error raised during conversion or while writing the
            output file, re-raised after being logged.
    """
    converter = PDFConverter()
    try:
        if not converter.is_host_ready():
            raise ConnectionError("The converter server is offline")

        xml_content = converter.convert(file_path)

        # Save the converted xml contents. The encoding is explicit so the
        # output does not depend on the platform's default locale.
        Path(output_file_path).write_text(xml_content, encoding="utf-8")
        logger.info(f"Converted: {file_path} to XML. Output file: {output_file_path}")

    except Exception as error:
        # handle_error tells request errors apart from other errors itself,
        # so a single handler is enough here.
        converter.handle_error(error)
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,7 @@ | ||
from pydantic.v1 import BaseSettings | ||
|
||
|
||
class AppConfig:
    """Static connection settings exposed as class attributes."""

    # URL scheme used when building service URLs.
    PROTOCOL: str = "http"
    # Host name of the service to connect to.
    HOST: str = "sciencebeam"
    # TCP port of the service.
    PORT: int = 8070


# Shared settings instance imported by the rest of the package.
config = AppConfig()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.