Skip to content

Commit

Permalink
chore: resolve feedback and include docker file
Browse files Browse the repository at this point in the history
  • Loading branch information
gitstart-nimhdsst committed Jul 2, 2024
1 parent a85ecf9 commit 497c44a
Show file tree
Hide file tree
Showing 13 changed files with 231 additions and 120 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,5 @@ __pycache__/
dist/
build/
.tox/
venv/
.vscode/settings.json
.DS_Store
35 changes: 35 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Image for the OpenSciMetrics `osm` command-line tool.
#
# Pin the Debian release rather than tracking `latest`, so a rebuild cannot
# silently move to a new distribution (and a different system Python).
FROM debian:bookworm-slim

# Set working directory
WORKDIR /app

# Install system dependencies, Python, pip, networking, and debugging tools.
# --no-install-recommends keeps optional packages out of the image;
# ca-certificates is listed explicitly because it is otherwise only pulled in
# as a recommendation, and git/curl need it for HTTPS.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    iputils-ping \
    net-tools \
    python3 \
    python3-pip \
    python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Create the virtual environment and "activate" it for every later
# instruction (and for the running container) by putting it first on PATH.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Upgrade pip in the virtual environment.
# These source-independent steps run before COPY so their layers stay cached
# when only project files change; --no-cache-dir keeps pip's download cache
# out of the image layers.
RUN pip install --no-cache-dir --upgrade pip

# Install pre-commit (optional, remove if not needed in the container)
RUN pip install --no-cache-dir pre-commit

# Copy your project files
COPY . /app

# Install the package and its dependencies (editable: it points at /app)
RUN pip install --no-cache-dir -e .

# NOTE(review): the image runs as root. Adding a USER would need checking
# against the bind mount of the project directory used by docker-compose.

# Set the command to the osm command
CMD ["osm", "--help"]
24 changes: 19 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
# OpenSciMetrics

OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to transparency, data sharing, rigor, and open science on biomedical publications.

# How to setup and run the application

- After cloning the repo, navigate into the project's root directory by running `cd osm`
- Run `python -m venv venv` to create a Virtual Environment
- Depending on your system, run the appropriate command to activate the virtual environment
Windows: `venv\Scripts\activate`<br>
macOS and Linux: `source venv/bin/activate`
Windows: `venv\Scripts\activate`<br>
macOS and Linux: `source venv/bin/activate`

- Next, run `pip install -e .` to install the package with its dependencies.
- Open a terminal tab and run the image `docker run --rm -p 8082:8070 elifesciences/sciencebeam-parser` and keep it running for the rest of the testing period
- Go back to terminal(root folder)
- Finally, run `osm pdf-xml "path_to_file_name.pdf" file_id`
- Open a terminal tab and run the image `docker run --rm -p 8070:8070 elifesciences/sciencebeam-parser` and keep it running

**Note:** The ScienceBeam image is not supported on all Apple silicon chips. You may need to use an alternative system.

- Finally, run `osm pdf-xml-json "path_to_file_name.pdf" output_file_path`

# How to run tests of the application

Run `tox`

# How to run the unit tests

- Navigate to the project's root directory and run `pytest`

# Using pre-commit for commit checks
Expand All @@ -27,3 +34,10 @@ pre-commit and its hooks, run the following commands:
pip install pre-commit
pre-commit install
```

# How to build the Docker image and run the Docker container

- Navigate to the project's root directory and run `docker-compose up --build`
- When the image is built and the containers are running, open another terminal and start the osm container in interactive mode using the command `docker-compose run osm bash`
- You can do file conversions in the container using this command `osm pdf-xml-json "path_to_file_name.pdf" output_file_path`
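- Or use the command `docker-compose run --rm osm osm pdf-xml-json "path_to_file_name.pdf" output_file_path` to convert files in non-interactive mode (the first `osm` is the service name, the second is the executable; `docker-compose run` replaces the image's default command, so the executable must be given explicitly)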
30 changes: 30 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Two-service stack: the ScienceBeam parser plus the osm CLI container,
# joined by a dedicated bridge network so they can reach each other by
# service name.
version: "3.8"

networks:
  app-network:
    driver: bridge

services:
  # PDF parsing service; container port 8070 is also published on the host.
  sciencebeam:
    image: elifesciences/sciencebeam-parser
    networks:
      - app-network
    ports:
      - "8070:8070"

  # osm CLI container, built from the Dockerfile in this directory.
  osm:
    build:
      context: .
      dockerfile: Dockerfile
    depends_on:
      - sciencebeam
    networks:
      - app-network
    ports:
      - "8081:8070"
    # Bind-mount the project directory over /app inside the container.
    volumes:
      - .:/app
    # Address of the sciencebeam service on the shared network.
    # NOTE(review): confirm the application actually reads this variable.
    environment:
      - SCIENCEBEAM_URL=http://sciencebeam:8070
    # Keep a terminal and stdin attached so the container can be used
    # interactively.
    stdin_open: true
    tty: true
Binary file modified docs/examples/pdf_inputs/test_sample.pdf
Binary file not shown.
Binary file removed example_pdf_inputs/test.pdf
Binary file not shown.
Binary file removed example_pdf_inputs/test_sample.pdf
Binary file not shown.
42 changes: 14 additions & 28 deletions osm/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import click
import requests

from osm.logging.logger import logger
from osm.converters.pdf_converter import PDFConverter
from osm.converters.converter import convert_pdf_to_xml


@click.group()
def osm():
Expand All @@ -12,32 +11,19 @@ def osm():

@osm.command()
@click.argument("file_path", type=click.Path(exists=True))
@click.argument("file_id", type=str)
def pdf_xml(file_path, file_id):
"""
@click.argument("output_file", type=str)
def pdf_xml_json(file_path, output_file):
"""Converts a PDF file of a biomedical publication to JSON format
Args:
file_path (file path): First parameter
file_id (string): Second parameter
file_path (string): Path to the input PDF file
output_file (string): an output file path
Returns:
Creates an XML file in the directory xmls_sciencebeam
Creates an JSON file containing bibliometric
indicators and metadata and saves it in the
output file path
"""
try:
converter = PDFConverter()

if not converter.is_docker_running():
raise click.ClickException("Please make sure the docker is running")

if not converter.is_host_ready():
raise click.ClickException("The converter server is offline")

xml_content = converter.convert(file_path)
# Save the converted xml contents
output_file: str = f"docs/examples/sciencebeam_xml_outputs/{file_id}.xml"
with open(output_file, "w", encoding="utf-8") as xml_file:
xml_file.write(xml_content)
logger.info(f"Converted: {file_path} with ID: {file_id} to XML")

except requests.RequestException as error:
logger.error("Request error:", error)

logger.info(f"Converted: {file_path} with ID: {file_id} to XML")
convert_pdf_to_xml(file_path, output_file)
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
exit(1)
84 changes: 81 additions & 3 deletions osm/converters/converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,87 @@
import socket
from abc import ABC, abstractmethod
from pathlib import Path

from pydantic import BaseModel, FilePath
import requests

from osm.logging.logger import logger
from osm.utils.config import config

class Converter(ABC, BaseModel):

class Converter(ABC):
@abstractmethod
def convert(self, pdf_path: FilePath) -> str:
def convert(self, pdf_path) -> str:
pass

def handle_error(self, error):
    """Log ``error`` together with its traceback, then re-raise it.

    Args:
        error: The exception instance to report.

    Raises:
        The same ``error`` object, always, after it has been logged.
    """
    # Only the log prefix differs between HTTP-layer failures and anything else.
    is_request_failure = isinstance(error, requests.RequestException)
    prefix = "Request error:" if is_request_failure else "An error occurred:"
    logger.error(prefix, exc_info=error)

    raise error


class PDFConverter(Converter):
    """Converter that sends a PDF to a ScienceBeam Parser HTTP endpoint.

    The endpoint location is taken from the module-level ``config`` object
    when this class is defined.
    """

    # Endpoint components, read once from ``config`` at class-definition time.
    protocol: str = config.PROTOCOL
    host: str = config.HOST
    port: int = config.PORT

    def convert(self, pdf_path) -> str:
        """Convert a PDF file to XML using ScienceBeam Parser.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            XML content as a string

        Raises:
            requests.HTTPError: If the server answers with anything other
                than HTTP 200.
            requests.RequestException: If the request itself fails.
            OSError: If ``pdf_path`` cannot be opened for reading.
        """
        sciencebeam_url: str = f"{self.protocol}://{self.host}:{self.port}/api/convert"
        with Path(pdf_path).open("rb") as pdf_file:
            files = {"file": pdf_file}
            headers = {"Accept": "application/tei+xml"}
            response = requests.post(sciencebeam_url, files=files, headers=headers)

        if response.status_code == 200:
            return response.text

        # 4xx/5xx: let requests build and raise the descriptive HTTPError.
        response.raise_for_status()
        # Any other status (e.g. 201/204) is not raised by raise_for_status();
        # fail explicitly instead of silently returning None from a method
        # annotated to return str.
        raise requests.HTTPError(
            f"Unexpected status code {response.status_code} from {sciencebeam_url}",
            response=response,
        )

    def is_host_ready(self, timeout=3) -> bool:
        """Check if the host is ready to accept requests.

        Opens (and immediately closes) a TCP connection to ``host:port``.

        Args:
            timeout: Timeout in seconds

        Returns:
            True if the server host is ready, False otherwise
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            try:
                sock.connect((self.host, self.port))
            except (socket.timeout, socket.error):
                return False
            return True


def convert_pdf_to_xml(file_path, output_file_path):
    """Converts a PDF file to XML and saves the output.

    Args:
        file_path (str): Path to the input PDF file.
        output_file_path (str): Path to the output XML file.

    Raises:
        Exception: If the converter server is not reachable, or any error
            raised while converting or writing the file. Every error is
            logged through ``PDFConverter.handle_error`` and then re-raised.
    """
    converter = PDFConverter()
    try:
        if not converter.is_host_ready():
            raise Exception("The converter server is offline")

        xml_content = converter.convert(file_path)

        # Save the converted xml contents. The encoding is given explicitly so
        # the output does not depend on the platform's default locale encoding.
        Path(output_file_path).write_text(xml_content, encoding="utf-8")
        logger.info(f"Converted: {file_path} to XML. Output file: {output_file_path}")

    except Exception as error:
        # handle_error already tells request failures apart from other errors,
        # so a single handler covers both cases; it logs and re-raises.
        converter.handle_error(error)
61 changes: 0 additions & 61 deletions osm/converters/pdf_converter.py

This file was deleted.

11 changes: 4 additions & 7 deletions osm/utils/config.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from pydantic.v1 import BaseSettings


class AppConfig(BaseSettings):
PORT: int = 8082
HOST: str = "localhost"
PROTOCAL: str = "http"
class AppConfig:
    """Location of the ScienceBeam Parser service.

    The class attributes are the defaults. When the ``SCIENCEBEAM_URL``
    environment variable holds a URL with a scheme and a host (for example
    ``http://localhost:8070``), an instance overrides the defaults with the
    values parsed from it. An unset, empty or unparsable value leaves the
    defaults untouched.
    """

    PORT: int = 8070
    HOST: str = "sciencebeam"
    PROTOCOL: str = "http"

    def __init__(self):
        # Imported locally so this block does not depend on module-level imports.
        import os
        from urllib.parse import urlsplit

        raw_url = os.environ.get("SCIENCEBEAM_URL", "").strip()
        if not raw_url:
            return

        try:
            parts = urlsplit(raw_url)
            port = parts.port  # raises ValueError for a non-numeric/out-of-range port
        except ValueError:
            return

        if not parts.scheme or not parts.hostname:
            # e.g. "localhost:8070" without a scheme: ambiguous, keep defaults.
            return

        self.PROTOCOL = parts.scheme
        self.HOST = parts.hostname
        if port is not None:
            self.PORT = port
        else:
            # No explicit port in the URL: use the scheme's well-known port.
            self.PORT = 443 if parts.scheme == "https" else 80


config = AppConfig()
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ dynamic = ["version"]
dependencies = [
"click>=8.1.7",
"rich>=13.7.1",
"docker",
"pydantic",
"requests>=2.32.3",
]

[project.optional-dependencies]
dev = [
"pytest-mock>=3.14.0",
"tox>=4.15.0",
"pytest>=8.2.1",
"pytest-cov",
Expand Down
Loading

0 comments on commit 497c44a

Please sign in to comment.