Skip to content

Commit

Permalink
add prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
leej3 committed Jul 10, 2024
1 parent 1f06cef commit a374648
Show file tree
Hide file tree
Showing 9 changed files with 157 additions and 26 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Builds the project's Docker image and pushes it to Docker Hub.
# Runs when any tag is pushed, or when started manually from the Actions tab.
name: "Build and upload Docker image for releases"

on:
  push:
    tags: ["*"]
  workflow_dispatch:

jobs:
  build_and_push_docker_image:
    name: "Build Docker Image"
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@v4

      - name: "Set up Docker Buildx"
        uses: docker/setup-buildx-action@v3

      - name: "Login to Docker Hub 🐳"
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: "Add Docker metadata"
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            timdanaos/app
          tags: |
            type=ref,event=tag
            type=ref,event=branch
            type=sha

      - name: "Publish Docker image"
        # This workflow has no `release` trigger, so github.event_name is
        # never 'release'; gating on it meant the image was never published.
        # Gate on the tag ref alone, which is what a tag push produces.
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v5
        with:
          context: .
          tags: |
            ${{ steps.meta.outputs.tags }}
          push: true
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ RUN R -e '\
install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
devtools::install_github("quest-bih/oddpub"); \
devtools::install_github("cran/crminer"); \
devtools::install_github("serghiou/metareadr"); \
devtools::install_github("serghiou/rtransparent")'
devtools::install_github("serghiou/metareadr")'
COPY external /app/external
RUN R -e 'devtools::install("external/rtransparent")'

# Copy the project files and install the package
COPY pyproject.toml /app
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to
```
docker-compose -f compose.yaml run \
--rm \
-v $PWD:/mnt \
-v $PWD:/app \
app \
rtransparent \
/mnt/docs/examples/pdf_inputs/test_sample.pdf \
Expand Down
29 changes: 23 additions & 6 deletions osm/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from pathlib import Path

import click

from osm.converters import convert_pdf
from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion
from osm.sciencebeam import convert_pdf


@click.group()
Expand All @@ -10,8 +13,22 @@ def osm():


@osm.command()
@click.argument("file_path", type=click.Path(exists=True))
@click.argument("output_file", type=str)
def rtransparent(file_path, output_file):
"""Processes a biomedical publication. Writes out processed document and associated metrics."""
convert_pdf(file_path, output_file)
@click.argument("pdf_dir", type=click.Path(exists=True))
@click.argument("outdir", type=click.Path(exists=True))
@click.option("--use-sciencebeam", is_flag=True)
def extract_metrics(pdf_dir, outdir, use_sciencebeam):
    """Process biomedical publications and write out texts and metrics.

    PDF_DIR holds the PDFs to process. Converted documents are written to
    OUTDIR/pdf_texts and the extracted metrics to OUTDIR/metrics.csv.
    Pass --use-sciencebeam to convert with ScienceBeam instead of oddpub.
    """
    outdir = Path(outdir)
    pdf_dir = Path(pdf_dir)
    text_dir = outdir / "pdf_texts"
    text_dir.mkdir(exist_ok=True, parents=True)

    # Step 1: convert the PDFs with the selected backend.
    if use_sciencebeam:
        convert_pdf(pdf_dir, text_dir)
    else:
        oddpub_pdf_conversion(pdf_dir, text_dir)

    # Step 2: metrics are always extracted by oddpub from the converted files.
    # Errors propagate unchanged; click reports them to the caller.
    metrics = oddpub_metric_extraction(text_dir)
    metrics.to_csv(outdir / "metrics.csv", index=False)
7 changes: 6 additions & 1 deletion osm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@ class AppConfig:
sb_protocol: str = "http"

def __init__(
    self,
    sb_port: int = None,
    sb_host: str = None,
    sb_protocol: str = None,
    vroom_connection_size: int = 2**20,
):
    """Store the configuration, keeping existing defaults for omitted values.

    Args:
        sb_port: Port number; coerced with ``int()`` when given.
        sb_host: Host name.
        sb_protocol: Protocol string.
        vroom_connection_size: Stored as-is on the instance.

    Any of the ``sb_*`` arguments left as ``None`` is set to the value that
    ``self`` already exposes for that attribute.
    """
    self.sb_port = self.sb_port if sb_port is None else int(sb_port)
    self.sb_host = self.sb_host if sb_host is None else sb_host
    self.sb_protocol = self.sb_protocol if sb_protocol is None else sb_protocol
    self.vroom_connection_size = vroom_connection_size


load_dotenv()
Expand Down
42 changes: 42 additions & 0 deletions osm/oddpub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Oddpub is being actively developed, whereas rtransparent has stagnated.
Oddpub implements parallelism, and its interface does not easily allow working
with objects in memory, so we will use its parallelism to reduce IO overhead.
The alternative would be to load the PDF file into memory (pdftools::pdf_data)
and then pass that into oddpub's private functions. This would make it easier to
manage the parallelism, troubleshoot, and define the interface, but it partially
reinvents the wheel.
"""

import os
from pathlib import Path
from typing import Optional

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

from osm.config import osm_config

# Import the R packages `oddpub` and `future` through rpy2 when this module is
# first imported; both handles are used by the functions below.
oddpub = importr("oddpub")
future = importr("future")
# Set VROOM_CONNECTION_SIZE in the R session's environment from the app config.
ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')


def _default_workers() -> int:
    """Return how many CPUs this process may use (always at least 1)."""
    try:
        # Honors the process's CPU affinity mask, but is only available on
        # some Unix platforms.
        return len(os.sched_getaffinity(0))
    except AttributeError:
        return os.cpu_count() or 1


def oddpub_pdf_conversion(
    pdf_dir: Path, text_dir: Path, workers: Optional[int] = None
):
    """Run oddpub's ``pdf_convert`` on ``pdf_dir``, writing into ``text_dir``.

    Args:
        pdf_dir: Directory passed to oddpub as the PDF input folder.
        text_dir: Directory passed to oddpub as the output folder.
        workers: Number of ``future`` multisession workers. ``None`` (the
            default) uses the number of CPUs available to this process,
            resolved at call time rather than at import time.
    """
    if workers is None:
        workers = _default_workers()
    future.plan(future.multisession, workers=workers)
    oddpub.pdf_convert(str(pdf_dir), str(text_dir))


def oddpub_metric_extraction(text_dir: Path, workers: Optional[int] = None):
    """Load converted texts with oddpub and run its open-data search.

    Args:
        text_dir: Directory holding the converted documents.
        workers: Number of ``future`` multisession workers. ``None`` (the
            default) uses the number of CPUs available to this process.

    Returns:
        The result of ``oddpub.open_data_search`` converted to a Python object
        with the pandas converter (presumably a DataFrame; the CLI calls
        ``to_csv`` on it).
    """
    if workers is None:
        workers = _default_workers()
    future.plan(future.multisession, workers=workers)
    # The trailing slash is kept from the original call to oddpub.pdf_load.
    pdf_sentences = oddpub.pdf_load(f"{text_dir}/")
    open_data_results = oddpub.open_data_search(pdf_sentences)
    with (ro.default_converter + pandas2ri.converter).context():
        metrics = ro.conversion.get_conversion().rpy2py(open_data_results)

    return metrics
17 changes: 12 additions & 5 deletions osm/converters.py → osm/sciencebeam.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,23 @@ def _is_host_ready(self, timeout=3) -> bool:
return True


def convert_pdf(file_path, output_file_path):
def convert_pdf(pdf_dir, text_dir):
    """Convert every PDF in a directory to XML, one output file per input.

    PDFs whose output file already exists are skipped, so an interrupted run
    can be resumed without re-converting finished documents.

    Args:
        pdf_dir (str | Path): Directory searched (non-recursively) for
            ``*.pdf`` files.
        text_dir (str | Path): Directory that receives ``<pdf stem>.xml``
            for each converted PDF.
    """
    # Accept plain strings as well as Path objects.
    pdf_dir = Path(pdf_dir)
    text_dir = Path(text_dir)

    converter = PDFConverter()
    for file_path in pdf_dir.glob("*.pdf"):
        output_file_path = text_dir / (file_path.stem + ".xml")
        if output_file_path.exists():
            logger.info(
                f"Skipping: {file_path}. Output file already exists: {output_file_path}"
            )
            continue
        xml_content = converter.convert(file_path)

        # Save the converted xml contents
        output_file_path.write_text(xml_content)
        logger.info(f"Converted: {file_path} to XML. Output file: {output_file_path}")
13 changes: 12 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,19 @@ def mocked_requests_post():


@pytest.fixture
def pdf_setup(tmp_path):
def sample_pdf(tmp_path):
    """Yield the example PDF path and an XML output path under ``tmp_path``."""
    yield (
        Path("docs/examples/pdf_inputs") / "test_sample.pdf",
        tmp_path / "test_output_file.xml",
    )


@pytest.fixture
def temp_pipeline_dir(tmp_path):
    """Yield a temporary PDF input directory and the output directory.

    The input directory ``tmp_path / "pdfs"`` contains a symlink to the
    example PDF; the output directory is ``tmp_path`` itself.
    """
    pdfs_folder = Path("docs/examples/pdf_inputs")
    sample = "test_sample.pdf"
    tmp_pdfs = tmp_path / "pdfs"
    tmp_pdfs.mkdir()
    # A relative symlink target is resolved against the link's own directory,
    # not the working directory, so the target must be made absolute first.
    (tmp_pdfs / sample).symlink_to((pdfs_folder / sample).resolve())
    outdir = tmp_path
    yield tmp_pdfs, outdir
22 changes: 12 additions & 10 deletions tests/test_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,32 @@
import requests
from click.testing import CliRunner

from osm.cli import rtransparent
from osm.converters import PDFConverter
from osm.cli import extract_metrics
from osm.sciencebeam import PDFConverter

from .utils import verify_xml_structure


def test_cli_rtransparent(
pdf_setup, monkeypatch, mocked_requests_post, mocked_socket, caplog
def test_cli_metrics(
    temp_pipeline_dir, monkeypatch, mocked_requests_post, mocked_socket, caplog
):
    """Run the CLI with ScienceBeam conversion against mocked network calls."""
    caplog.set_level(logging.INFO)
    pdfs_dir, outdir = temp_pipeline_dir
    with monkeypatch.context() as m:
        # Patch through the context object so both patches are undone on exit.
        m.setattr(requests, "post", mocked_requests_post)
        m.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
        result = CliRunner().invoke(
            extract_metrics, [str(pdfs_dir), str(outdir), "--use-sciencebeam"]
        )
    assert result.exit_code == 0
    # glob() returns a generator, so it must be materialized before len().
    # The ScienceBeam path writes "<stem>.xml" files into pdf_texts.
    assert len(list(outdir.glob("pdf_texts/*.xml"))) == 1
    assert "Converted: " in caplog.text
    mocked_requests_post.assert_called_once()
    mocked_socket.connect.assert_called_once()


def test_pdf_converter(caplog, pdf_setup):
sample, _ = pdf_setup
def test_pdf_converter(sample_pdf):
    """Convert the sample PDF and check the structure of the returned XML."""
    pdf_path, _unused_output = sample_pdf
    verify_xml_structure(PDFConverter().convert(pdf_path))

0 comments on commit a374648

Please sign in to comment.