Skip to content

Commit

Permalink
add prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
leej3 committed Jul 11, 2024
1 parent 1f06cef commit 3d39509
Show file tree
Hide file tree
Showing 10 changed files with 306 additions and 30 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Builds the project's Docker image and pushes it to Docker Hub.
# Runs when a tag is pushed, or when started manually from the Actions tab.
name: "Build and upload Docker image for releases"

on:
  push:
    tags: ["*"]
  workflow_dispatch:

jobs:
  build_and_push_docker_image:
    name: "Build Docker Image"
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@v4

      - name: "Set up Docker Buildx"
        uses: docker/setup-buildx-action@v3

      - name: "Login to Docker Hub 🐳"
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: "Add Docker metadata"
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            timdanaos/app
          tags: |
            type=ref,event=tag
            type=ref,event=branch
            type=sha

      - name: "Publish Docker image"
        # Only publish when the run is attached to a tag ref. The previous
        # condition also required `github.event_name == 'release'`, but this
        # workflow has no `release` trigger (only `push` and
        # `workflow_dispatch`), so the step could never run.
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v5
        with:
          context: .
          tags: |
            ${{ steps.meta.outputs.tags }}
          push: true
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ RUN R -e '\
install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
devtools::install_github("quest-bih/oddpub"); \
devtools::install_github("cran/crminer"); \
devtools::install_github("serghiou/metareadr"); \
devtools::install_github("serghiou/rtransparent")'
devtools::install_github("serghiou/metareadr")'
COPY external /app/external
RUN R -e 'devtools::install("external/rtransparent")'

# Copy the project files and install the package
COPY pyproject.toml /app
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to
```
docker-compose -f compose.yaml run \
--rm \
-v $PWD:/mnt \
-v $PWD:/app \
app \
rtransparent \
/mnt/docs/examples/pdf_inputs/test_sample.pdf \
Expand Down
109 changes: 98 additions & 11 deletions osm/cli.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,104 @@
import click
from pathlib import Path
from typing import Tuple, Union

from osm.converters import convert_pdf
import fire

from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion
from osm.rtransparent import rtransparent_metric_extraction
from osm.sciencebeam import sciencebeam_pdf_conversion


def setup_dirs(
    input_dir: Union[str, Path], output_dir: Union[str, Path]
) -> Tuple[Path, Path]:
    """Normalise an input/output directory pair to ``Path`` objects.

    The input path must already exist; the output directory (and any missing
    parents) is created if needed.

    Args:
        input_dir: Existing location to read from.
        output_dir: Location to write to; created when absent.

    Returns:
        ``(input_dir, output_dir)`` as ``Path`` objects.

    Raises:
        ValueError: If ``input_dir`` does not exist.
    """
    source, destination = Path(input_dir), Path(output_dir)

    if source.exists():
        destination.mkdir(parents=True, exist_ok=True)
        return source, destination

    raise ValueError(f"The path {source} does not exist.")


class OSM:
    """Entry points for the two OSM pipeline stages: conversion and extraction.

    Each stage looks up its implementation by name in a registry dict, so new
    converters/extractors are added by extending ``pdf_converters`` or
    ``metric_extractors``. Both stage methods return ``self`` and remember the
    directories they used, so ``extract`` can pick up where ``convert`` left
    off on the same instance.
    """

    def __init__(self):
        # Name -> callable(pdf_dir, text_dir)
        self.pdf_converters = {
            "sciencebeam": sciencebeam_pdf_conversion,
            "oddpub": oddpub_pdf_conversion,
        }
        # Name -> callable(text_dir) returning an object with ``to_csv``
        self.metric_extractors = {
            "rtransparent": rtransparent_metric_extraction,
            "oddpub": oddpub_metric_extraction,
        }
        # Directories used by the most recent convert/extract call.
        self._pdf_dir = None
        self._text_dir = None
        self._outdir = None

    def convert(
        self,
        *,
        pdf_dir: Union[str, Path],
        text_dir: Union[str, Path] = "./osm_output/pdf_texts",
        converter: str = "sciencebeam",
    ):
        """
        Convert PDFs to text using the specified converter.
        Args:
            pdf_dir: Directory containing PDF files.
            text_dir: Directory to store extracted text. Defaults to "./osm_output/pdf_texts".
            converter: PDF conversion method to use. Defaults to "sciencebeam".
        Returns:
            This instance, with the PDF and text directories recorded.
        Raises:
            ValueError: If ``converter`` is unknown or ``pdf_dir`` does not exist.
        """
        # Validate the name first so a typo fails before any directory is
        # created or any instance state is overwritten.
        if converter not in self.pdf_converters:
            raise ValueError(f"Unknown converter: {converter}")

        self._pdf_dir, self._text_dir = setup_dirs(pdf_dir, text_dir)

        self.pdf_converters[converter](self._pdf_dir, self._text_dir)
        return self

    def extract(
        self,
        *,
        text_dir: Union[str, Path, None] = None,
        outdir: Union[str, Path] = "./osm_output",
        extractor: str = "rtransparent",
    ):
        """
        Extract metrics from text using the specified extractor.
        Args:
            text_dir: Directory containing text files. If not provided, uses the last converted text directory.
            outdir: Directory to output results. Defaults to "./osm_output".
            extractor: Metric extraction method to use. Defaults to "rtransparent".
        Returns:
            This instance, after writing ``metrics.csv`` into ``outdir``.
        Raises:
            ValueError: If ``extractor`` is unknown, if no text directory is
                available, or if the text directory does not exist.
        """
        # Validate the name first so a typo fails before ``outdir`` is created.
        if extractor not in self.metric_extractors:
            raise ValueError(f"Unknown extractor: {extractor}")

        if text_dir is None:
            if self._text_dir is None:
                raise ValueError(
                    "No text_dir provided and no previous conversion found."
                )
            text_dir = self._text_dir

        self._text_dir, self._outdir = setup_dirs(text_dir, outdir)

        metrics = self.metric_extractors[extractor](self._text_dir)
        metrics.to_csv(self._outdir / "metrics.csv", index=False)
        return self


@click.group()
def osm():
"""Main command for OSM"""
pass
fire.Fire(
{
"convert": OSM().convert,
"extract": OSM().extract,
}
)


@osm.command()
@click.argument("file_path", type=click.Path(exists=True))
@click.argument("output_file", type=str)
def rtransparent(file_path, output_file):
"""Processes a biomedical publication. Writes out processed document and associated metrics."""
convert_pdf(file_path, output_file)
if __name__ == "__main__":
osm()
7 changes: 6 additions & 1 deletion osm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@ class AppConfig:
sb_protocol: str = "http"

def __init__(
self, sb_port: int = None, sb_host: str = None, sb_protocol: str = None
self,
sb_port: int = None,
sb_host: str = None,
sb_protocol: str = None,
vroom_connection_size: int = 2**20,
):
self.sb_port = int(sb_port) if sb_port is not None else self.sb_port
self.sb_host = sb_host if sb_host is not None else self.sb_host
self.sb_protocol = sb_protocol if sb_protocol is not None else self.sb_protocol
self.vroom_connection_size = vroom_connection_size


load_dotenv()
Expand Down
47 changes: 47 additions & 0 deletions osm/oddpub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Oddpub is being actively developed whereas rtransparent has stagnated.
Oddpub implements parallelism, and its interface does not easily allow working
with objects in memory, so we will use that parallelism to reduce IO overhead.
The alternative would be to load the pdf file into memory (pdftools::pdf_data
and then pass that into oddpub private functions). This would make it easier to
manage the parallelism, troubleshoot, and define the interface but partially
reinvents the wheel.
"""

import logging
from pathlib import Path

import psutil
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

from osm.config import osm_config

logging.basicConfig(level=logging.DEBUG)

# Adjust the logging level for rpy2
rpy2_logger = logging.getLogger("rpy2")
rpy2_logger.setLevel(logging.DEBUG)

oddpub = importr("oddpub")
future = importr("future")
ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')


def oddpub_pdf_conversion(
    pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count()
):
    """Run oddpub's ``pdf_convert`` on ``pdf_dir``, writing into ``text_dir``.

    Args:
        pdf_dir: Folder handed to oddpub as the PDF input folder.
        text_dir: Folder handed to oddpub as the text output folder.
        workers: Number of ``future`` multisession workers to plan for.
            The default is evaluated once, when the module is imported.
    """
    future.plan(future.multisession, workers=workers)
    # Pass folders with a trailing separator, matching the ``pdf_load`` call in
    # ``oddpub_metric_extraction``. NOTE(review): oddpub appears to build file
    # paths by concatenating folder + file name, so a missing "/" would yield
    # wrong paths — confirm against the installed oddpub version.
    oddpub.pdf_convert(f"{pdf_dir}/", f"{text_dir}/")


def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
    """Load converted text with oddpub and run its open-data search.

    Args:
        text_dir: Folder of converted text; passed to ``oddpub.pdf_load`` with
            a trailing "/" appended.
        workers: Number of ``future`` multisession workers to plan for.

    Returns:
        The result of ``oddpub.open_data_search`` converted from R to Python
        through the rpy2 default + pandas converters.
    """
    future.plan(future.multisession, workers=workers)

    sentences = oddpub.pdf_load(f"{text_dir}/")
    search_hits = oddpub.open_data_search(sentences)

    # Conversion rules must be active while rpy2py runs.
    conversion_rules = ro.default_converter + pandas2ri.converter
    with conversion_rules.context():
        return ro.conversion.get_conversion().rpy2py(search_hits)
26 changes: 26 additions & 0 deletions osm/rtransparent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pathlib import Path

import pandas as pd
import psutil
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

from osm.config import osm_config

ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')


def rtransparent_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
    """Run rtransparent's ``rt_data_code`` on every ``*.txt`` file in a folder.

    Args:
        text_dir: Folder whose ``*.txt`` files are processed (non-recursive).
        workers: Number of ``future`` multisession workers to plan for.
            The default is evaluated once, when the module is imported.

    Returns:
        A single ``pandas.DataFrame`` with the per-file results concatenated
        (index reset). An empty ``DataFrame`` when no ``*.txt`` file is found.
    """
    rtransparent = importr("rtransparent")
    future = importr("future")
    future.plan(future.multisession, workers=workers)

    per_file_results = []
    # Sorted so the row order of the output does not depend on directory order.
    for file_path in sorted(Path(text_dir).glob("*.txt")):
        with (ro.default_converter + pandas2ri.converter).context():
            # R expects a character path, so hand it a str, not a Path object.
            per_file_results.append(
                ro.conversion.get_conversion().rpy2py(
                    rtransparent.rt_data_code(str(file_path))
                )
            )

    # pd.concat raises ValueError on an empty list; return an empty frame so
    # callers can still write a (header-less) CSV.
    if not per_file_results:
        return pd.DataFrame()
    return pd.concat(per_file_results, ignore_index=True)
10 changes: 8 additions & 2 deletions osm/converters.py → osm/sciencebeam.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,20 @@ def _is_host_ready(self, timeout=3) -> bool:
return True


def convert_pdf(file_path, output_file_path):
"""Converts a PDF file to XML and saves the output.
def sciencebeam_pdf_conversion(file_path, text_dir):
"""Converts a directory of PDFs to a directory of XML and saves the output.
Args:
file_path (str): Path to the input PDF file.
output_file_path (str): Path to the output XML file.
"""
converter = PDFConverter()
output_file_path = text_dir / (Path(file_path).stem + ".xml")
if output_file_path.exists():
logger.info(
f"Skipping: {file_path}. Output file already exists: {output_file_path}"
)
return
xml_content = converter.convert(file_path)

# Save the converted xml contents
Expand Down
26 changes: 25 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,30 @@
import requests


def pytest_addoption(parser):
    """Register the ``--rs`` / ``--run-slow`` flag on the pytest command line."""
    flag_names = ("--rs", "--run-slow")
    flag_settings = {
        "action": "store_true",
        "help": "Run tests that take a long time >10s to complete",
    }
    parser.addoption(*flag_names, **flag_settings)


def pytest_configure(config):
    """Declare the ``run_slow`` marker in the pytest ``markers`` ini section."""
    marker_spec = "run_slow: test takes >10s to complete"
    config.addinivalue_line("markers", marker_spec)


def pytest_collection_modifyitems(config, items):
    """Skip tests marked ``run_slow`` unless ``--run-slow`` was given.

    Args:
        config: pytest config object; only ``getoption("--run-slow")`` is read.
        items: Collected test items; slow ones get a ``skip`` marker added.
    """
    if config.getoption("--run-slow"):
        return

    for item in items:
        is_slow = any(mark.name == "run_slow" for mark in item.iter_markers())
        if is_slow:
            item.add_marker(pytest.mark.skip("run with --run-slow"))


@pytest.fixture
def mocked_socket():
mock_sock_instance = MagicMock(spec=socket.socket)
Expand Down Expand Up @@ -37,7 +61,7 @@ def mocked_requests_post():


@pytest.fixture
def pdf_setup(tmp_path):
def sample_pdf(tmp_path):
pdfs_folder = Path("docs/examples/pdf_inputs")
file_in = pdfs_folder / "test_sample.pdf"
output = tmp_path / "test_output_file.xml"
Expand Down
Loading

0 comments on commit 3d39509

Please sign in to comment.