From 3d39509b15cc3a0a3af053b5e8025596d2c08dac Mon Sep 17 00:00:00 2001
From: John lee
Date: Wed, 10 Jul 2024 13:14:13 +0000
Subject: [PATCH] add prototype

---
 .github/workflows/release.yml         |  46 +++++++++++
 Dockerfile                            |   5 +-
 README.md                             |   2 +-
 osm/cli.py                            | 109 +++++++++++++++++++++++---
 osm/config.py                         |   7 +-
 osm/oddpub.py                         |  47 +++++++++++
 osm/rtransparent.py                   |  40 ++++++++++
 osm/{converters.py => sciencebeam.py} |  10 ++-
 tests/conftest.py                     |  26 +++++-
 tests/test_file_processing.py         |  58 +++++++++++---
 10 files changed, 320 insertions(+), 30 deletions(-)
 create mode 100644 .github/workflows/release.yml
 create mode 100644 osm/oddpub.py
 create mode 100644 osm/rtransparent.py
 rename osm/{converters.py => sciencebeam.py} (85%)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 00000000..eb80c70a
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,46 @@
+name: "Build and upload Docker image for releases"
+
+on:
+  push:
+    tags: ["*"]
+  workflow_dispatch:
+
+jobs:
+  build_and_push_docker_image:
+    name: "Build Docker Image"
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@v4
+
+      - name: "Set up Docker Buildx"
+        uses: docker/setup-buildx-action@v3
+
+      - name: "Login to Docker Hub 🐳"
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: "Add Docker metadata"
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            timdanaos/app
+          tags: |
+            type=ref,event=tag
+            type=ref,event=branch
+            type=sha
+
+      - name: "Publish Docker image"
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+          push: true
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+        if: startsWith(github.ref, 'refs/tags/')
diff --git a/Dockerfile b/Dockerfile
index f381c9ff..6370feb4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,8 +25,9 @@ RUN R -e '\
 install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
 devtools::install_github("quest-bih/oddpub"); \
 devtools::install_github("cran/crminer"); \
-devtools::install_github("serghiou/metareadr"); \
-devtools::install_github("serghiou/rtransparent")'
+devtools::install_github("serghiou/metareadr")'
+COPY external /app/external
+RUN R -e 'devtools::install("external/rtransparent")'
 
 # Copy the project files and install the package
 COPY pyproject.toml /app
diff --git a/README.md b/README.md
index 1a585a4a..578e1dbe 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to
 ```
 docker-compose -f compose.yaml run \
   --rm \
-  -v $PWD:/mnt \
+  -v $PWD:/app \
   app \
   rtransparent \
   /mnt/docs/examples/pdf_inputs/test_sample.pdf \
diff --git a/osm/cli.py b/osm/cli.py
index 32fda5d0..fdbade10 100644
--- a/osm/cli.py
+++ b/osm/cli.py
@@ -1,17 +1,104 @@
-import click
+from pathlib import Path
+from typing import Optional, Tuple, Union
 
-from osm.converters import convert_pdf
+import fire
+
+from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion
+from osm.rtransparent import rtransparent_metric_extraction
+from osm.sciencebeam import sciencebeam_pdf_conversion
+
+
+def setup_dirs(
+    input_dir: Union[str, Path], output_dir: Union[str, Path]
+) -> Tuple[Path, Path]:
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+
+    if not input_dir.exists():
+        raise ValueError(f"The path {input_dir} does not exist.")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    return input_dir, output_dir
+
+
+class OSM:
+    def __init__(self):
+        self.pdf_converters = {
+            "sciencebeam": sciencebeam_pdf_conversion,
+            "oddpub": oddpub_pdf_conversion,
+        }
+        self.metric_extractors = {
+            "rtransparent": rtransparent_metric_extraction,
+            "oddpub": oddpub_metric_extraction,
+        }
+        self._pdf_dir = None
+        self._text_dir = None
+        self._outdir = None
+
+    def convert(
+        self,
+        *,
+        pdf_dir: Union[str, Path],
+        text_dir: Union[str, Path] = "./osm_output/pdf_texts",
+        converter: str = "sciencebeam",
+    ):
+        """
+        Convert PDFs to text using the specified converter.
+
+        Args:
+            pdf_dir: Directory containing PDF files.
+            text_dir: Directory to store extracted text. Defaults to "./osm_output/pdf_texts".
+            converter: PDF conversion method to use. Defaults to "sciencebeam".
+        """
+        self._pdf_dir, self._text_dir = setup_dirs(pdf_dir, text_dir)
+
+        if converter not in self.pdf_converters:
+            raise ValueError(f"Unknown converter: {converter}")
+
+        self.pdf_converters[converter](self._pdf_dir, self._text_dir)
+        return self
+
+    def extract(
+        self,
+        *,
+        text_dir: Optional[Union[str, Path]] = None,
+        outdir: Union[str, Path] = "./osm_output",
+        extractor: str = "rtransparent",
+    ):
+        """
+        Extract metrics from text using the specified extractor.
+
+        Args:
+            text_dir: Directory containing text files. If not provided, uses the last converted text directory.
+            outdir: Directory to output results. Defaults to "./osm_output".
+            extractor: Metric extraction method to use. Defaults to "rtransparent".
+        """
+        if text_dir is None:
+            if self._text_dir is None:
+                raise ValueError(
+                    "No text_dir provided and no previous conversion found."
+                )
+            text_dir = self._text_dir
+
+        self._text_dir, self._outdir = setup_dirs(text_dir, outdir)
+
+        if extractor not in self.metric_extractors:
+            raise ValueError(f"Unknown extractor: {extractor}")
+
+        metrics = self.metric_extractors[extractor](self._text_dir)
+        metrics.to_csv(self._outdir / "metrics.csv", index=False)
+        return self
 
 
-@click.group()
 def osm():
-    """Main command for OSM"""
-    pass
+    fire.Fire(
+        {
+            "convert": OSM().convert,
+            "extract": OSM().extract,
+        }
+    )
 
 
-@osm.command()
-@click.argument("file_path", type=click.Path(exists=True))
-@click.argument("output_file", type=str)
-def rtransparent(file_path, output_file):
-    """Processes a biomedical publication. Writes out processed document and associated metrics."""
-    convert_pdf(file_path, output_file)
+if __name__ == "__main__":
+    osm()
diff --git a/osm/config.py b/osm/config.py
index 969f460d..b602f7a8 100644
--- a/osm/config.py
+++ b/osm/config.py
@@ -13,11 +13,16 @@ class AppConfig:
     sb_protocol: str = "http"
 
     def __init__(
-        self, sb_port: int = None, sb_host: str = None, sb_protocol: str = None
+        self,
+        sb_port: int = None,
+        sb_host: str = None,
+        sb_protocol: str = None,
+        vroom_connection_size: int = 2**20,
     ):
         self.sb_port = int(sb_port) if sb_port is not None else self.sb_port
         self.sb_host = sb_host if sb_host is not None else self.sb_host
         self.sb_protocol = sb_protocol if sb_protocol is not None else self.sb_protocol
+        self.vroom_connection_size = vroom_connection_size
 
 
 load_dotenv()
diff --git a/osm/oddpub.py b/osm/oddpub.py
new file mode 100644
index 00000000..2934c6c2
--- /dev/null
+++ b/osm/oddpub.py
@@ -0,0 +1,47 @@
+"""
+Oddpub is being actively developed whereas rtransparent has stagnated.
+Oddpub implements parallelism and their interface does not easily allow working
+with objects in memory so we will use that to reduce IO overhead.
+
+The alternative would be to load the pdf file into memory (pdftools::pdf_data)
+and then pass that into oddpub private functions. This would make it easier to
+manage the parallelism, troubleshoot, and define the interface but partially
+reinvents the wheel.
+"""
+
+import logging
+from pathlib import Path
+
+import psutil
+import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.packages import importr
+
+from osm.config import osm_config
+
+logging.basicConfig(level=logging.DEBUG)
+
+# Adjust the logging level for rpy2
+rpy2_logger = logging.getLogger("rpy2")
+rpy2_logger.setLevel(logging.DEBUG)
+
+oddpub = importr("oddpub")
+future = importr("future")
+ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
+
+
+def oddpub_pdf_conversion(
+    pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count()
+):
+    future.plan(future.multisession, workers=workers)
+    oddpub.pdf_convert(str(pdf_dir), str(text_dir))
+
+
+def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
+    future.plan(future.multisession, workers=workers)
+    pdf_sentences = oddpub.pdf_load(f"{text_dir}/")
+    open_data_results = oddpub.open_data_search(pdf_sentences)
+    with (ro.default_converter + pandas2ri.converter).context():
+        metrics = ro.conversion.get_conversion().rpy2py(open_data_results)
+
+    return metrics
diff --git a/osm/rtransparent.py b/osm/rtransparent.py
new file mode 100644
index 00000000..7c1b9af6
--- /dev/null
+++ b/osm/rtransparent.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+import pandas as pd
+import psutil
+import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.packages import importr
+
+from osm.config import osm_config
+
+ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
+
+
+def rtransparent_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
+    """Run rtransparent's rt_data_code on every *.txt file in text_dir.
+
+    Args:
+        text_dir: Directory containing the text files to analyse.
+        workers: Number of workers handed to future's multisession plan.
+
+    Returns:
+        One DataFrame with the per-file results concatenated, or an empty
+        DataFrame when text_dir contains no *.txt files.
+    """
+    rtransparent = importr("rtransparent")
+    future = importr("future")
+    future.plan(future.multisession, workers=workers)
+    metrics = []
+    for file_path in sorted(text_dir.glob("*.txt")):
+        with (ro.default_converter + pandas2ri.converter).context():
+            metrics.append(
+                ro.conversion.get_conversion().rpy2py(
+                    # Pass a plain string, as osm/oddpub.py does for its paths.
+                    rtransparent.rt_data_code(str(file_path))
+                )
+            )
+
+    if not metrics:
+        return pd.DataFrame()
+    return pd.concat(metrics, ignore_index=True)
diff --git a/osm/converters.py b/osm/sciencebeam.py
similarity index 85%
rename from osm/converters.py
rename to osm/sciencebeam.py
index c5da3a89..1877026d 100644
--- a/osm/converters.py
+++ b/osm/sciencebeam.py
@@ -60,14 +60,20 @@ def _is_host_ready(self, timeout=3) -> bool:
         return True
 
 
-def convert_pdf(file_path, output_file_path):
-    """Converts a PDF file to XML and saves the output.
+def sciencebeam_pdf_conversion(file_path, text_dir):
+    """Converts a directory of PDFs to a directory of XML and saves the output.
 
     Args:
         file_path (str): Path to the input PDF file.
         output_file_path (str): Path to the output XML file.
     """
     converter = PDFConverter()
+    output_file_path = text_dir / (Path(file_path).stem + ".xml")
+    if output_file_path.exists():
+        logger.info(
+            f"Skipping: {file_path}. Output file already exists: {output_file_path}"
+        )
+        return
     xml_content = converter.convert(file_path)
 
     # Save the converted xml contents
diff --git a/tests/conftest.py b/tests/conftest.py
index 9a8f4159..5ade7542 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,30 @@
 import requests
 
 
+def pytest_addoption(parser):
+    parser.addoption(
+        "--rs",
+        "--run-slow",
+        action="store_true",
+        help="Run tests that take a long time >10s to complete",
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "run_slow: test takes >10s to complete")
+
+
+def pytest_collection_modifyitems(config, items):
+    if not config.getoption("--run-slow"):
+        for item in items:
+            try:
+                next(m for m in item.iter_markers() if m.name == "run_slow")
+            except StopIteration:
+                pass
+            else:
+                item.add_marker(pytest.mark.skip("run with --run-slow"))
+
+
 @pytest.fixture
 def mocked_socket():
     mock_sock_instance = MagicMock(spec=socket.socket)
@@ -37,7 +61,7 @@ def mocked_requests_post():
 
 
 @pytest.fixture
-def pdf_setup(tmp_path):
+def sample_pdf(tmp_path):
     pdfs_folder = Path("docs/examples/pdf_inputs")
     file_in = pdfs_folder / "test_sample.pdf"
     output = tmp_path / "test_output_file.xml"
diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py
index 91f61fbf..f9006f6c 100644
--- a/tests/test_file_processing.py
+++ b/tests/test_file_processing.py
@@ -1,33 +1,67 @@
 import logging
+import shutil
 import socket
+from pathlib import Path
 
+import pytest
 import requests
 from click.testing import CliRunner
 
-from osm.cli import rtransparent
-from osm.converters import PDFConverter
+from osm.cli import extract_metrics
+from osm.sciencebeam import PDFConverter
 
 from .utils import verify_xml_structure
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger("rpy2")
 
-def test_cli_rtransparent(
-    pdf_setup, monkeypatch, mocked_requests_post, mocked_socket, caplog
+
+def test_cli_metrics(
+    tmp_path, monkeypatch, mocked_requests_post, mocked_socket, caplog
 ):
-    caplog.set_level(logging.INFO)
-    sample, output = pdf_setup
+    # set up temp dir
+    tmp_pdfs = tmp_path / "pdfs"
+    tmp_pdfs.mkdir()
+    sample = shutil.copy2(
+        Path("docs/examples/pdf_inputs/test_sample.pdf"), tmp_pdfs / "test_sample.pdf"
+    )
+    # (tmp_pdfs / sample).symlink_to(pdfs_dir / sample)
+
     with monkeypatch.context() as m:
+        caplog.set_level(logging.INFO)
         m.setattr(requests, "post", mocked_requests_post)
         monkeypatch.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
-        result = CliRunner().invoke(rtransparent, [str(sample), str(output)])
-        assert result.exit_code == 0
-        assert output.exists()
-        assert f"Converted: {sample}" in caplog.text
+        result = CliRunner().invoke(extract_metrics, [str(tmp_pdfs), str(tmp_path)])
+        if result.exit_code != 0:
+            logger.error("Command failed with exit code %s", result.exit_code)
+            logger.error("Output:\n%s", result.output)
+            raise result.exception
+        assert list(tmp_path.rglob("*txt"))[0].name == sample.with_suffix(".txt").name
     mocked_requests_post.assert_called_once()
     mocked_socket.connect.assert_called_once()
 
 
-def test_pdf_converter(caplog, pdf_setup):
-    sample, _ = pdf_setup
+@pytest.mark.run_slow
+def test_cli_metrics_oddpub(tmp_path):
+    # set up temp dir
+    tmp_pdfs = tmp_path / "pdfs"
+    tmp_pdfs.mkdir()
+    sample = shutil.copy2(
+        Path("docs/examples/pdf_inputs/test_sample.pdf"), tmp_pdfs / "test_sample.pdf"
+    )
+
+    result = CliRunner().invoke(
+        extract_metrics, [str(tmp_pdfs), str(tmp_path), "--parse-with-oddpub"]
+    )
+    if result.exit_code != 0:
+        logger.error("Command failed with exit code %s", result.exit_code)
+        logger.error("Output:\n%s", result.output)
+        raise result.exception
+    assert list(tmp_path.rglob("*txt"))[0].name == sample.with_suffix(".txt").name
+
+
+def test_pdf_converter(sample_pdf):
+    sample, _ = sample_pdf
     response = PDFConverter().convert(sample)
     verify_xml_structure(response)
 