add prototype

nimh-dsst · Jul 10, 2024 · a374648 · a374648
1 parent 1f06cef
commit a374648
Show file tree

Hide file tree

Showing 9 changed files with 157 additions and 26 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,46 @@
+# Builds the Docker image and pushes it to Docker Hub when a tag is pushed.
+# The workflow can also be started manually (workflow_dispatch).
+name: "Build and upload Docker image for releases"
+
+on:
+  push:
+    tags: ["*"]
+  workflow_dispatch:
+
+jobs:
+  build_and_push_docker_image:
+    name: "Build Docker Image"
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@v4
+
+      - name: "Set up Docker Buildx"
+        uses: docker/setup-buildx-action@v3
+
+      - name: "Login to Docker Hub 🐳"
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: "Add Docker metadata"
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            timdanaos/app
+          tags: |
+            type=ref,event=tag
+            type=ref,event=branch
+            type=sha
+
+      - name: "Publish Docker image"
+        # Publish only when the run is for a tag ref. This workflow is triggered
+        # by `push` and `workflow_dispatch`, never by a `release` event, so a
+        # check on `github.event_name == 'release'` would always skip this step.
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+          push: true
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/Dockerfile b/Dockerfile
@@ -25,8 +25,9 @@ RUN R -e '\
 install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
 devtools::install_github("quest-bih/oddpub"); \
 devtools::install_github("cran/crminer"); \
-devtools::install_github("serghiou/metareadr"); \
-devtools::install_github("serghiou/rtransparent")'
+devtools::install_github("serghiou/metareadr")'
+COPY external /app/external
+RUN R -e 'devtools::install("external/rtransparent")'
 
 # Copy the project files and install the package
 COPY pyproject.toml /app

diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to
 ```
 docker-compose -f compose.yaml run \
   --rm \
-  -v $PWD:/mnt \
+  -v $PWD:/app \
   app \
   rtransparent \
   /mnt/docs/examples/pdf_inputs/test_sample.pdf \

diff --git a/osm/cli.py b/osm/cli.py
@@ -1,6 +1,9 @@
+from pathlib import Path
+
 import click
 
-from osm.converters import convert_pdf
+from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion
+from osm.sciencebeam import convert_pdf
 
 
 @click.group()
@@ -10,8 +13,22 @@ def osm():
 
 
 @osm.command()
-@click.argument("file_path", type=click.Path(exists=True))
-@click.argument("output_file", type=str)
-def rtransparent(file_path, output_file):
-    """Processes a biomedical publication. Writes out processed document and associated metrics."""
-    convert_pdf(file_path, output_file)
+@click.argument("pdf_dir", type=click.Path(exists=True))
+@click.argument("outdir", type=click.Path(exists=True))
+@click.option("--use-sciencebeam", is_flag=True, type=bool)
+def extract_metrics(pdf_dir, outdir, use_sciencebeam):
+    """Process biomedical publications. Write out processed documents and associated metrics.
+
+    PDF_DIR is the directory of input PDFs. OUTDIR receives the converted
+    documents (in a "pdf_texts" subdirectory) and "metrics.csv".
+    """
+    outdir = Path(outdir)
+    pdf_dir = Path(pdf_dir)
+    text_dir = outdir / "pdf_texts"
+    text_dir.mkdir(exist_ok=True, parents=True)
+
+    # Conversion step: sciencebeam when requested, otherwise oddpub's converter.
+    # Errors are left to propagate unchanged; wrapping these calls in a
+    # try/except that only re-raises would add nothing.
+    if use_sciencebeam:
+        convert_pdf(pdf_dir, text_dir)
+    else:
+        oddpub_pdf_conversion(pdf_dir, text_dir)
+
+    # Metric extraction always runs through oddpub on the converted documents.
+    metrics = oddpub_metric_extraction(text_dir)
+    metrics.to_csv(outdir / "metrics.csv", index=False)
diff --git a/osm/config.py b/osm/config.py
@@ -13,11 +13,16 @@ class AppConfig:
     sb_protocol: str = "http"
 
     def __init__(
-        self, sb_port: int = None, sb_host: str = None, sb_protocol: str = None
+        self,
+        sb_port: int = None,
+        sb_host: str = None,
+        sb_protocol: str = None,
+        vroom_connection_size: int = 2**20,
     ):
+        # Each sb_* argument replaces the current attribute value only when it
+        # is not None; otherwise the existing value is kept (for sb_protocol
+        # that is the class-level "http"). sb_port is passed through int().
         self.sb_port = int(sb_port) if sb_port is not None else self.sb_port
         self.sb_host = sb_host if sb_host is not None else self.sb_host
         self.sb_protocol = sb_protocol if sb_protocol is not None else self.sb_protocol
+        # Stored as given; osm/oddpub.py exports it to the R session as the
+        # VROOM_CONNECTION_SIZE environment variable.
+        self.vroom_connection_size = vroom_connection_size
 
 
 load_dotenv()

diff --git a/osm/oddpub.py b/osm/oddpub.py
@@ -0,0 +1,42 @@
+"""
+Oddpub is being actively developed, whereas rtransparent has stagnated.
+Oddpub implements parallelism, and its interface does not easily allow working
+with objects in memory, so we will use that to reduce IO overhead.
+
+The alternative would be to load the PDF file into memory (pdftools::pdf_data)
+and then pass that into oddpub's private functions. This would make it easier
+to manage the parallelism, troubleshoot, and define the interface, but it
+partially reinvents the wheel.
+"""
+
+import os
+from pathlib import Path
+
+import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.packages import importr
+
+from osm.config import osm_config
+
+# Load the R packages once, at import time. Importing this module therefore
+# needs an R installation in which oddpub and future are available.
+oddpub = importr("oddpub")
+future = importr("future")
+# Export the configured size to the R session's environment.
+# NOTE(review): presumably required because oddpub reads text through
+# vroom/readr, whose default connection buffer is too small for very long
+# lines -- confirm.
+ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
+
+
+def _default_workers() -> int:
+    """Return the number of CPUs available to this process (at least 1)."""
+    # os.sched_getaffinity only exists on some platforms (e.g. Linux). Using it
+    # in a default argument would raise AttributeError while importing this
+    # module elsewhere, so probe for it at call time and fall back to
+    # os.cpu_count().
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return os.cpu_count() or 1
+
+
+def oddpub_pdf_conversion(pdf_dir: Path, text_dir: Path, workers: int = None):
+    """Convert the PDFs in ``pdf_dir`` into ``text_dir`` with oddpub.
+
+    Args:
+        pdf_dir: Directory containing the input PDFs.
+        text_dir: Directory that receives oddpub's converted output.
+        workers: Number of R ``future`` multisession workers. ``None`` (the
+            default) uses the number of CPUs available to this process.
+    """
+    if workers is None:
+        workers = _default_workers()
+    future.plan(future.multisession, workers=workers)
+    oddpub.pdf_convert(str(pdf_dir), str(text_dir))
+
+
+def oddpub_metric_extraction(text_dir: Path, workers: int = None):
+    """Run oddpub's open-data search over the converted documents.
+
+    Args:
+        text_dir: Directory of converted documents, as filled by the
+            conversion step.
+        workers: Number of R ``future`` multisession workers. ``None`` (the
+            default) uses the number of CPUs available to this process.
+
+    Returns:
+        The result of ``oddpub::open_data_search`` converted to a Python
+        object through rpy2's pandas converter.
+    """
+    if workers is None:
+        workers = _default_workers()
+    future.plan(future.multisession, workers=workers)
+    # oddpub is handed the directory path with a trailing slash.
+    pdf_sentences = oddpub.pdf_load(f"{text_dir}/")
+    open_data_results = oddpub.open_data_search(pdf_sentences)
+    with (ro.default_converter + pandas2ri.converter).context():
+        metrics = ro.conversion.get_conversion().rpy2py(open_data_results)
+
+    return metrics
diff --git a/osm/converters.py → osm/sciencebeam.py b/osm/converters.py → osm/sciencebeam.py
@@ -60,16 +60,23 @@ def _is_host_ready(self, timeout=3) -> bool:
             return True
 
 
-def convert_pdf(file_path, output_file_path):
-    """Converts a PDF file to XML and saves the output.
+def convert_pdf(pdf_dir, text_dir):
+    """Convert every PDF in a directory to XML, one output file per input.
+
+    Inputs whose XML output already exists are skipped rather than reconverted.
 
     Args:
-        file_path (str): Path to the input PDF file.
-        output_file_path (str): Path to the output XML file.
+        pdf_dir (str | Path): Directory searched (non-recursively) for ``*.pdf``.
+        text_dir (str | Path): Existing directory that receives ``<stem>.xml``.
     """
+    # Accept plain strings as well as Path objects: the body relies on
+    # Path.glob and the ``/`` operator.
+    pdf_dir = Path(pdf_dir)
+    text_dir = Path(text_dir)
     converter = PDFConverter()
-    xml_content = converter.convert(file_path)
+    for file_path in pdf_dir.glob("*.pdf"):
+        output_file_path = text_dir / f"{file_path.stem}.xml"
+        if output_file_path.exists():
+            logger.info(
+                f"Skipping: {file_path}. Output file already exists: {output_file_path}"
+            )
+            continue
+        xml_content = converter.convert(file_path)
 
-    # Save the converted xml contents
-    Path(output_file_path).write_text(xml_content)
-    logger.info(f"Converted: {file_path} to XML. Output file: {output_file_path}")
+        # Save the converted xml contents
+        output_file_path.write_text(xml_content)
+        logger.info(f"Converted: {file_path} to XML. Output file: {output_file_path}")
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -37,8 +37,19 @@ def mocked_requests_post():
 
 
 @pytest.fixture
-def pdf_setup(tmp_path):
+def sample_pdf(tmp_path):
+    # Yields (path to the bundled sample PDF, XML output path under tmp_path).
+    # The output file is not created here. The sample path is relative, so
+    # tests presumably run with the repository root as working directory.
     pdfs_folder = Path("docs/examples/pdf_inputs")
     file_in = pdfs_folder / "test_sample.pdf"
     output = tmp_path / "test_output_file.xml"
     yield file_in, output
+
+
+@pytest.fixture
+def temp_pipeline_dir(tmp_path):
+    """Yield ``(pdf_dir, outdir)`` for a pipeline run on the sample PDF.
+
+    ``pdf_dir`` is ``tmp_path / "pdfs"`` and contains a symlink to the bundled
+    sample PDF; ``outdir`` is ``tmp_path`` itself.
+    """
+    pdfs_folder = Path("docs/examples/pdf_inputs")
+    sample = "test_sample.pdf"
+    tmp_pdfs = tmp_path / "pdfs"
+    tmp_pdfs.mkdir()
+    # A relative symlink target is resolved against the directory holding the
+    # link, not the working directory, so link to the absolute path; otherwise
+    # the link would dangle inside tmp_path.
+    (tmp_pdfs / sample).symlink_to((pdfs_folder / sample).resolve())
+    outdir = tmp_path
+    yield tmp_pdfs, outdir
diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py
@@ -4,30 +4,32 @@
 import requests
 from click.testing import CliRunner
 
-from osm.cli import rtransparent
-from osm.converters import PDFConverter
+from osm.cli import extract_metrics
+from osm.sciencebeam import PDFConverter
 
 from .utils import verify_xml_structure
 
 
-def test_cli_rtransparent(
-    pdf_setup, monkeypatch, mocked_requests_post, mocked_socket, caplog
+def test_cli_metrics(
+    temp_pipeline_dir, monkeypatch, mocked_requests_post, mocked_socket, caplog
 ):
+    """Run ``extract_metrics --use-sciencebeam`` on the sample PDF directory."""
     caplog.set_level(logging.INFO)
-    sample, output = pdf_setup
+    pdfs_dir, outdir = temp_pipeline_dir
     with monkeypatch.context() as m:
         m.setattr(requests, "post", mocked_requests_post)
-        monkeypatch.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
-        result = CliRunner().invoke(rtransparent, [str(sample), str(output)])
+        # Patch through the context handle so the socket mock is undone when
+        # the ``with`` block exits, like the requests mock above.
+        m.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
+        result = CliRunner().invoke(
+            extract_metrics, [str(pdfs_dir), str(outdir), "--use-sciencebeam"]
+        )
         assert result.exit_code == 0
-        assert output.exists()
-        assert f"Converted: {sample}" in caplog.text
+        # Path.glob returns a generator, so materialise it before len(). The
+        # sciencebeam conversion writes one ``<stem>.xml`` per input PDF.
+        assert len(list(outdir.glob("pdf_texts/*.xml"))) == 1
+        assert f"Converted: {pdfs_dir / 'test_sample.pdf'}" in caplog.text
         mocked_requests_post.assert_called_once()
         mocked_socket.connect.assert_called_once()
 
 
-def test_pdf_converter(caplog, pdf_setup):
-    sample, _ = pdf_setup
+def test_pdf_converter(sample_pdf):
+    sample, _ = sample_pdf
 
     response = PDFConverter().convert(sample)
     verify_xml_structure(response)