From 3d39509b15cc3a0a3af053b5e8025596d2c08dac Mon Sep 17 00:00:00 2001
From: John lee <johnleenimh@gmail.com>
Date: Wed, 10 Jul 2024 13:14:13 +0000
Subject: [PATCH] add prototype

---
 .github/workflows/release.yml         |  46 +++++++++++
 Dockerfile                            |   5 +-
 README.md                             |   2 +-
 osm/cli.py                            | 109 +++++++++++++++++++++++---
 osm/config.py                         |   7 +-
 osm/oddpub.py                         |  47 +++++++++++
 osm/rtransparent.py                   |  26 ++++++
 osm/{converters.py => sciencebeam.py} |  10 ++-
 tests/conftest.py                     |  26 +++++-
 tests/test_file_processing.py         |  58 +++++++++++---
 10 files changed, 306 insertions(+), 30 deletions(-)
 create mode 100644 .github/workflows/release.yml
 create mode 100644 osm/oddpub.py
 create mode 100644 osm/rtransparent.py
 rename osm/{converters.py => sciencebeam.py} (85%)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 00000000..eb80c70a
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,46 @@
+name: "Build and upload Docker image for releases"
+
+on:
+  push:
+    tags: ["*"]
+  workflow_dispatch:
+
+jobs:
+  build_and_push_docker_image:
+    name: "Build Docker Image"
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@v4
+
+      - name: "Set up Docker Buildx"
+        uses: docker/setup-buildx-action@v3
+
+      - name: "Login to Docker Hub 🐳"
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: "Add Docker metadata"
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            timdanaos/app
+          tags: |
+            type=ref,event=tag
+            type=ref,event=branch
+            type=sha
+
+      - name: "Publish Docker image"
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+          push: true
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+        if: startsWith(github.ref, 'refs/tags/') # triggers are push/workflow_dispatch, never 'release'
diff --git a/Dockerfile b/Dockerfile
index f381c9ff..6370feb4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,8 +25,9 @@ RUN R -e '\
 install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
 devtools::install_github("quest-bih/oddpub"); \
 devtools::install_github("cran/crminer"); \
-devtools::install_github("serghiou/metareadr"); \
-devtools::install_github("serghiou/rtransparent")'
+devtools::install_github("serghiou/metareadr")'
+COPY external /app/external
+RUN R -e 'devtools::install("external/rtransparent")'
 
 # Copy the project files and install the package
 COPY pyproject.toml /app
diff --git a/README.md b/README.md
index 1a585a4a..578e1dbe 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to
 ```
 docker-compose -f compose.yaml run \
   --rm \
-  -v $PWD:/mnt \
+  -v $PWD:/app \
   app \
   rtransparent \
   /mnt/docs/examples/pdf_inputs/test_sample.pdf \
diff --git a/osm/cli.py b/osm/cli.py
index 32fda5d0..fdbade10 100644
--- a/osm/cli.py
+++ b/osm/cli.py
@@ -1,17 +1,104 @@
-import click
+from pathlib import Path
+from typing import Tuple, Union
 
-from osm.converters import convert_pdf
+import fire
+
+from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion
+from osm.rtransparent import rtransparent_metric_extraction
+from osm.sciencebeam import sciencebeam_pdf_conversion
+
+
+def setup_dirs(
+    input_dir: Union[str, Path], output_dir: Union[str, Path]
+) -> Tuple[Path, Path]:
+    """Return both arguments as Paths, creating output_dir if needed.
+
+    Raises ValueError when input_dir does not exist.
+    """
+    source, target = Path(input_dir), Path(output_dir)
+    if not source.exists():
+        raise ValueError(f"The path {source} does not exist.")
+    target.mkdir(parents=True, exist_ok=True)
+    return source, target
+
+
+class OSM:
+    """CLI facade: converts PDFs to text and extracts metrics from that text."""
+
+    def __init__(self):
+        self.pdf_converters = {
+            "sciencebeam": sciencebeam_pdf_conversion,
+            "oddpub": oddpub_pdf_conversion,
+        }
+        self.metric_extractors = {
+            "rtransparent": rtransparent_metric_extraction,
+            "oddpub": oddpub_metric_extraction,
+        }
+        self._pdf_dir = None
+        self._text_dir = None
+        self._outdir = None
+
+    def convert(
+        self,
+        *,
+        pdf_dir: Union[str, Path],
+        text_dir: Union[str, Path] = "./osm_output/pdf_texts",
+        converter: str = "sciencebeam",
+    ):
+        """
+        Convert PDFs to text using the specified converter.
+
+        Args:
+            pdf_dir: Directory containing PDF files.
+            text_dir: Directory to store extracted text. Defaults to "./osm_output/pdf_texts".
+            converter: PDF conversion method to use. Defaults to "sciencebeam".
+        """
+        # Validate the name first so a typo does not create text_dir as a side effect.
+        if converter not in self.pdf_converters:
+            raise ValueError(f"Unknown converter: {converter}")
+        self._pdf_dir, self._text_dir = setup_dirs(pdf_dir, text_dir)
+        self.pdf_converters[converter](self._pdf_dir, self._text_dir)
+        return self
+
+    def extract(
+        self,
+        *,
+        text_dir: Union[str, Path, None] = None,
+        outdir: Union[str, Path] = "./osm_output",
+        extractor: str = "rtransparent",
+    ):
+        """
+        Extract metrics from text and write them to "<outdir>/metrics.csv".
+
+        Args:
+            text_dir: Directory containing text files. If not provided, uses the last converted text directory.
+            outdir: Directory to output results. Defaults to "./osm_output".
+            extractor: Metric extraction method to use. Defaults to "rtransparent".
+        """
+        # Validate the name first so a typo does not create outdir as a side effect.
+        if extractor not in self.metric_extractors:
+            raise ValueError(f"Unknown extractor: {extractor}")
+        if text_dir is None:
+            if self._text_dir is None:
+                raise ValueError(
+                    "No text_dir provided and no previous conversion found."
+                )
+            text_dir = self._text_dir
+
+        self._text_dir, self._outdir = setup_dirs(text_dir, outdir)
+        metrics = self.metric_extractors[extractor](self._text_dir)
+        metrics.to_csv(self._outdir / "metrics.csv", index=False)
+        return self
 
 
-@click.group()
 def osm():
-    """Main command for OSM"""
-    pass
+    fire.Fire(
+        {
+            "convert": OSM().convert,
+            "extract": OSM().extract,
+        }
+    )
 
 
-@osm.command()
-@click.argument("file_path", type=click.Path(exists=True))
-@click.argument("output_file", type=str)
-def rtransparent(file_path, output_file):
-    """Processes a biomedical publication. Writes out processed document and associated metrics."""
-    convert_pdf(file_path, output_file)
+if __name__ == "__main__":
+    osm()
diff --git a/osm/config.py b/osm/config.py
index 969f460d..b602f7a8 100644
--- a/osm/config.py
+++ b/osm/config.py
@@ -13,11 +13,16 @@ class AppConfig:
     sb_protocol: str = "http"
 
     def __init__(
-        self, sb_port: int = None, sb_host: str = None, sb_protocol: str = None
+        self,
+        sb_port: int = None,
+        sb_host: str = None,
+        sb_protocol: str = None,
+        vroom_connection_size: int = 2**20,
     ):
         self.sb_port = int(sb_port) if sb_port is not None else self.sb_port
         self.sb_host = sb_host if sb_host is not None else self.sb_host
         self.sb_protocol = sb_protocol if sb_protocol is not None else self.sb_protocol
+        self.vroom_connection_size = vroom_connection_size
 
 
 load_dotenv()
diff --git a/osm/oddpub.py b/osm/oddpub.py
new file mode 100644
index 00000000..2934c6c2
--- /dev/null
+++ b/osm/oddpub.py
@@ -0,0 +1,47 @@
+"""
+Oddpub is being actively developed whereas rtransparent has stagnated.
+Oddpub implements parallelism and their interface does not easily allow working
+with objects in memory so we will use that to reduce IO overhead.
+
+The alternative would be to load the pdf file into memory (pdftools::pdf_data)
+and then pass that into oddpub private functions. This would make it easier to
+manage the parallelism, troubleshoot, and define the interface but partially
+reinvents the wheel.
+"""
+
+import logging
+from pathlib import Path
+
+import psutil
+import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.packages import importr
+
+from osm.config import osm_config
+
+logging.basicConfig(level=logging.DEBUG)
+
+# Adjust the logging level for rpy2
+rpy2_logger = logging.getLogger("rpy2")
+rpy2_logger.setLevel(logging.DEBUG)
+
+oddpub = importr("oddpub")
+future = importr("future")
+ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
+
+
+def oddpub_pdf_conversion(
+    pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count()
+) -> None:  # NOTE: the workers default is evaluated once, when this module is imported
+    future.plan(future.multisession, workers=workers)  # select the multisession plan with `workers` workers
+    oddpub.pdf_convert(str(pdf_dir), str(text_dir))  # both directories are handed to R as plain strings
+
+
+def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
+    """Load text_dir with oddpub.pdf_load, run open_data_search, return the converted result."""
+    future.plan(future.multisession, workers=workers)
+    sentences = oddpub.pdf_load(f"{text_dir}/")
+    search_hits = oddpub.open_data_search(sentences)
+    to_pandas = ro.default_converter + pandas2ri.converter
+    with to_pandas.context():
+        return ro.conversion.get_conversion().rpy2py(search_hits)
diff --git a/osm/rtransparent.py b/osm/rtransparent.py
new file mode 100644
index 00000000..7c1b9af6
--- /dev/null
+++ b/osm/rtransparent.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+import pandas as pd
+import psutil
+import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.packages import importr
+
+from osm.config import osm_config
+
+ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
+
+
+def rtransparent_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
+    """Run rtransparent's rt_data_code on each *.txt in text_dir and concat the results."""
+    rtransparent = importr("rtransparent")
+    future = importr("future")
+    future.plan(future.multisession, workers=workers)
+    metrics = []
+    for file_path in text_dir.glob("*.txt"):
+        with (ro.default_converter + pandas2ri.converter).context():
+            # str(): pass R a plain path string instead of a pathlib.Path object.
+            result = rtransparent.rt_data_code(str(file_path))
+            metrics.append(ro.conversion.get_conversion().rpy2py(result))
+    # NOTE: pd.concat raises ValueError when text_dir contains no *.txt files.
+    return pd.concat(metrics, ignore_index=True)
diff --git a/osm/converters.py b/osm/sciencebeam.py
similarity index 85%
rename from osm/converters.py
rename to osm/sciencebeam.py
index c5da3a89..1877026d 100644
--- a/osm/converters.py
+++ b/osm/sciencebeam.py
@@ -60,14 +60,20 @@ def _is_host_ready(self, timeout=3) -> bool:
             return True
 
 
-def convert_pdf(file_path, output_file_path):
-    """Converts a PDF file to XML and saves the output.
+def sciencebeam_pdf_conversion(file_path, text_dir):
+    """Converts a directory of PDFs to a directory of XML and saves the output.
 
     Args:
         file_path (str): Path to the input PDF file.
         output_file_path (str): Path to the output XML file.
     """
     converter = PDFConverter()
+    output_file_path = text_dir / (Path(file_path).stem + ".xml")
+    if output_file_path.exists():
+        logger.info(
+            f"Skipping: {file_path}. Output file already exists: {output_file_path}"
+        )
+        return
     xml_content = converter.convert(file_path)
 
     # Save the converted xml contents
diff --git a/tests/conftest.py b/tests/conftest.py
index 9a8f4159..5ade7542 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,30 @@
 import requests
 
 
+def pytest_addoption(parser):
+    parser.addoption(  # register the opt-in flag; "--rs" and "--run-slow" are two spellings of it
+        "--rs",
+        "--run-slow",
+        action="store_true",
+        help="Run tests that take a long time >10s to complete",
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "run_slow: test takes >10s to complete")  # declare the run_slow marker
+
+
+def pytest_collection_modifyitems(config, items):
+    """Mark tests carrying ``run_slow`` as skipped unless --run-slow was passed."""
+    if config.getoption("--run-slow"):
+        # Opt-in given: leave the collected items untouched.
+        return
+    for test_item in items:
+        slow = any(mark.name == "run_slow" for mark in test_item.iter_markers())
+        if slow:
+            test_item.add_marker(pytest.mark.skip("run with --run-slow"))
+
+
 @pytest.fixture
 def mocked_socket():
     mock_sock_instance = MagicMock(spec=socket.socket)
@@ -37,7 +61,7 @@ def mocked_requests_post():
 
 
 @pytest.fixture
-def pdf_setup(tmp_path):
+def sample_pdf(tmp_path):
     pdfs_folder = Path("docs/examples/pdf_inputs")
     file_in = pdfs_folder / "test_sample.pdf"
     output = tmp_path / "test_output_file.xml"
diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py
index 91f61fbf..f9006f6c 100644
--- a/tests/test_file_processing.py
+++ b/tests/test_file_processing.py
@@ -1,33 +1,67 @@
 import logging
+import shutil
 import socket
+from pathlib import Path
 
+import pytest
 import requests
 from click.testing import CliRunner
 
-from osm.cli import rtransparent
-from osm.converters import PDFConverter
+from osm.cli import extract_metrics
+from osm.sciencebeam import PDFConverter
 
 from .utils import verify_xml_structure
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger("rpy2")
 
-def test_cli_rtransparent(
-    pdf_setup, monkeypatch, mocked_requests_post, mocked_socket, caplog
+
+def test_cli_metrics(
+    tmp_path, monkeypatch, mocked_requests_post, mocked_socket, caplog
 ):
-    caplog.set_level(logging.INFO)
-    sample, output = pdf_setup
+    # set up temp dir
+    tmp_pdfs = tmp_path / "pdfs"
+    tmp_pdfs.mkdir()
+    sample = shutil.copy2(
+        Path("docs/examples/pdf_inputs/test_sample.pdf"), tmp_pdfs / "test_sample.pdf"
+    )
+    # (tmp_pdfs / sample).symlink_to(pdfs_dir / sample)
+
     with monkeypatch.context() as m:
+        caplog.set_level(logging.INFO)
         m.setattr(requests, "post", mocked_requests_post)
         monkeypatch.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
-        result = CliRunner().invoke(rtransparent, [str(sample), str(output)])
-        assert result.exit_code == 0
-        assert output.exists()
-        assert f"Converted: {sample}" in caplog.text
+        result = CliRunner().invoke(extract_metrics, [str(tmp_pdfs), str(tmp_path)])
+        if result.exit_code != 0:
+            logger.error("Command failed with exit code %s", result.exit_code)
+            logger.error("Output:\n%s", result.output)
+            raise result.exception
+        assert list(tmp_path.rglob("*txt"))[0].name == sample.with_suffix(".txt").name
         mocked_requests_post.assert_called_once()
         mocked_socket.connect.assert_called_once()
 
 
-def test_pdf_converter(caplog, pdf_setup):
-    sample, _ = pdf_setup
+@pytest.mark.run_slow
+def test_cli_metrics_oddpub(tmp_path):
+    # set up temp dir
+    tmp_pdfs = tmp_path / "pdfs"
+    tmp_pdfs.mkdir()
+    sample = shutil.copy2(
+        Path("docs/examples/pdf_inputs/test_sample.pdf"), tmp_pdfs / "test_sample.pdf"
+    )
+
+    result = CliRunner().invoke(
+        extract_metrics, [str(tmp_pdfs), str(tmp_path), "--parse-with-oddpub"]
+    )
+    if result.exit_code != 0:
+        logger.error("Command failed with exit code %s", result.exit_code)
+        logger.error("Output:\n%s", result.output)
+        raise result.exception
+    assert list(tmp_path.rglob("*txt"))[0].name == sample.with_suffix(".txt").name
+
+
+def test_pdf_converter(sample_pdf):
+    sample, _ = sample_pdf
 
     response = PDFConverter().convert(sample)
     verify_xml_structure(response)