name: "Build and upload Docker image for releases"

# Runs for every pushed tag, and on demand from the Actions tab.
on:
  push:
    tags: ["*"]
  workflow_dispatch:

jobs:
  build_and_push_docker_image:
    name: "Build Docker Image"
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@v4

      - name: "Set up Docker Buildx"
        uses: docker/setup-buildx-action@v3

      - name: "Login to Docker Hub 🐳"
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Derives the image tags/labels from the git ref and commit sha.
      - name: "Add Docker metadata"
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            timdanaos/app
          tags: |
            type=ref,event=tag
            type=ref,event=branch
            type=sha

      - name: "Publish Docker image"
        # This workflow is only triggered by `push` and `workflow_dispatch`,
        # so a `github.event_name == 'release'` guard can never be true and
        # the image would never be published. Gate on the ref alone: tag
        # pushes and manual runs started from a tag publish an image.
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v5
        with:
          context: .
          tags: |
            ${{ steps.meta.outputs.tags }}
          push: true
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
@osm.command()
@click.argument("pdf_dir", type=click.Path(exists=True, file_okay=False))
@click.argument("outdir", type=click.Path(exists=True, file_okay=False))
@click.option("--use-sciencebeam", is_flag=True)
def extract_metrics(pdf_dir, outdir, use_sciencebeam):
    """Process biomedical publications and write out texts and metrics.

    The PDFs in PDF_DIR are converted into OUTDIR/pdf_texts (with ScienceBeam
    when --use-sciencebeam is given, otherwise with oddpub). The oddpub metric
    extraction is then run on that directory and its result is saved as
    OUTDIR/metrics.csv.
    """
    pdf_dir = Path(pdf_dir)
    outdir = Path(outdir)

    # Intermediate conversion output lives next to the final metrics file.
    text_dir = outdir / "pdf_texts"
    text_dir.mkdir(exist_ok=True, parents=True)

    if use_sciencebeam:
        convert_pdf(pdf_dir, text_dir)
    else:
        oddpub_pdf_conversion(pdf_dir, text_dir)

    # Errors from conversion/extraction propagate unchanged; wrapping them in
    # a try/except that only re-raises added nothing but traceback noise.
    metrics = oddpub_metric_extraction(text_dir)
    metrics.to_csv(outdir / "metrics.csv", index=False)
def _available_workers() -> int:
    """Return how many CPUs the current process may use (at least 1)."""
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is not available on every platform
        # (e.g. macOS, Windows); fall back to the machine's CPU count.
        return os.cpu_count() or 1


def oddpub_pdf_conversion(
    pdf_dir: Path, text_dir: Path, workers: "int | None" = None
):
    """Convert the PDFs in ``pdf_dir`` to text in ``text_dir`` using oddpub.

    Args:
        pdf_dir: Directory handed to ``oddpub::pdf_convert`` as input.
        text_dir: Directory handed to ``oddpub::pdf_convert`` as output.
        workers: Number of ``future`` multisession workers. ``None`` (the
            default) uses the CPUs available to this process, determined at
            call time rather than once at import time.
    """
    if workers is None:
        workers = _available_workers()
    future.plan(future.multisession, workers=workers)
    oddpub.pdf_convert(str(pdf_dir), str(text_dir))


def oddpub_metric_extraction(text_dir: Path, workers: "int | None" = None):
    """Run oddpub's open-data search over the converted texts in ``text_dir``.

    Args:
        text_dir: Directory of converted texts, loaded with
            ``oddpub::pdf_load``.
        workers: Number of ``future`` multisession workers. ``None`` (the
            default) uses the CPUs available to this process, determined at
            call time.

    Returns:
        The result of ``oddpub::open_data_search`` converted to a Python
        object through the rpy2 pandas converter (presumably a
        ``pandas.DataFrame`` -- the CLI calls ``.to_csv`` on it).
    """
    if workers is None:
        workers = _available_workers()
    future.plan(future.multisession, workers=workers)

    # The directory is passed with a trailing slash, as the original code did.
    pdf_sentences = oddpub.pdf_load(f"{text_dir}/")
    open_data_results = oddpub.open_data_search(pdf_sentences)

    with (ro.default_converter + pandas2ri.converter).context():
        metrics = ro.conversion.get_conversion().rpy2py(open_data_results)

    return metrics
def test_cli_metrics(
    temp_pipeline_dir, monkeypatch, mocked_requests_post, mocked_socket, caplog
):
    """Run ``extract_metrics --use-sciencebeam`` against the mocked service."""
    caplog.set_level(logging.INFO)
    pdfs_dir, outdir = temp_pipeline_dir

    with monkeypatch.context() as m:
        # Patch through the context handle so both patches are undone together.
        m.setattr(requests, "post", mocked_requests_post)
        m.setattr(socket, "socket", lambda *args, **kwargs: mocked_socket)
        result = CliRunner().invoke(
            extract_metrics, [str(pdfs_dir), str(outdir), "--use-sciencebeam"]
        )

    assert result.exit_code == 0
    # Path.glob returns a generator, so it has to be materialised before
    # len(); the ScienceBeam conversion writes "<stem>.xml" files.
    assert len(list(outdir.glob("pdf_texts/*.xml"))) == 1
    # convert_pdf logs "Converted: <pdf path> to XML. ..." per converted file,
    # and every such path starts with the input directory.
    assert f"Converted: {pdfs_dir}" in caplog.text
    mocked_requests_post.assert_called_once()
    mocked_socket.connect.assert_called_once()


def test_pdf_converter(sample_pdf):
    """Convert the sample PDF and check the structure of the returned XML."""
    sample, _ = sample_pdf
    response = PDFConverter().convert(sample)
    verify_xml_structure(response)