-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
306 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Builds the project's Docker image and pushes it to Docker Hub whenever a
# tag is pushed. The workflow can also be started manually.
name: "Build and upload Docker image for releases"

on:
  push:
    tags: ["*"]
  workflow_dispatch:

jobs:
  build_and_push_docker_image:
    name: "Build Docker Image"
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        uses: actions/checkout@v4

      - name: "Set up Docker Buildx"
        uses: docker/setup-buildx-action@v3

      - name: "Login to Docker Hub 🐳"
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: "Add Docker metadata"
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            timdanaos/app
          tags: |
            type=ref,event=tag
            type=ref,event=branch
            type=sha

      - name: "Publish Docker image"
        uses: docker/build-push-action@v5
        with:
          context: .
          tags: |
            ${{ steps.meta.outputs.tags }}
          push: true
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
        # Only publish when the run was started from a tag ref. The workflow
        # is triggered by `push` (tags) and `workflow_dispatch`, never by a
        # `release` event, so the condition must not test for
        # `github.event_name == 'release'` or this step would always be
        # skipped.
        if: startsWith(github.ref, 'refs/tags/')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,104 @@ | ||
import click | ||
from pathlib import Path | ||
from typing import Tuple, Union | ||
|
||
from osm.converters import convert_pdf | ||
import fire | ||
|
||
from osm.oddpub import oddpub_metric_extraction, oddpub_pdf_conversion | ||
from osm.rtransparent import rtransparent_metric_extraction | ||
from osm.sciencebeam import sciencebeam_pdf_conversion | ||
|
||
|
||
def setup_dirs(
    input_dir: Union[str, Path], output_dir: Union[str, Path]
) -> Tuple[Path, Path]:
    """Validate the input directory and make sure the output directory exists.

    Args:
        input_dir: Existing directory that will be read from.
        output_dir: Directory that will be written to. It is created,
            together with any missing parents, when it does not exist yet.

    Returns:
        The ``(input_dir, output_dir)`` pair converted to ``Path`` objects.

    Raises:
        ValueError: If ``input_dir`` does not exist or is not a directory.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    if not input_dir.exists():
        raise ValueError(f"The path {input_dir} does not exist.")
    # Every caller treats the input as a folder of files, so reject a regular
    # file here instead of failing later inside a converter or extractor.
    if not input_dir.is_dir():
        raise ValueError(f"The path {input_dir} is not a directory.")

    output_dir.mkdir(parents=True, exist_ok=True)

    return input_dir, output_dir
|
||
|
||
class OSM:
    """Two-stage pipeline: convert PDFs to text, then extract metrics from text.

    Both stages return ``self`` so that calls can be chained; ``extract``
    falls back to the text directory used by the preceding ``convert`` call.
    """

    def __init__(self):
        # Registries mapping the user-facing tool name to its implementation.
        self.pdf_converters = {
            "sciencebeam": sciencebeam_pdf_conversion,
            "oddpub": oddpub_pdf_conversion,
        }
        self.metric_extractors = {
            "rtransparent": rtransparent_metric_extraction,
            "oddpub": oddpub_metric_extraction,
        }
        # Directories used by the most recent convert()/extract() call.
        self._pdf_dir = None
        self._text_dir = None
        self._outdir = None

    def convert(
        self,
        *,
        pdf_dir: Union[str, Path],
        text_dir: Union[str, Path] = "./osm_output/pdf_texts",
        converter: str = "sciencebeam",
    ):
        """
        Convert PDFs to text using the specified converter.

        Args:
            pdf_dir: Directory containing PDF files.
            text_dir: Directory to store extracted text. Defaults to "./osm_output/pdf_texts".
            converter: PDF conversion method to use. Defaults to "sciencebeam".

        Returns:
            This instance, so that a following ``extract`` call can be chained.

        Raises:
            ValueError: If ``converter`` is not a registered converter, or if
                ``setup_dirs`` rejects ``pdf_dir``.
        """
        # Validate the name before touching the filesystem so that a typo
        # does not leave a freshly created text_dir behind.
        if converter not in self.pdf_converters:
            raise ValueError(f"Unknown converter: {converter}")

        self._pdf_dir, self._text_dir = setup_dirs(pdf_dir, text_dir)

        self.pdf_converters[converter](self._pdf_dir, self._text_dir)
        return self

    def extract(
        self,
        *,
        text_dir: Union[str, Path, None] = None,
        outdir: Union[str, Path] = "./osm_output",
        extractor: str = "rtransparent",
    ):
        """
        Extract metrics from text using the specified extractor.

        The result is written to ``metrics.csv`` inside ``outdir``.

        Args:
            text_dir: Directory containing text files. If not provided, uses the last converted text directory.
            outdir: Directory to output results. Defaults to "./osm_output".
            extractor: Metric extraction method to use. Defaults to "rtransparent".

        Returns:
            This instance.

        Raises:
            ValueError: If no text directory is available, if ``extractor``
                is not a registered extractor, or if ``setup_dirs`` rejects
                ``text_dir``.
        """
        if text_dir is None:
            if self._text_dir is None:
                raise ValueError(
                    "No text_dir provided and no previous conversion found."
                )
            text_dir = self._text_dir

        # Validate the name before setup_dirs creates outdir as a side effect.
        if extractor not in self.metric_extractors:
            raise ValueError(f"Unknown extractor: {extractor}")

        self._text_dir, self._outdir = setup_dirs(text_dir, outdir)

        metrics = self.metric_extractors[extractor](self._text_dir)
        metrics.to_csv(self._outdir / "metrics.csv", index=False)
        return self
|
||
|
||
def osm():
    """Run the OSM command-line interface.

    Exposes the ``convert`` and ``extract`` methods of one ``OSM`` instance
    as python-fire sub-commands.
    """
    # A single instance backs both sub-commands so they share the same
    # converter/extractor registries and remembered directories.
    pipeline = OSM()
    fire.Fire(
        {
            "convert": pipeline.convert,
            "extract": pipeline.extract,
        }
    )


if __name__ == "__main__":
    osm()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
Oddpub is being actively developed, whereas rtransparent has stagnated.
Oddpub implements parallelism, and its interface does not easily allow working
with objects in memory, so we will use that to reduce IO overhead.
The alternative would be to load the pdf file into memory (pdftools::pdf_data)
and then pass that into oddpub private functions. This would make it easier to
manage the parallelism, troubleshoot, and define the interface but partially
reinvents the wheel.
""" | ||
|
||
import logging | ||
from pathlib import Path | ||
|
||
import psutil | ||
import rpy2.robjects as ro | ||
from rpy2.robjects import pandas2ri | ||
from rpy2.robjects.packages import importr | ||
|
||
from osm.config import osm_config | ||
|
||
# Configure the root logger at DEBUG level as soon as this module is imported.
logging.basicConfig(level=logging.DEBUG)

# Adjust the logging level for rpy2
rpy2_logger = logging.getLogger("rpy2")
rpy2_logger.setLevel(logging.DEBUG)

# Load the R packages once at import time; the functions below use these
# module-level handles.
oddpub = importr("oddpub")
future = importr("future")
# Set the VROOM_CONNECTION_SIZE environment variable inside the R session to
# the value held in the OSM configuration.
# NOTE(review): presumably this enlarges the buffer used by R's vroom reader —
# confirm against the vroom documentation.
ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
|
||
|
||
def oddpub_pdf_conversion(
    pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count()
):
    """Convert the PDFs in ``pdf_dir`` to text files in ``text_dir`` via oddpub.

    Args:
        pdf_dir: Directory holding the PDF files to convert.
        text_dir: Directory that receives the converted text.
        workers: Number of workers handed to the R ``future`` multisession
            plan. Defaults to the CPU count reported by psutil at import time.
    """
    # R receives plain strings rather than Path objects.
    source_folder = str(pdf_dir)
    target_folder = str(text_dir)

    # Select the multisession plan before calling into oddpub.
    future.plan(future.multisession, workers=workers)
    oddpub.pdf_convert(source_folder, target_folder)
|
||
|
||
def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
    """Run oddpub's open-data search over the text files in ``text_dir``.

    Args:
        text_dir: Directory holding the converted text files.
        workers: Number of workers handed to the R ``future`` multisession
            plan. Defaults to the CPU count reported by psutil at import time.

    Returns:
        The oddpub search result converted from R through the pandas
        converter.
    """
    future.plan(future.multisession, workers=workers)

    # The folder is passed to pdf_load with a trailing slash.
    sentences = oddpub.pdf_load(f"{text_dir}/")
    search_results = oddpub.open_data_search(sentences)

    # Convert the R result while the pandas conversion rules are active.
    r_to_pandas = ro.default_converter + pandas2ri.converter
    with r_to_pandas.context():
        active_conversion = ro.conversion.get_conversion()
        metrics = active_conversion.rpy2py(search_results)

    return metrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
import psutil | ||
import rpy2.robjects as ro | ||
from rpy2.robjects import pandas2ri | ||
from rpy2.robjects.packages import importr | ||
|
||
from osm.config import osm_config | ||
|
||
# Set the VROOM_CONNECTION_SIZE environment variable inside the R session to
# the value held in the OSM configuration. This runs at import time.
# NOTE(review): presumably this enlarges the buffer used by R's vroom reader —
# confirm against the vroom documentation.
ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")')
|
||
|
||
def rtransparent_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()):
    """Run rtransparent's ``rt_data_code`` on every ``*.txt`` file in ``text_dir``.

    Args:
        text_dir: Directory holding the converted text files.
        workers: Number of workers handed to the R ``future`` multisession
            plan. Defaults to the CPU count reported by psutil at import time.

    Returns:
        A single pandas DataFrame made by concatenating the per-file results,
        with a fresh index. Files are processed in sorted name order so the
        row order is reproducible.

    Raises:
        ValueError: If ``text_dir`` contains no ``*.txt`` files.
    """
    rtransparent = importr("rtransparent")
    future = importr("future")
    future.plan(future.multisession, workers=workers)

    text_files = sorted(Path(text_dir).glob("*.txt"))
    if not text_files:
        # pd.concat would raise an opaque "No objects to concatenate" error.
        raise ValueError(f"No .txt files found in {text_dir}.")

    r_to_pandas = ro.default_converter + pandas2ri.converter
    metrics = []
    for file_path in text_files:
        with r_to_pandas.context():
            # R receives the file name as a plain string rather than a Path.
            r_result = rtransparent.rt_data_code(str(file_path))
            metrics.append(ro.conversion.get_conversion().rpy2py(r_result))

    return pd.concat(metrics, ignore_index=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.