From 1182a9c025d55d8b900b83fdfb0d5091f45b7eb9 Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 11 Dec 2024 15:19:52 +0700 Subject: [PATCH 01/21] Add oddpub_metrics table and model definition --- .../832c238c1be7_add_oddpub_metrics_table.py | 52 +++++++++++++++++++ dsst_etl/models.py | 20 +++++++ 2 files changed, 72 insertions(+) create mode 100644 alembic/versions/832c238c1be7_add_oddpub_metrics_table.py diff --git a/alembic/versions/832c238c1be7_add_oddpub_metrics_table.py b/alembic/versions/832c238c1be7_add_oddpub_metrics_table.py new file mode 100644 index 0000000..fbb1f5c --- /dev/null +++ b/alembic/versions/832c238c1be7_add_oddpub_metrics_table.py @@ -0,0 +1,52 @@ +"""add oddpub_metrics table + +Revision ID: 832c238c1be7 +Revises: 52101c205c9d +Create Date: 2024-12-11 15:18:24.714630 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '832c238c1be7' +down_revision: Union[str, None] = '52101c205c9d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('oddpub_metrics', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('article', sa.String(), nullable=False), + sa.Column('is_open_data', sa.Boolean(), nullable=False), + sa.Column('open_data_category', sa.String(), nullable=True), + sa.Column('is_reuse', sa.Boolean(), nullable=False), + sa.Column('is_open_code', sa.Boolean(), nullable=False), + sa.Column('is_open_data_das', sa.Boolean(), nullable=False), + sa.Column('is_open_code_cas', sa.Boolean(), nullable=False), + sa.Column('das', sa.String(), nullable=True), + sa.Column('open_data_statements', sa.String(), nullable=True), + sa.Column('cas', sa.String(), nullable=True), + sa.Column('open_code_statements', sa.String(), nullable=True), + sa.Column('work_id', sa.Integer(), nullable=True), + sa.Column('provenance_id', sa.Integer(), nullable=True), + sa.Column('document_id', sa.Integer(), nullable=True), + sa.ForeignKeyConstraint(['document_id'], ['documents.id'], name=op.f('fk_oddpub_metrics_document_id_documents')), + sa.ForeignKeyConstraint(['provenance_id'], ['provenance.id'], name=op.f('fk_oddpub_metrics_provenance_id_provenance')), + sa.ForeignKeyConstraint(['work_id'], ['works.id'], name=op.f('fk_oddpub_metrics_work_id_works')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_oddpub_metrics')) + ) + op.create_index(op.f('ix_oddpub_metrics_article'), 'oddpub_metrics', ['article'], unique=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_oddpub_metrics_article'), table_name='oddpub_metrics') + op.drop_table('oddpub_metrics') + # ### end Alembic commands ### diff --git a/dsst_etl/models.py b/dsst_etl/models.py index 0527b87..20885d1 100644 --- a/dsst_etl/models.py +++ b/dsst_etl/models.py @@ -250,3 +250,23 @@ class RTransparentPublication(Base): work_id = Column(Integer, ForeignKey("works.id"), nullable=True) provenance_id = Column(Integer, ForeignKey("provenance.id"), nullable=True) + + +class OddpubMetrics(Base): + __tablename__ = "oddpub_metrics" + + id = Column(Integer, primary_key=True) + article = Column(String, unique=True, nullable=False, index=True) + is_open_data = Column(Boolean, nullable=False, default=False) + open_data_category = Column(String) + is_reuse = Column(Boolean, nullable=False, default=False) + is_open_code = Column(Boolean, nullable=False, default=False) + is_open_data_das = Column(Boolean, nullable=False, default=False) + is_open_code_cas = Column(Boolean, nullable=False, default=False) + das = Column(String) + open_data_statements = Column(String) + cas = Column(String) + open_code_statements = Column(String) + work_id = Column(Integer, ForeignKey("works.id"), nullable=True) + provenance_id = Column(Integer, ForeignKey("provenance.id"), nullable=True) + document_id = Column(Integer, ForeignKey("documents.id"), nullable=True) From 580d6c4cf8515f41dd1cd6d04951413bfb493735 Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 11 Dec 2024 16:12:33 +0700 Subject: [PATCH 02/21] create Oddpub API --- services/oddpub/README.md | 57 +++++++++++ services/oddpub/dockerfile | 24 +++++ services/oddpub/main.py | 182 +++++++++++++++++++++++++++++++++ services/oddpub/pyproject.toml | 19 ++++ 4 files changed, 282 insertions(+) create mode 100644 services/oddpub/README.md create mode 100644 services/oddpub/dockerfile create mode 100644 services/oddpub/main.py create mode 100644 services/oddpub/pyproject.toml diff --git a/services/oddpub/README.md 
b/services/oddpub/README.md new file mode 100644 index 0000000..c2f48c5 --- /dev/null +++ b/services/oddpub/README.md @@ -0,0 +1,57 @@ +# Oddpub API + +This is a FastAPI application for processing PDF files using the oddpub functions: `pdf_convert`, `pdf_load`, and `open_data_search`. + +## Project structure +services/ +└── oddpub/ + ├── dockerfile + ├── main.py + ├── pyproject.toml + └── README.md + +## Requirements + +- Python 3.11 +- Docker (optional, for containerized deployment) + +## Setup + +1. **Clone the repository**: + ```bash + git clone https://github.com/nimh-dsst/dsst-etl.git + cd dsst-etl/services/oddpub + ``` + +2. **Install dependencies**: + If you are using Poetry: + ```bash + poetry install + ``` + +3. **Run the application**: + ```bash + uvicorn main:app --reload + ``` + +## Usage + +- Send a `POST` request to `http://localhost:8000/oddpub` with the PDF attached as the multipart form field `file`; the response is the analysis result as JSON. + +## Docker + +To build and run the application using Docker: + +1. **Build the Docker image**: + ```bash + docker build -t oddpub-api . + ``` + +2. **Run the Docker container**: + ```bash + docker run -p 80:80 oddpub-api + ``` + +## License + +This project is licensed under the MIT License. \ No newline at end of file diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile new file mode 100644 index 0000000..9d2b21f --- /dev/null +++ b/services/oddpub/dockerfile @@ -0,0 +1,24 @@ +# Use the official Python image as a parent image +FROM python:3.11-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy the current directory contents into the container at /app +COPY . 
/app + +# Install Poetry +RUN pip install --no-cache-dir poetry + +# Install dependencies using Poetry +RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi + +# Install rpy2 package dependencies +RUN apt-get update && \ + apt-get install -y r-base + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Run app.py when the container launches +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/services/oddpub/main.py b/services/oddpub/main.py new file mode 100644 index 0000000..cd2994c --- /dev/null +++ b/services/oddpub/main.py @@ -0,0 +1,182 @@ +import logging +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from fastapi import FastAPI, File, UploadFile +from fastapi.responses import JSONResponse +import shutil +import os +import rpy2.robjects as robjects +from rpy2.robjects import pandas2ri +from rpy2.robjects.packages import importr +from dataclasses import dataclass, asdict + +logger = logging.getLogger(__name__) + +app = FastAPI() + + +@dataclass +class OddpubMetrics: + article: str + is_open_data: bool + open_data_category: str + is_reuse: bool + is_open_code: bool + is_open_data_das: bool + is_open_code_cas: bool + das: str + open_data_statements: str + cas: str + open_code_statements: str + + def serialize(self) -> dict: + """ + Serialize the OddpubMetrics instance to a dictionary. + Returns: + dict: A dictionary representation of the OddpubMetrics instance. + """ + return asdict(self) + + +class OddpubWrapper: + """ + A wrapper class for calling ODDPub R functions from Python using rpy2. + """ + + def __init__(self): + """ + Initialize the OddpubWrapper. 
+ Args: + db (Session, optional): SQLAlchemy database session + work_id (int): ID of the work being processed + document_id (int): ID of the document being processed + """ + try: + self.base = importr("base") + self.oddpub = importr("oddpub") + pandas2ri.activate() + logger.info("Successfully initialized OddpubWrapper") + except Exception as e: + logger.error(f"Failed to initialize OddpubWrapper: {str(e)}") + raise + + def _convert_pdfs(self, pdf_folder: str, output_folder: str) -> None: + """Convert PDFs to text using oddpub::pdf_convert.""" + try: + r_pdf_folder = robjects.StrVector([str(Path(pdf_folder))]) + r_output_folder = robjects.StrVector([str(Path(output_folder))]) + + self.oddpub.pdf_convert(r_pdf_folder, r_output_folder) + logger.info( + f"Successfully converted PDFs from {pdf_folder} to text in {output_folder}" + ) + except Exception as e: + logger.error(f"Error in PDF conversion: {str(e)}") + raise + + def _load_pdf_text(self, pdf_text_folder: str) -> robjects.vectors.ListVector: + """Load converted PDF text using oddpub::pdf_load.""" + try: + r_text_folder = robjects.StrVector([str(Path(pdf_text_folder))]) + pdf_text_sentences = self.oddpub.pdf_load(r_text_folder) + logger.info(f"Successfully loaded PDF text from {pdf_text_folder}") + return pdf_text_sentences + except Exception as e: + logger.error(f"Error in loading PDF text: {str(e)}") + raise + + def _search_open_data( + self, pdf_text_sentences: robjects.vectors.ListVector + ) -> List[OddpubMetrics]: + """Search for open data statements using oddpub::open_data_search.""" + try: + open_data_results = self.oddpub.open_data_search(pdf_text_sentences) + result = self._convert_r_result(open_data_results) + logger.info("Successfully completed open data search") + return result + except Exception as e: + logger.error(f"Error in open data search: {str(e)}") + raise + + def _cleanup_output_folder(self, output_folder: str) -> None: + """Remove the temporary output folder and its contents.""" + try: + 
shutil.rmtree(output_folder) + logger.info(f"Successfully cleaned up output folder: {output_folder}") + except Exception as e: + logger.error(f"Error cleaning up output folder: {str(e)}") + raise + + def process_pdfs(self, pdf_folder: str) -> Dict: + """ + Process PDFs through the complete ODDPub workflow and store results in database. + Args: + pdf_folder (str): Path to folder containing PDF files + output_folder (str): Path to temporary output folder for converted text files + Returns: + OddpubMetrics: Results of open data analysis + """ + try: + # Create output directory if it doesn't exist + output_folder = "oddpub_output" + Path(output_folder).mkdir(parents=True, exist_ok=True) + + # Execute the workflow + self._convert_pdfs(pdf_folder, output_folder) + pdf_text_sentences = self._load_pdf_text(output_folder) + result = self._search_open_data(pdf_text_sentences) + + return result + except Exception as e: + logger.error(f"Error in PDF processing workflow: {str(e)}") + self.db.rollback() + finally: + # Attempt cleanup even if processing failed + self._cleanup_output_folder(output_folder) + if self.db: + self.db.close() + + def _convert_r_result(self, r_result) -> OddpubMetrics: + """Convert R results to OddpubMetrics instance.""" + try: + df = pd.DataFrame(r_result) + result_dict = df.to_dict("records")[0] if not df.empty else {} + + # Create new OddpubMetrics instance with relationships + oddpub_metrics = OddpubMetrics( + article=result_dict.get("article"), + is_open_data=result_dict.get("is_open_data", False), + open_data_category=result_dict.get("open_data_category"), + is_reuse=result_dict.get("is_reuse", False), + is_open_code=result_dict.get("is_open_code", False), + is_open_data_das=result_dict.get("is_open_data_das", False), + is_open_code_cas=result_dict.get("is_open_code_cas", False), + das=result_dict.get("das"), + open_data_statements=result_dict.get("open_data_statements"), + cas=result_dict.get("cas"), + 
open_code_statements=result_dict.get("open_code_statements"), + ) + return oddpub_metrics + except Exception as e: + logger.error(f"Error converting R result: {str(e)}") + raise + + +@app.post("/oddpub") +async def process_pdf(file: UploadFile = File(...)): + # Save the uploaded file + file_location = f"/tmp/{file.filename}" + with open(file_location, "wb") as buffer: + shutil.copyfileobj(file.file, buffer) + + oddpub_wrapper = OddpubWrapper() + + # Call the oddpub functions + result = oddpub_wrapper.process_pdfs(file_location) + + # Clean up the saved file + os.remove(file_location) + + return JSONResponse(content=result) diff --git a/services/oddpub/pyproject.toml b/services/oddpub/pyproject.toml new file mode 100644 index 0000000..77d8816 --- /dev/null +++ b/services/oddpub/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "oddpub-api" +version = "0.1.0" +description = "A FastAPI application for processing PDF files with oddpub functions." +authors = [] + +[tool.poetry.dependencies] +python = "^3.11" +fastapi = "^0.95.0" +uvicorn = "^0.22.0" +rpy2 = "^3.5.0" + + +[tool.poetry.dev-dependencies] +pytest = "^7.0.0" \ No newline at end of file From ac53e9290f2f9d4220291252892f40f60a03adb5 Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 11 Dec 2024 16:30:07 +0700 Subject: [PATCH 03/21] Enhance Dockerfile and pyproject.toml for R support and author attribution --- services/oddpub/dockerfile | 15 +++++++++++---- services/oddpub/pyproject.toml | 5 ++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index 9d2b21f..e653775 100644 --- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -10,13 +10,20 @@ COPY . 
/app # Install Poetry RUN pip install --no-cache-dir poetry +# Install R and its dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + r-base \ + r-base-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Set R_HOME environment variable +ENV R_HOME=/usr/lib/R + # Install dependencies using Poetry RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi -# Install rpy2 package dependencies -RUN apt-get update && \ - apt-get install -y r-base - # Make port 80 available to the world outside this container EXPOSE 80 diff --git a/services/oddpub/pyproject.toml b/services/oddpub/pyproject.toml index 77d8816..c6c8aa9 100644 --- a/services/oddpub/pyproject.toml +++ b/services/oddpub/pyproject.toml @@ -2,11 +2,11 @@ requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta" -[project] +[tool.poetry] name = "oddpub-api" version = "0.1.0" description = "A FastAPI application for processing PDF files with oddpub functions." 
-authors = [] +authors = ["Quang Nguyen"] [tool.poetry.dependencies] python = "^3.11" @@ -14,6 +14,5 @@ fastapi = "^0.95.0" uvicorn = "^0.22.0" rpy2 = "^3.5.0" - [tool.poetry.dev-dependencies] pytest = "^7.0.0" \ No newline at end of file From 157a217bf80dea7dd943856c6aed2726661d87be Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 11 Dec 2024 16:38:29 +0700 Subject: [PATCH 04/21] Update pyproject.toml to include pandas and numpy dependencies --- services/oddpub/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/oddpub/pyproject.toml b/services/oddpub/pyproject.toml index c6c8aa9..6463e5c 100644 --- a/services/oddpub/pyproject.toml +++ b/services/oddpub/pyproject.toml @@ -13,6 +13,8 @@ python = "^3.11" fastapi = "^0.95.0" uvicorn = "^0.22.0" rpy2 = "^3.5.0" +pandas = "^1.5.0" +numpy = "^1.23.0" [tool.poetry.dev-dependencies] pytest = "^7.0.0" \ No newline at end of file From 05180ed198f4693c5fb07df1d260711be51b42af Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 11 Dec 2024 17:26:13 +0700 Subject: [PATCH 05/21] Refactor Dockerfile and update pyproject.toml to enhance R package installation --- services/oddpub/dockerfile | 18 ++++++++---------- services/oddpub/pyproject.toml | 1 + 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index e653775..b5e599f 100644 --- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -1,31 +1,29 @@ -# Use the official Python image as a parent image FROM python:3.11-slim -# Set the working directory in the container WORKDIR /app -# Copy the current directory contents into the container at /app COPY . 
/app -# Install Poetry RUN pip install --no-cache-dir poetry -# Install R and its dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ r-base \ - r-base-dev && \ + r-base-dev \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Set R_HOME environment variable ENV R_HOME=/usr/lib/R -# Install dependencies using Poetry +RUN R -e "install.packages('devtools')" + +RUN R -e "devtools::install_github('quest-bih/oddpub')" + RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi -# Make port 80 available to the world outside this container EXPOSE 80 -# Run app.py when the container launches CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/services/oddpub/pyproject.toml b/services/oddpub/pyproject.toml index 6463e5c..70b397d 100644 --- a/services/oddpub/pyproject.toml +++ b/services/oddpub/pyproject.toml @@ -15,6 +15,7 @@ uvicorn = "^0.22.0" rpy2 = "^3.5.0" pandas = "^1.5.0" numpy = "^1.23.0" +python-multipart = "^0.0.19" [tool.poetry.dev-dependencies] pytest = "^7.0.0" \ No newline at end of file From f20241a82bab171c13dc4adb67423e76ca9daec8 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 12 Dec 2024 17:02:59 +0700 Subject: [PATCH 06/21] Add FastAPI application and Docker setup for ODDPub processing --- services/oddpub/_entrypoint.sh | 4 +++ services/oddpub/{main.py => app.py} | 16 +++++----- services/oddpub/dockerfile | 39 ++++++++++++------------ services/oddpub/environment.yaml | 47 +++++++++++++++++++++++++++++ services/oddpub/pyproject.toml | 21 ------------- 5 files changed, 79 insertions(+), 48 deletions(-) create mode 100644 services/oddpub/_entrypoint.sh rename services/oddpub/{main.py => app.py} (95%) create mode 100644 services/oddpub/environment.yaml delete mode 100644 services/oddpub/pyproject.toml diff --git a/services/oddpub/_entrypoint.sh b/services/oddpub/_entrypoint.sh new file mode 100644 index 
0000000..95fe53d --- /dev/null +++ b/services/oddpub/_entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source /opt/conda/etc/profile.d/conda.sh +conda activate osm +exec "$@" \ No newline at end of file diff --git a/services/oddpub/main.py b/services/oddpub/app.py similarity index 95% rename from services/oddpub/main.py rename to services/oddpub/app.py index cd2994c..3ea07cb 100644 --- a/services/oddpub/main.py +++ b/services/oddpub/app.py @@ -74,7 +74,7 @@ def _convert_pdfs(self, pdf_folder: str, output_folder: str) -> None: ) except Exception as e: logger.error(f"Error in PDF conversion: {str(e)}") - raise + raise e def _load_pdf_text(self, pdf_text_folder: str) -> robjects.vectors.ListVector: """Load converted PDF text using oddpub::pdf_load.""" @@ -131,12 +131,9 @@ def process_pdfs(self, pdf_folder: str) -> Dict: return result except Exception as e: logger.error(f"Error in PDF processing workflow: {str(e)}") - self.db.rollback() finally: # Attempt cleanup even if processing failed self._cleanup_output_folder(output_folder) - if self.db: - self.db.close() def _convert_r_result(self, r_result) -> OddpubMetrics: """Convert R results to OddpubMetrics instance.""" @@ -165,15 +162,20 @@ def _convert_r_result(self, r_result) -> OddpubMetrics: @app.post("/oddpub") -async def process_pdf(file: UploadFile = File(...)): +def process_pdf(file: UploadFile = File(...)): # Save the uploaded file - file_location = f"/tmp/{file.filename}" + pdf_folder = "/tmp/pdfs" + Path(pdf_folder).mkdir(parents=True, exist_ok=True) + + file_location = f"{pdf_folder}/{file.filename}" + logger.info(f"Saving file to {file_location}") + with open(file_location, "wb") as buffer: shutil.copyfileobj(file.file, buffer) oddpub_wrapper = OddpubWrapper() - # Call the oddpub functions + logger.info(f"Processing file: {file_location}") result = oddpub_wrapper.process_pdfs(file_location) # Clean up the saved file diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index b5e599f..bd0da06 100644 
--- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -1,29 +1,28 @@ -FROM python:3.11-slim - +FROM condaforge/mambaforge:24.3.0-0 +SHELL ["/bin/bash", "--login", "-c"] +# Set working directory WORKDIR /app -COPY . /app - -RUN pip install --no-cache-dir poetry +COPY environment.yaml /app -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - r-base \ - r-base-dev \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* +# Create the environment +RUN conda env create -f environment.yaml -ENV R_HOME=/usr/lib/R +# Ensure the conda environment is activated +RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate osm" | tee -a ~/.bashrc /etc/profile /etc/profile.d/conda.sh /etc/skel/.bashrc /etc/skel/.profile > /dev/null -RUN R -e "install.packages('devtools')" +RUN R -e '\ +devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863"); \ +devtools::install_github("serghiou/rtransparent", build_vignettes = F)' -RUN R -e "devtools::install_github('quest-bih/oddpub')" +# Copy the application code into the image +COPY app.py /app -RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi +# Make entrypoint etc. 
convenient for users +COPY _entrypoint.sh /usr/local/bin/_entrypoint.sh -EXPOSE 80 +# Make _entrypoint.sh executable +RUN chmod +x /usr/local/bin/_entrypoint.sh -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"] +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"] \ No newline at end of file diff --git a/services/oddpub/environment.yaml b/services/oddpub/environment.yaml new file mode 100644 index 0000000..87471fd --- /dev/null +++ b/services/oddpub/environment.yaml @@ -0,0 +1,47 @@ +name: osm +channels: + - conda-forge + - nodefaults +dependencies: + - fastapi + - lxml + - pandas + - pip + - psutil + - python + - requests + - rpy2 + - uvicorn + # Dependencies for rtransparent + - r-crul + - r-devtools + - r-dplyr + - r-furrr + - r-future + - r-globals + - r-hoardr + - r-httpcode + - r-lazyeval + - r-lubridate + - r-magrittr + - r-pbapply + #- r-pdftools + - r-plyr + - r-purrr + #- r-qpdf + - r-readr + # - r-rentrez + - r-rlang + - r-stringr + - r-tibble + - r-tidyr + - r-tidyselect + - r-timechange + - r-tokenizers + - r-triebeard + - r-urltools + - r-utf8 + # - r-XML + - r-xml2 + - pip: + - metapub \ No newline at end of file diff --git a/services/oddpub/pyproject.toml b/services/oddpub/pyproject.toml deleted file mode 100644 index 70b397d..0000000 --- a/services/oddpub/pyproject.toml +++ /dev/null @@ -1,21 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta" - -[tool.poetry] -name = "oddpub-api" -version = "0.1.0" -description = "A FastAPI application for processing PDF files with oddpub functions." 
-authors = ["Quang Nguyen"] - -[tool.poetry.dependencies] -python = "^3.11" -fastapi = "^0.95.0" -uvicorn = "^0.22.0" -rpy2 = "^3.5.0" -pandas = "^1.5.0" -numpy = "^1.23.0" -python-multipart = "^0.0.19" - -[tool.poetry.dev-dependencies] -pytest = "^7.0.0" \ No newline at end of file From d7e2115955c40a3547abbc963a0cc5036eb2c86a Mon Sep 17 00:00:00 2001 From: Quang Date: Fri, 13 Dec 2024 16:28:30 +0700 Subject: [PATCH 07/21] Enhance logging and update PDF processing in Oddpub application --- services/oddpub/app.py | 26 ++++++++++++++++---------- services/oddpub/dockerfile | 4 ++++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/services/oddpub/app.py b/services/oddpub/app.py index 3ea07cb..1f94625 100644 --- a/services/oddpub/app.py +++ b/services/oddpub/app.py @@ -12,6 +12,13 @@ from rpy2.robjects.packages import importr from dataclasses import dataclass, asdict +# Configure logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()] +) + logger = logging.getLogger(__name__) app = FastAPI() @@ -68,7 +75,8 @@ def _convert_pdfs(self, pdf_folder: str, output_folder: str) -> None: r_pdf_folder = robjects.StrVector([str(Path(pdf_folder))]) r_output_folder = robjects.StrVector([str(Path(output_folder))]) - self.oddpub.pdf_convert(r_pdf_folder, r_output_folder) + logger.info(f"Converting PDFs from {r_pdf_folder} to text in {r_output_folder}") + self.oddpub.pdf_convert(pdf_folder, output_folder) logger.info( f"Successfully converted PDFs from {pdf_folder} to text in {output_folder}" ) @@ -79,8 +87,7 @@ def _convert_pdfs(self, pdf_folder: str, output_folder: str) -> None: def _load_pdf_text(self, pdf_text_folder: str) -> robjects.vectors.ListVector: """Load converted PDF text using oddpub::pdf_load.""" try: - r_text_folder = robjects.StrVector([str(Path(pdf_text_folder))]) - pdf_text_sentences = self.oddpub.pdf_load(r_text_folder) + pdf_text_sentences = 
self.oddpub.pdf_load(pdf_text_folder) logger.info(f"Successfully loaded PDF text from {pdf_text_folder}") return pdf_text_sentences except Exception as e: @@ -120,10 +127,12 @@ def process_pdfs(self, pdf_folder: str) -> Dict: """ try: # Create output directory if it doesn't exist - output_folder = "oddpub_output" + output_folder = "oddpub_output/" Path(output_folder).mkdir(parents=True, exist_ok=True) # Execute the workflow + logger.info(f"Converting PDFs from {pdf_folder} to text in {output_folder}") + self._convert_pdfs(pdf_folder, output_folder) pdf_text_sentences = self._load_pdf_text(output_folder) result = self._search_open_data(pdf_text_sentences) @@ -163,8 +172,7 @@ def _convert_r_result(self, r_result) -> OddpubMetrics: @app.post("/oddpub") def process_pdf(file: UploadFile = File(...)): - # Save the uploaded file - pdf_folder = "/tmp/pdfs" + pdf_folder = "/tmp/pdfs/" Path(pdf_folder).mkdir(parents=True, exist_ok=True) file_location = f"{pdf_folder}/{file.filename}" @@ -175,10 +183,8 @@ def process_pdf(file: UploadFile = File(...)): oddpub_wrapper = OddpubWrapper() - logger.info(f"Processing file: {file_location}") - result = oddpub_wrapper.process_pdfs(file_location) + result = oddpub_wrapper.process_pdfs(pdf_folder) - # Clean up the saved file os.remove(file_location) - return JSONResponse(content=result) + return JSONResponse(content=result.serialize()) diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index bd0da06..fb4957b 100644 --- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -24,5 +24,9 @@ COPY _entrypoint.sh /usr/local/bin/_entrypoint.sh # Make _entrypoint.sh executable RUN chmod +x /usr/local/bin/_entrypoint.sh +# Install pdftotext +RUN apt-get update && apt-get install -y poppler-utils + ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] + CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"] \ No newline at end of file From 1bb19ece9ad91812967f6fe35d969df9a42993a3 Mon Sep 17 00:00:00 2001 From: Quang 
Date: Fri, 13 Dec 2024 16:43:33 +0700 Subject: [PATCH 08/21] Add OddpubWrapper class for PDF processing and API integration --- dsst_etl/oddpub_wrapper.py | 72 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 dsst_etl/oddpub_wrapper.py diff --git a/dsst_etl/oddpub_wrapper.py b/dsst_etl/oddpub_wrapper.py new file mode 100644 index 0000000..e0b8f30 --- /dev/null +++ b/dsst_etl/oddpub_wrapper.py @@ -0,0 +1,72 @@ +import logging +from pathlib import Path + +import requests +from sqlalchemy.orm import Session + +from .db import get_db +from .models import OddpubMetrics + +logger = logging.getLogger(__name__) + + +class OddpubWrapper: + """ + A client for the ODDPub HTTP service: uploads PDFs to the API and stores the returned metrics in the database. + """ + + def __init__( + self, + db: Session = None, + work_id: int = None, + document_id: int = None, + oddpub_host_api: str = None, + ): + """ + Initialize the OddpubWrapper. + + Args: + db (Session, optional): SQLAlchemy database session + work_id (int): ID of the work being processed + document_id (int): ID of the document being processed + """ + try: + self.oddpub_host_api = oddpub_host_api + self.db = db if db is not None else next(get_db()) + self.work_id = work_id + self.document_id = document_id + logger.info("Successfully initialized OddpubWrapper") + except Exception as e: + logger.error(f"Failed to initialize OddpubWrapper: {str(e)}") + raise + + def process_pdfs(self, pdf_folder: str) -> OddpubMetrics: + """ + Process PDFs through the complete ODDPub workflow and store results in database. 
+ + Args: + pdf_folder (str): Path to folder containing PDF files + + Returns: + OddpubMetrics: Results of open data analysis + """ + try: + # Iterate over each PDF file in the folder + for pdf_file in Path(pdf_folder).glob("*.pdf"): + with open(pdf_file, "rb") as f: + # Use requests to call the API + response = requests.post( + f"{self.oddpub_host_api}/oddpub", files={"file": f} + ) + response.raise_for_status() + + r_result = response.json() + oddpub_metrics = OddpubMetrics(**r_result) + oddpub_metrics.work_id = self.work_id + oddpub_metrics.document_id = self.document_id + self.db.add(oddpub_metrics) + self.db.commit() + + except Exception as e: + logger.error(f"Error in PDF processing workflow: {str(e)}") + self.db.rollback() From ae524a7244beb1f668ed7142f65824a0c6d204c4 Mon Sep 17 00:00:00 2001 From: Quang Date: Fri, 13 Dec 2024 17:04:02 +0700 Subject: [PATCH 09/21] Update Oddpub metrics to allow nullable fields and enhance OddpubWrapper for PDF processing --- .mockenv | 4 +- ...9d1785e_add_field_in_oddpub_is_nullable.py | 64 +++++++++++++++++++ dsst_etl/models.py | 12 ++-- dsst_etl/oddpub_wrapper.py | 17 ++--- scripts/run_oddpub.py | 16 +++++ 5 files changed, 98 insertions(+), 15 deletions(-) create mode 100644 alembic/versions/600039d1785e_add_field_in_oddpub_is_nullable.py create mode 100644 scripts/run_oddpub.py diff --git a/.mockenv b/.mockenv index a85521b..97d8325 100644 --- a/.mockenv +++ b/.mockenv @@ -15,4 +15,6 @@ NCBI_API_KEY= S3_BUCKET_NAME=osm-pdf-uploads HOSTNAME=localhost -USERNAME=quang \ No newline at end of file +USERNAME=quang + +ODDPUB_HOST_API=http://localhost:80 \ No newline at end of file diff --git a/alembic/versions/600039d1785e_add_field_in_oddpub_is_nullable.py b/alembic/versions/600039d1785e_add_field_in_oddpub_is_nullable.py new file mode 100644 index 0000000..7a153f1 --- /dev/null +++ b/alembic/versions/600039d1785e_add_field_in_oddpub_is_nullable.py @@ -0,0 +1,64 @@ +"""add field in OddPub is nullable + +Revision ID: 
600039d1785e +Revises: 832c238c1be7 +Create Date: 2024-12-13 17:00:30.689184 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '600039d1785e' +down_revision: Union[str, None] = '832c238c1be7' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('oddpub_metrics', 'article', + existing_type=sa.VARCHAR(), + nullable=True) + op.alter_column('oddpub_metrics', 'is_open_data', + existing_type=sa.BOOLEAN(), + nullable=True) + op.alter_column('oddpub_metrics', 'is_reuse', + existing_type=sa.BOOLEAN(), + nullable=True) + op.alter_column('oddpub_metrics', 'is_open_code', + existing_type=sa.BOOLEAN(), + nullable=True) + op.alter_column('oddpub_metrics', 'is_open_data_das', + existing_type=sa.BOOLEAN(), + nullable=True) + op.alter_column('oddpub_metrics', 'is_open_code_cas', + existing_type=sa.BOOLEAN(), + nullable=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column('oddpub_metrics', 'is_open_code_cas', + existing_type=sa.BOOLEAN(), + nullable=False) + op.alter_column('oddpub_metrics', 'is_open_data_das', + existing_type=sa.BOOLEAN(), + nullable=False) + op.alter_column('oddpub_metrics', 'is_open_code', + existing_type=sa.BOOLEAN(), + nullable=False) + op.alter_column('oddpub_metrics', 'is_reuse', + existing_type=sa.BOOLEAN(), + nullable=False) + op.alter_column('oddpub_metrics', 'is_open_data', + existing_type=sa.BOOLEAN(), + nullable=False) + op.alter_column('oddpub_metrics', 'article', + existing_type=sa.VARCHAR(), + nullable=False) + # ### end Alembic commands ### diff --git a/dsst_etl/models.py b/dsst_etl/models.py index 20885d1..5fe38d6 100644 --- a/dsst_etl/models.py +++ b/dsst_etl/models.py @@ -256,13 +256,13 @@ class OddpubMetrics(Base): __tablename__ = "oddpub_metrics" id = Column(Integer, primary_key=True) - article = Column(String, unique=True, nullable=False, index=True) - is_open_data = Column(Boolean, nullable=False, default=False) + article = Column(String, unique=True, nullable=True, index=True) + is_open_data = Column(Boolean, nullable=True, default=False) open_data_category = Column(String) - is_reuse = Column(Boolean, nullable=False, default=False) - is_open_code = Column(Boolean, nullable=False, default=False) - is_open_data_das = Column(Boolean, nullable=False, default=False) - is_open_code_cas = Column(Boolean, nullable=False, default=False) + is_reuse = Column(Boolean, nullable=True, default=False) + is_open_code = Column(Boolean, nullable=True, default=False) + is_open_data_das = Column(Boolean, nullable=True, default=False) + is_open_code_cas = Column(Boolean, nullable=True, default=False) das = Column(String) open_data_statements = Column(String) cas = Column(String) diff --git a/dsst_etl/oddpub_wrapper.py b/dsst_etl/oddpub_wrapper.py index e0b8f30..1d874c5 100644 --- a/dsst_etl/oddpub_wrapper.py +++ b/dsst_etl/oddpub_wrapper.py @@ -4,8 +4,9 @@ import requests from 
sqlalchemy.orm import Session -from .db import get_db -from .models import OddpubMetrics +from dsst_etl.models import OddpubMetrics + +from .config import config logger = logging.getLogger(__name__) @@ -17,10 +18,10 @@ class OddpubWrapper: def __init__( self, - db: Session = None, + db_session: Session = None, work_id: int = None, document_id: int = None, - oddpub_host_api: str = None, + oddpub_host_api: str = config.ODDPUB_HOST_API, ): """ Initialize the OddpubWrapper. @@ -32,7 +33,7 @@ def __init__( """ try: self.oddpub_host_api = oddpub_host_api - self.db = db if db is not None else next(get_db()) + self.db_session = db_session self.work_id = work_id self.document_id = document_id logger.info("Successfully initialized OddpubWrapper") @@ -64,9 +65,9 @@ def process_pdfs(self, pdf_folder: str) -> OddpubMetrics: oddpub_metrics = OddpubMetrics(**r_result) oddpub_metrics.work_id = self.work_id oddpub_metrics.document_id = self.document_id - self.db.add(oddpub_metrics) - self.db.commit() + self.db_session.add(oddpub_metrics) + self.db_session.commit() except Exception as e: logger.error(f"Error in PDF processing workflow: {str(e)}") - self.db.rollback() + self.db_session.rollback() diff --git a/scripts/run_oddpub.py b/scripts/run_oddpub.py new file mode 100644 index 0000000..2e5f49a --- /dev/null +++ b/scripts/run_oddpub.py @@ -0,0 +1,16 @@ +import argparse +from dsst_etl import get_db_engine +from dsst_etl.db import get_db_session +from dsst_etl.oddpub_wrapper import OddpubWrapper + +def main(): + parser = argparse.ArgumentParser(description="Process PDFs with OddpubWrapper") + parser.add_argument('pdf_folder', type=str, help='Path to the folder containing PDF files') + args = parser.parse_args() + + oddpubWrapper = OddpubWrapper(get_db_session(get_db_engine())) + oddpubWrapper.process_pdfs(args.pdf_folder) + +if __name__ == "__main__": + main() + From 789cc2d08bc326b9b87e5c1fdd0fbbed5710504c Mon Sep 17 00:00:00 2001 From: Quang Date: Fri, 13 Dec 2024 17:14:34 +0700 
Subject: [PATCH 10/21] Refactor R result conversion in OddpubWrapper to use pandas2ri for improved data handling --- services/oddpub/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/oddpub/app.py b/services/oddpub/app.py index 1f94625..b0090bd 100644 --- a/services/oddpub/app.py +++ b/services/oddpub/app.py @@ -147,7 +147,7 @@ def process_pdfs(self, pdf_folder: str) -> Dict: def _convert_r_result(self, r_result) -> OddpubMetrics: """Convert R results to OddpubMetrics instance.""" try: - df = pd.DataFrame(r_result) + df = pandas2ri.rpy2py(r_result) result_dict = df.to_dict("records")[0] if not df.empty else {} # Create new OddpubMetrics instance with relationships From 9943c5f13b7db298f4506565119af401dc18e076 Mon Sep 17 00:00:00 2001 From: Quang Date: Fri, 13 Dec 2024 17:31:57 +0700 Subject: [PATCH 11/21] Add unit tests for OddpubWrapper and refactor test setup in RTransparentDataUploader --- tests/test_oddpub.py | 73 ++++++++++++++++++++++++++ tests/test_upload_rtransparent_data.py | 5 +- 2 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 tests/test_oddpub.py diff --git a/tests/test_oddpub.py b/tests/test_oddpub.py new file mode 100644 index 0000000..630585a --- /dev/null +++ b/tests/test_oddpub.py @@ -0,0 +1,73 @@ +import logging +import unittest +from unittest.mock import patch, MagicMock +from sqlalchemy.orm import Session +from dsst_etl.oddpub_wrapper import OddpubWrapper +from dsst_etl.models import OddpubMetrics +from dsst_etl import get_db_engine +from dsst_etl.db import get_db_session, init_db +from sqlalchemy import inspect +logger = logging.getLogger(__name__) + +class TestOddpubWrapper(unittest.TestCase): + + def setUp(self): + # Set up the test database engine + self.engine = get_db_engine(is_test=True) + + init_db(self.engine) + # Create a new session for each test + self.session = get_db_session(self.engine) + + self.wrapper = OddpubWrapper( + db_session=self.session, + oddpub_host_api="http://mock-api" + ) 
+ + def tearDown(self): + # # Rollback the transaction + # self.session.rollback() + + # Check if the oddpub_metrics table exists before attempting to delete its rows + inspector = inspect(self.engine) + tables = inspector.get_table_names() + logger.info(f"Tables in the tearDown: {tables}") + if "oddpub_metrics" in tables: + self.session.query(OddpubMetrics).delete() + self.session.commit() + + @patch("dsst_etl.oddpub_wrapper.requests.post") + def test_process_pdfs_success(self, mock_post): + # Mock the response from the API + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_response.json.return_value = { + 'article': 'test1.txt', + 'is_open_data': False, + 'open_data_category': '', + 'is_reuse': False, + 'is_open_code': False, + 'is_open_data_das': False, + 'is_open_code_cas': False, + 'das': None, + 'open_data_statements': '', + 'cas': None, + 'open_code_statements': '' + } + + mock_post.return_value = mock_response + + # Mock the PDF files + pdf_folder = "tests/pdf-test" + pdf_paths = [ + pdf_folder + "/test1.pdf", + ] + + # Call the method + self.wrapper.process_pdfs(pdf_folder) + + # Assertions + self.assertEqual(self.session.query(OddpubMetrics).count(), len(pdf_paths)) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_upload_rtransparent_data.py b/tests/test_upload_rtransparent_data.py index 14e839f..7f824f7 100644 --- a/tests/test_upload_rtransparent_data.py +++ b/tests/test_upload_rtransparent_data.py @@ -30,13 +30,10 @@ def mock_data(self): def setUp(self): self.engine = get_db_engine(is_test=True) - init_db(self.engine ) + init_db(self.engine) # Create a new session for each test self.session = get_db_session(self.engine) - # Start a transaction that we can roll back after each test - # self.transaction = self.session.begin() - self.uploader = RTransparentDataUploader(self.session) def tearDown(self): From 7ce9b160041ce1881f85d8e1a79c8fa6136ceb1f Mon Sep 17 00:00:00 2001 From: Quang Date: Fri, 13 Dec 
2024 17:33:20 +0700 Subject: [PATCH 12/21] Refactor OddpubWrapper class documentation to clarify its purpose as a wrapper for the ODDPub API. --- dsst_etl/oddpub_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsst_etl/oddpub_wrapper.py b/dsst_etl/oddpub_wrapper.py index 1d874c5..a810a8b 100644 --- a/dsst_etl/oddpub_wrapper.py +++ b/dsst_etl/oddpub_wrapper.py @@ -13,7 +13,7 @@ class OddpubWrapper: """ - A wrapper class for calling ODDPub R functions from Python using rpy2. + Wrapper class for the ODDPub API. """ def __init__( From f20dbdbbb4f1f72f9469fff01106826f012526de Mon Sep 17 00:00:00 2001 From: Quang Date: Wed, 18 Dec 2024 16:32:43 +0700 Subject: [PATCH 13/21] Implement a new test for OddpubWrapper to validate PDF processing without mock API, ensuring correct data retrieval from the database. --- .docker/postgres-compose.yaml | 10 +++++++++- tests/test_oddpub.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.docker/postgres-compose.yaml b/.docker/postgres-compose.yaml index a06ee6a..0b13ed6 100644 --- a/.docker/postgres-compose.yaml +++ b/.docker/postgres-compose.yaml @@ -1,5 +1,5 @@ services: - postgres: + postgres-dsst: image: postgres:15 container_name: postgres_db env_file: "../.env" @@ -13,5 +13,13 @@ services: timeout: 5s # Added missing timeout value retries: 5 + oddpub-dsst: + build: + context: ../services/oddpub + dockerfile: Dockerfile + container_name: oddpub_service + ports: + - "8071:8071" + volumes: postgres_data: diff --git a/tests/test_oddpub.py b/tests/test_oddpub.py index 630585a..d39c530 100644 --- a/tests/test_oddpub.py +++ b/tests/test_oddpub.py @@ -36,6 +36,17 @@ def tearDown(self): self.session.query(OddpubMetrics).delete() self.session.commit() + def test_oddpub_wrapper_without_mock_api(self): + self.wrapper.oddpub_host_api = "http://localhost:8071" + self.wrapper.process_pdfs("tests/pdf-test") + data = self.session.query(OddpubMetrics).all() + self.assertEqual(len(data), 
2) + articles = [row.article for row in data] + self.assertIn("test1.txt", articles) + self.assertIn("test2.txt", articles) + + + @patch("dsst_etl.oddpub_wrapper.requests.post") def test_process_pdfs_success(self, mock_post): # Mock the response from the API From 92dd494404d4fc33ae4220726ffb977111a106e4 Mon Sep 17 00:00:00 2001 From: leej3 Date: Wed, 18 Dec 2024 16:57:31 +0000 Subject: [PATCH 14/21] remove superfluous oddpub deps --- services/oddpub/environment.yaml | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/services/oddpub/environment.yaml b/services/oddpub/environment.yaml index 87471fd..c95512e 100644 --- a/services/oddpub/environment.yaml +++ b/services/oddpub/environment.yaml @@ -12,36 +12,17 @@ dependencies: - requests - rpy2 - uvicorn - # Dependencies for rtransparent - - r-crul - r-devtools - r-dplyr - r-furrr - - r-future - - r-globals - - r-hoardr - - r-httpcode - - r-lazyeval - r-lubridate - - r-magrittr - - r-pbapply - #- r-pdftools - - r-plyr + - r-pdftools - r-purrr - #- r-qpdf - r-readr - # - r-rentrez - r-rlang - r-stringr - r-tibble - r-tidyr - - r-tidyselect - - r-timechange - r-tokenizers - - r-triebeard - - r-urltools - - r-utf8 - # - r-XML - - r-xml2 - - pip: - - metapub \ No newline at end of file + - r-vctrs + - r-yaml From e694cbed502e769666c5c42f16e9ff308087f9da Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 14:45:06 +0700 Subject: [PATCH 15/21] Revert name of service in docker compose --- .docker/postgres-compose.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.docker/postgres-compose.yaml b/.docker/postgres-compose.yaml index 0b13ed6..50d2a5e 100644 --- a/.docker/postgres-compose.yaml +++ b/.docker/postgres-compose.yaml @@ -1,5 +1,5 @@ services: - postgres-dsst: + postgres: image: postgres:15 container_name: postgres_db env_file: "../.env" @@ -13,7 +13,7 @@ services: timeout: 5s # Added missing timeout value retries: 5 - oddpub-dsst: + oddpub: build: 
context: ../services/oddpub dockerfile: Dockerfile From eca7643157b74c7361b0e5d562da553704cc1665 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 14:56:57 +0700 Subject: [PATCH 16/21] Update README.md to enhance Docker setup instructions and API usage examples for Oddpub service --- services/oddpub/README.md | 53 +++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/services/oddpub/README.md b/services/oddpub/README.md index c2f48c5..60d336a 100644 --- a/services/oddpub/README.md +++ b/services/oddpub/README.md @@ -13,7 +13,7 @@ services/ ## Requirements - Python 3.11 -- Docker (optional, for containerized deployment) +- Docker for containerized deployment ## Setup @@ -23,35 +23,44 @@ services/ cd services/oddpub ``` -2. **Install dependencies**: - If you are using Poetry: +2. **Build the Docker image**: ```bash - poetry install + docker build -t oddpub-api . ``` -3. **Run the application**: +3. **Start the Docker container**: ```bash - uvicorn main:app --reload + docker run -p 80:8071 -v $PWD:/app oddpub-api ``` ## Usage -- Access the API at `http://localhost:8000/oddpub` to upload a PDF file and receive JSON output. - -## Docker - -To build and run the application using Docker: - -1. **Build the Docker image**: - ```bash - docker build -t oddpub-api . - ``` - -2. **Run the Docker container**: - ```bash - docker run -p 80:80 oddpub-api - ``` +Access the API at `http://localhost:80/oddpub` to upload a PDF file and receive JSON output. +Example curl command: + +```bash +curl -X POST -F "file=@/path/to/your/file.pdf" http://localhost:80/oddpub +``` + +Response: + +```json +{ + "article": "test1.txt", + "is_open_data": false, + "open_data_category": "", + "is_reuse": false, + "is_open_code": false, + "is_open_data_das": false, + "is_open_code_cas": false, + "das": null, + "open_data_statements": "", + "cas": null, + "open_code_statements": "" +} +``` ## License -This project is licensed under the MIT License. 
\ No newline at end of file +This project is licensed under the MIT License. + From 3ccece31f3a419c400f71bbb9415148f8faf64e6 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 15:05:31 +0700 Subject: [PATCH 17/21] Refactor Dockerfile: removing unnecessary dependencies for oddpub and ensuring a cleaner setup process. --- services/oddpub/dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index fb4957b..6c518cf 100644 --- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -11,9 +11,7 @@ RUN conda env create -f environment.yaml # Ensure the conda environment is activated RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate osm" | tee -a ~/.bashrc /etc/profile /etc/profile.d/conda.sh /etc/skel/.bashrc /etc/skel/.profile > /dev/null -RUN R -e '\ -devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863"); \ -devtools::install_github("serghiou/rtransparent", build_vignettes = F)' +RUN R -e 'devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863")' # # Copy the project files and install the package COPY app.py /app From dca4d6da67cfec545d40565fe6a8e339167cabb7 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 15:11:08 +0700 Subject: [PATCH 18/21] Update README.md to correct Docker run command and API access URL for Oddpub service --- services/oddpub/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/oddpub/README.md b/services/oddpub/README.md index 60d336a..652e973 100644 --- a/services/oddpub/README.md +++ b/services/oddpub/README.md @@ -30,12 +30,12 @@ services/ 3. **Start the Docker container**: ```bash - docker run -p 80:8071 -v $PWD:/app oddpub-api + docker run -p 8071:8071 -v $PWD:/app oddpub-api ``` ## Usage -Access the API at `http://localhost:80/oddpub` to upload a PDF file and receive JSON output. 
+Access the API at `http://localhost:8071/oddpub` to upload a PDF file and receive JSON output. Example curl command: ```bash From 4af8cd18a12660fab6a91fdcb3bd7dc98aae0935 Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 15:11:19 +0700 Subject: [PATCH 19/21] Update Dockerfile to install pdftotext utility for PDF processing --- services/oddpub/dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/oddpub/dockerfile b/services/oddpub/dockerfile index 6c518cf..f134c53 100644 --- a/services/oddpub/dockerfile +++ b/services/oddpub/dockerfile @@ -3,6 +3,9 @@ SHELL ["/bin/bash", "--login", "-c"] # Set working directory WORKDIR /app +# Install pdftotext +RUN apt-get update && apt-get install -y poppler-utils + COPY environment.yaml /app # Create the environment @@ -22,9 +25,6 @@ COPY _entrypoint.sh /usr/local/bin/_entrypoint.sh # Make _entrypoint.sh executable RUN chmod +x /usr/local/bin/_entrypoint.sh -# Install pdftotext -RUN apt-get update && apt-get install -y poppler-utils - ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"] \ No newline at end of file From 11f9ef04ffcf7b66539c87381d922ad45435931f Mon Sep 17 00:00:00 2001 From: Quang Date: Thu, 19 Dec 2024 15:19:50 +0700 Subject: [PATCH 20/21] update mock env files --- .mockenv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mockenv b/.mockenv index 97d8325..a1b43bf 100644 --- a/.mockenv +++ b/.mockenv @@ -17,4 +17,4 @@ S3_BUCKET_NAME=osm-pdf-uploads HOSTNAME=localhost USERNAME=quang -ODDPUB_HOST_API=http://localhost:80 \ No newline at end of file +ODDPUB_HOST_API=http://localhost:8071 \ No newline at end of file From 8f11c26f0b9a083fe0b36aa9b2992b5af8bc6491 Mon Sep 17 00:00:00 2001 From: Joshua Lawrimore Date: Fri, 20 Dec 2024 17:27:44 -0500 Subject: [PATCH 21/21] Added platform to postgres-compose.yaml --- .docker/postgres-compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.docker/postgres-compose.yaml b/.docker/postgres-compose.yaml index 50d2a5e..87d99b4 100644 --- a/.docker/postgres-compose.yaml +++ b/.docker/postgres-compose.yaml @@ -1,5 +1,6 @@ services: postgres: + platform: linux/amd64 image: postgres:15 container_name: postgres_db env_file: "../.env"