From 67d6e2acd3c32435017bd132addf9ac23d458957 Mon Sep 17 00:00:00 2001 From: leej3 Date: Tue, 20 Aug 2024 10:44:45 +0100 Subject: [PATCH] reorg with fixes --- .dockerignore | 1 + .github/workflows/deploy.yml | 2 +- .pre-commit-config.yaml | 4 +- README.md | 30 ++- ....yaml => compose.development.override.yaml | 21 +- external_components/rtransparent/Dockerfile | 19 +- external_components/rtransparent/app.py | 78 +++++-- osm/_utils.py | 11 +- osm/cli.py | 35 ++-- osm/pipeline/core.py | 90 ++++++--- osm/pipeline/extractors.py | 44 ++-- osm/pipeline/parsers.py | 25 ++- osm/pipeline/savers.py | 100 +++++++-- osm/schemas/__init__.py | 1 - osm/schemas/metrics_schemas.py | 191 +++++++++++------- osm/schemas/schemas.py | 15 +- pyproject.toml | 1 + {web_api => web}/.env.template | 0 {web_api => web/api}/Dockerfile | 2 +- {web_api => web/api}/main.py | 28 ++- {web_api => web}/dashboard/Dockerfile | 6 +- {web_api => web}/dashboard/dashboard.py | 0 {web_api => web/deploy}/README.md | 0 {web_api => web/deploy}/compose.override.yaml | 0 {web_api => web/deploy}/compose.yaml | 0 {web_api => web/deploy}/deploy.py | 4 +- .../deploy}/docker-compose.yaml.j2 | 0 {web_api => web/deploy}/environment.yaml | 0 .../modules/shared_resources/main.tf | 0 .../deploy}/terraform/staging/main.tf | 0 .../deploy}/terraform/staging/variables.tf | 0 .../deploy}/terraform/state_storage/README.md | 0 .../state_storage/dynamodb-policy.json | 0 .../terraform/state_storage/state-storage.tf | 0 web_api/__init__.py | 0 35 files changed, 480 insertions(+), 228 deletions(-) create mode 100644 .dockerignore rename compose.override.yaml => compose.development.override.yaml (54%) rename {web_api => web}/.env.template (100%) rename {web_api => web/api}/Dockerfile (92%) rename {web_api => web/api}/main.py (70%) rename {web_api => web}/dashboard/Dockerfile (79%) rename {web_api => web}/dashboard/dashboard.py (100%) rename {web_api => web/deploy}/README.md (100%) rename {web_api => web/deploy}/compose.override.yaml 
(100%) rename {web_api => web/deploy}/compose.yaml (100%) rename {web_api => web/deploy}/deploy.py (98%) rename {web_api => web/deploy}/docker-compose.yaml.j2 (100%) rename {web_api => web/deploy}/environment.yaml (100%) rename {web_api => web/deploy}/terraform/modules/shared_resources/main.tf (100%) rename {web_api => web/deploy}/terraform/staging/main.tf (100%) rename {web_api => web/deploy}/terraform/staging/variables.tf (100%) rename {web_api => web/deploy}/terraform/state_storage/README.md (100%) rename {web_api => web/deploy}/terraform/state_storage/dynamodb-policy.json (100%) rename {web_api => web/deploy}/terraform/state_storage/state-storage.tf (100%) delete mode 100644 web_api/__init__.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..3fa8c86b --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.terraform diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 2f6d2bcc..49901fc1 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -40,7 +40,7 @@ jobs: - name: Build and push Docker image run: | - DOCKER_BUILDKIT=1 docker build -t osm_web_api:${{ github.event.inputs.environment || 'production' }} -f ./docker_images/web_api/Dockerfile . + DOCKER_BUILDKIT=1 docker build -t osm_web_api:${{ github.event.inputs.environment || 'production' }} -f ./web/app/Dockerfile . 
docker tag osm_web_api:${{ github.event.inputs.environment || 'production' }}:latest ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }} docker push ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48a29704..605efde9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,11 +19,11 @@ repos: hooks: - id: trailing-whitespace files: ".*\\.py" - exclude: "examples|docs/examples" + exclude: "examples|docs/examples|tests/data" - id: check-added-large-files - id: check-toml - id: end-of-file-fixer - exclude: "examples|docs/examples" + exclude: "examples|docs/examples|tests/data" # - repo: https://github.com/pre-commit/mirrors-prettier # rev: v4.0.0-alpha.8 diff --git a/README.md b/README.md index 36e5ea3f..6b837cf0 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,41 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to N.B. pdf parsing does not work on Apple silicon... -- With docker-compose and python >3.11 installed, runng the following from the project's root directory: +- With docker-compose and python >=3.11 installed, run the following from the project's root directory: ``` pip install . osm -f path/to/pdf-or-xml -u uuid ``` +If you have many files to upload you may with to start up the docker-compose dependencies in a separate terminal window: + +``` +docker compose up # docker-compose on some systems +``` + +And then tell the osm tool that this has been handled: + +``` +osm -f path/to/pdf-or-xml -u uuid --user-managed-compose +osm -f path/to/pdf-or-xml2 -u uuid2 --user-managed-compose +``` + +# Contributing + +If you wish to contribute to this project you can set up a development environment with the following: + +``` +pip install -e . 
+docker compose -f compose.yaml -f compose.development.override.yaml up --build +``` +And in another terminal: + +``` +osm -f path/to/pdf-or-xml -u uuid --user-managed-compose +``` + + ## Using pre-commit for commit checks Pre-commit will run all of its hooks on every commit you make. To install diff --git a/compose.override.yaml b/compose.development.override.yaml similarity index 54% rename from compose.override.yaml rename to compose.development.override.yaml index 8cd8cb13..015c12da 100644 --- a/compose.override.yaml +++ b/compose.development.override.yaml @@ -1,26 +1,37 @@ +name: local-osm services: rtransparent: build: context: . - dockerfile: docker_images/rtransparent/Dockerfile + dockerfile: ./external_components/rtransparent/Dockerfile volumes: - - ./docker_images/rtransparent:/app + - ./external_components/rtransparent:/app - osm_web_api: + web_api: environment: - MONGODB_URI=mongodb://db:27017/test build: context: . - dockerfile: ./docker_images/web_api/Dockerfile + dockerfile: ./web/api/Dockerfile ports: - 80:80 volumes: - - ./docker_images/web_api:/app/app + - ./web/api:/app/app working_dir: /app/app command: ["fastapi","dev","--host","0.0.0.0","--port","80"] depends_on: - db + dashboard: + build: + context: . 
+ dockerfile: ./web/dashboard/Dockerfile + environment: + - MONGODB_URI=mongodb://db:27017/test + working_dir: /app + ports: + - "8501:8501" + db: # use old version of mongo to avoid Apple Instruction set error image: mongo:4.4.6 diff --git a/external_components/rtransparent/Dockerfile b/external_components/rtransparent/Dockerfile index f21cf031..0b1b1d16 100644 --- a/external_components/rtransparent/Dockerfile +++ b/external_components/rtransparent/Dockerfile @@ -3,15 +3,7 @@ SHELL ["/bin/bash", "--login", "-c"] # Set working directory WORKDIR /app -# Install debugging tools -RUN apt-get update && apt-get install -y \ - git \ - curl \ - iputils-ping \ - net-tools \ - && rm -rf /var/lib/apt/lists/* - -COPY docker_images/rtransparent/environment.yaml /app +COPY external_components/rtransparent/environment.yaml /app # Create the environment RUN conda env create -f environment.yaml @@ -20,16 +12,13 @@ RUN conda env create -f environment.yaml RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate osm" | tee -a ~/.bashrc /etc/profile /etc/profile.d/conda.sh /etc/skel/.bashrc /etc/skel/.profile > /dev/null RUN R -e '\ -install.packages("roadoi", repos = "http://cran.us.r-project.org"); \ -devtools::install_github("quest-bih/oddpub"); \ -devtools::install_github("cran/crminer"); \ -devtools::install_github("serghiou/metareadr"); \ +devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863"); \ devtools::install_github("serghiou/rtransparent", build_vignettes = F)' # # Copy the project files and install the package -COPY docker_images/rtransparent/app.py /app +COPY external_components/rtransparent/app.py /app # Make entrypoint etc. 
convenient for users -COPY docker_images/_entrypoint.sh /usr/local/bin/_entrypoint.sh +COPY external_components/_entrypoint.sh /usr/local/bin/_entrypoint.sh ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"] diff --git a/external_components/rtransparent/app.py b/external_components/rtransparent/app.py index 35077478..1f40b6a2 100644 --- a/external_components/rtransparent/app.py +++ b/external_components/rtransparent/app.py @@ -2,9 +2,10 @@ import tempfile from pathlib import Path +import pandas as pd import psutil import rpy2.robjects as ro -from fastapi import FastAPI, HTTPException, Request, status +from fastapi import FastAPI, HTTPException, Query, Request, status from fastapi.responses import JSONResponse from pydantic import BaseModel from rpy2.robjects import pandas2ri @@ -44,7 +45,7 @@ def get_health() -> HealthCheck: def rtransparent_metric_extraction( - xml_content: bytes, workers: int = psutil.cpu_count() + xml_content: bytes, parser: str, workers: int = psutil.cpu_count() ): rtransparent = importr("rtransparent") future = importr("future") @@ -54,27 +55,74 @@ def rtransparent_metric_extraction( with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as temp_xml_file: temp_xml_file.write(xml_content) temp_xml_file_path = temp_xml_file.name - - with (ro.default_converter + pandas2ri.converter).context(): - df = ro.conversion.get_conversion().rpy2py( - rtransparent.rt_all(temp_xml_file_path) - ) - + if parser == "pmc": + df = extract_from_pmc_xml(temp_xml_file_path, rtransparent) + else: + df = extract_from_xml(temp_xml_file_path, rtransparent) # Clean up the temporary file temp_xml_file.close() Path(temp_xml_file_path).unlink() - return df -# from osm.schemas import Invocation -@app.post("/extract-metrics") -async def extract_metrics(request: Request): +def extract_from_xml(temp_xml_file_path, rtransparent): + dfs = {} + with (ro.default_converter + pandas2ri.converter).context(): + 
dfs["data_code"] = ro.conversion.get_conversion().rpy2py( + rtransparent.rt_data_code(temp_xml_file_path) + ) + # "all" contains fund, register, and coi outputs + with (ro.default_converter + pandas2ri.converter).context(): + dfs["all"] = ro.conversion.get_conversion().rpy2py( + rtransparent.rt_all(temp_xml_file_path) + ) + return pd.concat([dfs["all"], dfs["data_code"].drop(columns=["article"])], axis=1) + + +def extract_from_pmc_xml(temp_xml_file_path, rtransparent): + raise NotImplementedError( + "Not all XML files provided at pubmedcentral include the datasharing statements." + ) + # dfs = {} + # with (ro.default_converter + pandas2ri.converter).context(): + # dfs["meta_pmc"] = ro.conversion.get_conversion().rpy2py( + # rtransparent.rt_meta_pmc(temp_xml_file_path) + # ) + # # data_code_pmc is a subset of all_pmc + # with (ro.default_converter + pandas2ri.converter).context(): + # dfs["all_pmc"] = ro.conversion.get_conversion().rpy2py( + # rtransparent.rt_all_pmc(temp_xml_file_path) + # ) + # return pd.concat( + # [ + # dfs["all_pmc"], + # dfs["meta_pmc"].drop( + # columns=["doi", "filename", "is_success", "pmcid_pmc", "pmid"] + # ), + # ], + # axis=1, + # ) + + +@app.post("/extract-metrics/") +async def extract_metrics(request: Request, parser: str = Query("other")): try: + # Attempt to read the XML content from the request body xml_content = await request.body() - metrics_df = rtransparent_metric_extraction(xml_content) + if not xml_content: + raise NotImplementedError( + """For now the XML content must be provided. 
Check the output of + the parsing stage.""" + ) + + metrics_df = rtransparent_metric_extraction(xml_content, parser) + + # Log the extracted metrics logger.info(metrics_df) - metrics_json = metrics_df.to_json(orient="records") - return JSONResponse(content=metrics_json, status_code=200) + + # Return the first row as a JSON response + return JSONResponse(content=metrics_df.iloc[0].to_dict(), status_code=200) + except Exception as e: + # Handle exceptions and return a 500 Internal Server Error raise HTTPException(status_code=500, detail=str(e)) diff --git a/osm/_utils.py b/osm/_utils.py index 3bdba06f..2c6e3d8c 100644 --- a/osm/_utils.py +++ b/osm/_utils.py @@ -40,9 +40,6 @@ def get_compute_context_id(): def _upload_data(args, file_in, xml, metrics, components): - """ - TODO: add in derivatives and components - """ osm_api = os.environ.get("OSM_API", "http://localhost:80") payload = { @@ -106,11 +103,19 @@ def _setup(args): if args.filepath.name.endswith(".pdf"): if xml_path.exists(): raise FileExistsError(xml_path) + elif args.filepath.name.endswith(".xml"): + logger.warning( + """The input file is an xml file. 
Skipping the pdf to text + conversion and so ignoring requested parsers.""" + ) + args.parser = ["no-op"] metrics_path = _get_metrics_dir() / f"{args.uid}.json" if metrics_path.exists(): raise FileExistsError(metrics_path) if not args.user_managed_compose: compose_up() + logger.info("Waiting for containers to be ready...") + print("Waiting for containers to be ready...") wait_for_containers() return xml_path, metrics_path diff --git a/osm/cli.py b/osm/cli.py index 527a6872..3d9a9883 100644 --- a/osm/cli.py +++ b/osm/cli.py @@ -1,13 +1,14 @@ import argparse from osm._utils import DEFAULT_OUTPUT_DIR, _existing_file, _setup, compose_down -from osm.pipeline.core import Pipeline +from osm.pipeline.core import Pipeline, Savers from osm.pipeline.extractors import RTransparentExtractor -from osm.pipeline.parsers import ScienceBeamParser -from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver, Savers +from osm.pipeline.parsers import NoopParser, ScienceBeamParser +from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver PARSERS = { "sciencebeam": ScienceBeamParser, + "no-op": NoopParser, } EXTRACTORS = { "rtransparent": RTransparentExtractor, @@ -72,14 +73,22 @@ def main(): args = parse_args() try: xml_path, metrics_path = _setup(args) + pipeline = Pipeline( filepath=args.filepath, xml_path=xml_path, metrics_path=metrics_path, - parsers=[PARSERS[p] for p in args.parser], - extractors=[EXTRACTORS[m] for m in args.metrics_type], + parsers=[PARSERS[p]() for p in args.parser], + extractors=[EXTRACTORS[m]() for m in args.metrics_type], savers=Savers( - file_saver=FileSaver(), json_saver=JSONSaver(), osm_saver=OSMSaver() + file_saver=FileSaver(), + json_saver=JSONSaver(), + osm_saver=OSMSaver( + comment=args.comment, + email=args.email, + user_defined_id=args.uid, + filename=args.filepath.name, + ), ), ) pipeline.run() @@ -90,17 +99,3 @@ def main(): if __name__ == "__main__": main() - -# def main(): -# args = parse_args() -# try: -# pipeline = _setup(args) -# 
pipeline.parse() -# pipeline.extract() -# pipeline.save() -# xml_path, metrics_path, parser, extractor = _setup(args) -# xml = parser.parse() -# xml_path.write_bytes(xml) -# metrics = _extract(xml) -# metrics_path.write_text(json.dumps(metrics)) -# _upload_data(args, file_in, xml, metrics,components) diff --git a/osm/pipeline/core.py b/osm/pipeline/core.py index 229e80a4..91c549c6 100644 --- a/osm/pipeline/core.py +++ b/osm/pipeline/core.py @@ -1,50 +1,81 @@ from abc import ABC, abstractmethod -from typing import Optional +from pathlib import Path +from typing import Any, Optional +from osm import schemas -# Parser Interface -class Parser(ABC): - @abstractmethod - def parse(self, data: bytes) -> str: - pass +class Component(ABC): + def __init__(self, version: str = "0.0.1"): + """As subclasses evolve they should keep track of their version.""" + self.version = version + self.docker_image = None + self.docker_image_id = None + self._name = None + self._orm_model = None -# Extractor Interface -class Extractor(ABC): @abstractmethod - def extract(self, data: str) -> dict: + def run(self, data: Any, **kwargs) -> Any: pass + def _get_model_fields(self) -> dict[str, Any]: + return { + "name": self.name, + "version": self.version, + } -# Saver Interface -class Saver(ABC): - @abstractmethod - def save(self, data: dict): - pass + @property + def name(self) -> str: + return self.__class__.__name__ + + @property + def orm_model(self) -> schemas.Component: + if self._orm_model is None: + self._orm_model = schemas.Component( + **self._get_model_fields(), + ) + return self._orm_model + + def model_dump(self) -> dict[str, Any]: + """Return a dict of the components model.""" + return self.orm_model.model_dump() class Savers: - def __init__(self, file_saver: Saver, json_saver: Saver, osm_saver: Saver): + def __init__( + self, file_saver: Component, json_saver: Component, osm_saver: Component + ): self.file_saver = file_saver self.json_saver = json_saver self.osm_saver = osm_saver - 
def save_file(self, data: str): - self.file_saver.save(data) + def __iter__(self): + yield self.file_saver + yield self.json_saver + yield self.osm_saver - def save_json(self, data: dict): - self.json_saver.save(data) + def save_file(self, data: str, path: Path): + self.file_saver.run(data, path) - def save_osm(self, data: dict): - self.osm_saver.save(data) + def save_json(self, data: dict, path: Path): + self.json_saver.run(data, path) + + def save_osm( + self, + file_in: bytes, + metrics: dict, + components: list, + ): + # Call the method to save or upload the data + self.osm_saver.run(file_in, metrics, components) class Pipeline: def __init__( self, *, - parsers: list[Parser], - extractors: list[Extractor], + parsers: list[Component], + extractors: list[Component], savers: Savers, filepath: str, xml_path: Optional[str] = None, @@ -60,17 +91,16 @@ def __init__( def run(self): for parser in self.parsers: - parsed_data = parser.parse(self.file_data) + parsed_data = parser.run(self.file_data) if isinstance(parsed_data, str): self.savers.save_file(parsed_data, self.xml_path) for extractor in self.extractors: - extracted_metrics = extractor.extract(parsed_data) + # extracted_metrics = extractor.run(parsed_data,parser=parser.name) + extracted_metrics = extractor.run(parsed_data) self.savers.save_osm( - { - "parser": parser.__class__.__name__, - "extractor": extractor.__class__.__name__, - "metrics": extracted_metrics, - } + file_in=self.file_data, + metrics=extracted_metrics, + components=[*self.parsers, *self.extractors, *self.savers], ) self.savers.save_json(extracted_metrics, self.metrics_path) diff --git a/osm/pipeline/extractors.py b/osm/pipeline/extractors.py index 30b7f6d5..b36f0717 100644 --- a/osm/pipeline/extractors.py +++ b/osm/pipeline/extractors.py @@ -1,22 +1,34 @@ -import json +import logging import requests - -def _extract(xml: bytes) -> str: - """Extracts metrics from an XML. - - Args: - xml: Raw bytes for an xml file. 
- """ - headers = {"Content-Type": "application/octet-stream"} - response = requests.post( - "http://localhost:8071/extract-metrics", data=xml, headers=headers - ) - if response.status_code == 200: - return json.loads(response.json())[0] - else: - response.raise_for_status() +from .core import Component + +logger = logging.getLogger(__name__) + + +class RTransparentExtractor(Component): + def run(self, data: str, parser: str = None) -> dict: + headers = {"Content-Type": "application/octet-stream"} + response = requests.post( + "http://localhost:8071/extract-metrics", + data=data, + headers=headers, + params={"parser": parser}, + ) + if response.status_code == 200: + metrics = response.json() + # pmid only exists when input filename is correct + metrics.pop("pmid") + # replace bizarre sentinel value + for k, v in metrics.items(): + if v == -2147483648: + metrics[k] = None + return metrics + else: + breakpoint() + logger.error(f"Error: {response.text}") + response.raise_for_status() # import psutil diff --git a/osm/pipeline/parsers.py b/osm/pipeline/parsers.py index 4c3b4f42..75136ec4 100644 --- a/osm/pipeline/parsers.py +++ b/osm/pipeline/parsers.py @@ -1,17 +1,20 @@ import requests +from .core import Component + SCIENCEBEAM_URL = "http://localhost:8070/api/convert" -def _convert(pdf: bytes) -> str: - """Converts a PDF to a an XML. +class NoopParser(Component): + def run(self, data: bytes) -> str: + return data.decode("utf-8") + - Args: - pdf: Path to the input PDF file. 
- """ - headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"} - response = requests.post(SCIENCEBEAM_URL, data=pdf, headers=headers) - if response.status_code == 200: - return response.text - else: - response.raise_for_status() +class ScienceBeamParser(Component): + def run(self, data: bytes) -> str: + headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"} + response = requests.post(SCIENCEBEAM_URL, data=data, headers=headers) + if response.status_code == 200: + return response.text + else: + response.raise_for_status() diff --git a/osm/pipeline/savers.py b/osm/pipeline/savers.py index 9237c7f5..ffc6e4ee 100644 --- a/osm/pipeline/savers.py +++ b/osm/pipeline/savers.py @@ -1,29 +1,89 @@ +import base64 +import hashlib import json +import logging +import os +from pathlib import Path -from .core import Saver +import requests +from pydantic import ValidationError +from osm._utils import get_compute_context_id +from osm._version import __version__ +from osm.schemas import Invocation -class FileSaver(Saver): - def save(self, data: str): - with open("output.xml", "w") as file: - file.write(data) +from .core import Component +logger = logging.getLogger(__name__) -class JSONSaver(Saver): - def save(self, data: dict): - with open("output.json", "w") as file: - json.dump(data, file) +class FileSaver(Component): + def run(self, data: str, path: Path): + path.write_text(data) -class OSMSaver(Saver): - def save(self, data: dict): - # Assuming there's a method to post data to an endpoint - response = self.post_to_osm(data) - if response.status_code != 200: - raise Exception("Failed to save metrics to OSM") - def post_to_osm(self, data: dict): - # Mock implementation: Replace with actual HTTP request - print(f"Posting to OSM: {data}") - # TODO - pass +class JSONSaver(Component): + def run(self, data: dict, path: Path): + path.write_text(json.dumps(data)) + + +class OSMSaver(Component): + def __init__(self, comment, email, 
user_defined_id, filename): + super().__init__() + self.compute_context_id = get_compute_context_id() + self.comment = comment + self.email = email + self.user_defined_id = user_defined_id + self.filename = filename + + def run(self, file_in: bytes, metrics: dict, components: list): + osm_api = os.environ.get("OSM_API", "http://localhost:80") + # Build the payload + payload = { + "osm_version": __version__, + "user_comment": self.comment, + "work": { + "user_defined_id": self.user_defined_id, + "filename": self.filename, + "file": base64.b64encode(file_in).decode("utf-8"), + "content_hash": hashlib.sha256(file_in).hexdigest(), + }, + "client": { + "compute_context_id": self.compute_context_id, + "email": self.email, + }, + "metrics": metrics, + "components": [comp.model_dump() for comp in components], + } + try: + # Validate the payload + validated_data = Invocation(**payload) + # If validation passes, send POST request to OSM API. ID is not + # serializable but can be excluded and created by the DB. All types + # should be serializable. 
If they're not then a they should be encoded + # as a string or something like that: base64.b64encode(bytes).decode("utf-8") + response = requests.put( + f"{osm_api}/upload", json=validated_data.model_dump(exclude=["id"]) + ) + if response.status_code == 200: + print("Invocation data uploaded successfully") + else: + raise ValueError( + f"Failed to upload invocation data: \n {response.text}" + ) + except (ValidationError, ValueError) as e: + breakpoint() + try: + payload["upload_error"] = str(e) + # Quarantine the failed payload + response = requests.put(f"{osm_api}/quarantine", json=payload) + response.raise_for_status() + except requests.exceptions.RequestException as qe: + requests.put( + f"{osm_api}/quarantine", + json={ + "upload_error": str(e), + "recovery_error": str(qe), + }, + ) + logger.warning(f"Validation failed: {e}") diff --git a/osm/schemas/__init__.py b/osm/schemas/__init__.py index 71a2c66b..5f4eae77 100644 --- a/osm/schemas/__init__.py +++ b/osm/schemas/__init__.py @@ -1,6 +1,5 @@ from .schemas import Client as Client from .schemas import Component as Component -from .schemas import Derivative as Derivative from .schemas import Invocation as Invocation from .schemas import RtransparentMetrics as RtransparentMetrics from .schemas import Work as Work diff --git a/osm/schemas/metrics_schemas.py b/osm/schemas/metrics_schemas.py index 28f31186..48985a84 100644 --- a/osm/schemas/metrics_schemas.py +++ b/osm/schemas/metrics_schemas.py @@ -2,21 +2,29 @@ from odmantic import EmbeddedModel +# The rtransparent tool can extract from parsed pdfs or from XML directly from pubmed central. The latter has many more fields. 
+ +# all_indicators.csv from the rtransparent publication has both but has the following extra fields: +# code_text,com_code,com_data_availibility,com_file_formats,com_general_db,com_github_data,com_specific_db,com_suppl_code,com_supplemental_data,data_text,dataset,eigenfactor_score,field,is_art,is_code_pred,is_data_pred,is_relevant_code,is_relevant_data,jif,n_cite,score,year, class RtransparentMetrics(EmbeddedModel): + # Mandatory fields + is_open_code: Optional[bool] + is_open_data: Optional[bool] + + # Optional fields + year: Optional[float] = None + filename: Optional[str] = None pmcid_pmc: Optional[int] = None - pmid: Optional[int] = None + pmid: Optional[float] = None doi: Optional[str] = None - filename: Optional[str] = None - year: Optional[int] = None - year_epub: Optional[int] = None - year_ppub: Optional[int] = None + year_epub: Optional[float] = None + year_ppub: Optional[float] = None journal: Optional[str] = None publisher: Optional[str] = None affiliation_country: Optional[str] = None affiliation_institution: Optional[str] = None type: Optional[str] = None - is_data_pred: Optional[bool] = None data_text: Optional[str] = None is_relevant_data: Optional[bool] = None com_specific_db: Optional[str] = None @@ -26,7 +34,6 @@ class RtransparentMetrics(EmbeddedModel): com_file_formats: Optional[str] = None com_supplemental_data: Optional[str] = None com_data_availibility: Optional[str] = None - is_code_pred: Optional[bool] = None code_text: Optional[str] = None is_relevant_code: Optional[bool] = None com_code: Optional[str] = None @@ -35,74 +42,73 @@ class RtransparentMetrics(EmbeddedModel): coi_text: Optional[str] = None is_coi_pmc_fn: Optional[bool] = None is_coi_pmc_title: Optional[str] = None - is_relevant_coi: Optional[str] = None - is_relevant_coi_hi: Optional[str] = None - is_relevant_coi_lo: Optional[str] = None + is_relevant_coi: Optional[bool] = None + is_relevant_coi_hi: Optional[bool] = None + is_relevant_coi_lo: Optional[bool] = None 
is_explicit_coi: Optional[str] = None - coi_1: Optional[str] = None - coi_2: Optional[str] = None - coi_disclosure_1: Optional[str] = None - commercial_1: Optional[str] = None - benefit_1: Optional[str] = None - consultant_1: Optional[str] = None - grants_1: Optional[str] = None - brief_1: Optional[str] = None - fees_1: Optional[str] = None - consults_1: Optional[str] = None - connect_1: Optional[str] = None - connect_2: Optional[str] = None - commercial_ack_1: Optional[str] = None - rights_1: Optional[str] = None - founder_1: Optional[str] = None - advisor_1: Optional[str] = None - paid_1: Optional[str] = None - board_1: Optional[str] = None - no_coi_1: Optional[str] = None - no_funder_role_1: Optional[str] = None - is_fund_pred: Optional[bool] = None + coi_1: Optional[bool] = None + coi_2: Optional[bool] = None + coi_disclosure_1: Optional[bool] = None + commercial_1: Optional[bool] = None + benefit_1: Optional[bool] = None + consultant_1: Optional[bool] = None + grants_1: Optional[bool] = None + brief_1: Optional[bool] = None + fees_1: Optional[bool] = None + consults_1: Optional[bool] = None + connect_1: Optional[bool] = None + connect_2: Optional[bool] = None + commercial_ack_1: Optional[bool] = None + rights_1: Optional[bool] = None + founder_1: Optional[bool] = None + advisor_1: Optional[bool] = None + paid_1: Optional[bool] = None + board_1: Optional[bool] = None + no_coi_1: Optional[bool] = None + no_funder_role_1: Optional[bool] = None fund_text: Optional[str] = None fund_pmc_institute: Optional[str] = None fund_pmc_source: Optional[str] = None fund_pmc_anysource: Optional[str] = None is_fund_pmc_group: Optional[bool] = None - is_fund_pmc_title: Optional[str] = None - is_fund_pmc_anysource: Optional[str] = None - is_relevant_fund: Optional[str] = None - is_explicit_fund: Optional[str] = None - support_1: Optional[str] = None - support_3: Optional[str] = None - support_4: Optional[str] = None - support_5: Optional[str] = None - support_6: Optional[str] = 
None - support_7: Optional[str] = None - support_8: Optional[str] = None - support_9: Optional[str] = None - support_10: Optional[str] = None - developed_1: Optional[str] = None - received_1: Optional[str] = None - received_2: Optional[str] = None - recipient_1: Optional[str] = None - authors_1: Optional[str] = None - authors_2: Optional[str] = None - thank_1: Optional[str] = None - thank_2: Optional[str] = None - fund_1: Optional[str] = None - fund_2: Optional[str] = None - fund_3: Optional[str] = None - supported_1: Optional[str] = None - financial_1: Optional[str] = None - financial_2: Optional[str] = None - financial_3: Optional[str] = None - grant_1: Optional[str] = None - french_1: Optional[str] = None - common_1: Optional[str] = None - common_2: Optional[str] = None - common_3: Optional[str] = None - common_4: Optional[str] = None - common_5: Optional[str] = None - acknow_1: Optional[str] = None - disclosure_1: Optional[str] = None - disclosure_2: Optional[str] = None + is_fund_pmc_title: Optional[bool] = None + is_fund_pmc_anysource: Optional[bool] = None + is_relevant_fund: Optional[bool] = None + is_explicit_fund: Optional[bool] = None + support_1: Optional[bool] = None + support_3: Optional[bool] = None + support_4: Optional[bool] = None + support_5: Optional[bool] = None + support_6: Optional[bool] = None + support_7: Optional[bool] = None + support_8: Optional[bool] = None + support_9: Optional[bool] = None + support_10: Optional[bool] = None + developed_1: Optional[bool] = None + received_1: Optional[bool] = None + received_2: Optional[bool] = None + recipient_1: Optional[bool] = None + authors_1: Optional[bool] = None + authors_2: Optional[bool] = None + thank_1: Optional[bool] = None + thank_2: Optional[bool] = None + fund_1: Optional[bool] = None + fund_2: Optional[bool] = None + fund_3: Optional[bool] = None + supported_1: Optional[bool] = None + financial_1: Optional[bool] = None + financial_2: Optional[bool] = None + financial_3: Optional[bool] 
= None + grant_1: Optional[bool] = None + french_1: Optional[bool] = None + common_1: Optional[bool] = None + common_2: Optional[bool] = None + common_3: Optional[bool] = None + common_4: Optional[bool] = None + common_5: Optional[bool] = None + acknow_1: Optional[bool] = None + disclosure_1: Optional[bool] = None + disclosure_2: Optional[bool] = None fund_ack: Optional[str] = None project_ack: Optional[str] = None is_register_pred: Optional[bool] = None @@ -110,9 +116,9 @@ class RtransparentMetrics(EmbeddedModel): is_research: Optional[bool] = None is_review: Optional[bool] = None is_reg_pmc_title: Optional[bool] = None - is_relevant_reg: Optional[str] = None - is_method: Optional[str] = None - is_NCT: Optional[str] = None + is_relevant_reg: Optional[bool] = None + is_method: Optional[bool] = None + is_NCT: Optional[bool] = None is_explicit_reg: Optional[str] = None prospero_1: Optional[str] = None registered_1: Optional[str] = None @@ -135,12 +141,49 @@ class RtransparentMetrics(EmbeddedModel): ct_3: Optional[str] = None protocol_1: Optional[str] = None is_success: Optional[bool] = None - is_art: Optional[str] = None + is_art: Optional[bool] = None field: Optional[str] = None score: Optional[int] = None jif: Optional[float] = None eigenfactor_score: Optional[float] = None - n_cite: Optional[int] = None + n_cite: Optional[float] = None + # some extra fields + affiliation_aff_id: Optional[str] = None + affiliation_all: Optional[str] = None + article: Optional[str] = None + author: Optional[str] = None + author_aff_id: Optional[str] = None + correspondence: Optional[str] = None + date_epub: Optional[str] = None + date_ppub: Optional[str] = None + funding_text: Optional[str] = None + is_explicit: Optional[bool] = None + is_fund_pred: Optional[bool] = None + is_funded_pred: Optional[bool] = None + is_relevant: Optional[bool] = None + is_supplement: Optional[bool] = None + issn_epub: Optional[str] = None + issn_ppub: Optional[str] = None + journal_iso: Optional[str] = 
None + journal_nlm: Optional[str] = None + license: Optional[str] = None + n_affiliation: Optional[str] = None + n_auth: Optional[str] = None + n_fig_body: Optional[str] = None + n_fig_floats: Optional[str] = None + n_ref: Optional[str] = None + n_table_body: Optional[str] = None + n_table_floats: Optional[str] = None + open_code_statements: Optional[str] = None + open_data_category: Optional[str] = None + open_data_statements: Optional[str] = None + pii: Optional[str] = None + pmcid_uid: Optional[str] = None + publisher_id: Optional[str] = None + subject: Optional[str] = None + title: Optional[str] = None + is_data_pred: Optional[bool] = None + is_code_pred: Optional[bool] = None # Tried to define programmatically but both ways seemed to yield a model class without type annotated fields... diff --git a/osm/schemas/schemas.py b/osm/schemas/schemas.py index f066cd34..0516c3a2 100644 --- a/osm/schemas/schemas.py +++ b/osm/schemas/schemas.py @@ -9,8 +9,8 @@ class Component(EmbeddedModel): name: str version: str - docker_image: str - docker_image_id: str + docker_image: Optional[str] = None + docker_image_id: Optional[str] = None class Client(EmbeddedModel): @@ -33,7 +33,7 @@ class Work(EmbeddedModel): doi: Optional[str] = None openalex_id: Optional[str] = None scopus_id: Optional[str] = None - filename: str + filename: str = "" file: Optional[str] = None content_hash: Optional[str] = None @@ -44,13 +44,14 @@ class Invocation(Model): for the Odmantic document model used to interact with mongodb. 
""" - osm_version: str - user_comment: Optional[str] - client: Client work: Work - # Potentially link to other databases for extra metadata but for now will just use component outputs metrics: RtransparentMetrics + components: list[Component] + client: Client + user_comment: Optional[str] = "" + osm_version: str + # Potentially link to other databases for extra metadata but for now will just use component outputs # Rtransparent: Component.construct(name="rtransparent", version="0.13", docker_image="nimh-dsst/rtransparent:0.13", docker_image_id="dsjfkldsjflkdsjlf2jkl23j") diff --git a/pyproject.toml b/pyproject.toml index dabee885..d8298b86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ keywords = [ dynamic = ["version"] dependencies = [ "pydantic", + "pydantic[email]", "odmantic", "requests", ] diff --git a/web_api/.env.template b/web/.env.template similarity index 100% rename from web_api/.env.template rename to web/.env.template diff --git a/web_api/Dockerfile b/web/api/Dockerfile similarity index 92% rename from web_api/Dockerfile rename to web/api/Dockerfile index fdf7643e..7c7223d9 100644 --- a/web_api/Dockerfile +++ b/web/api/Dockerfile @@ -9,6 +9,6 @@ COPY osm /opt/osm/osm ARG PSEUDO_VERSION=0.0.1 # strongly recommended to update based on git describe RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_OSM=${PSEUDO_VERSION} pip install -e /opt/osm RUN --mount=source=.git,target=/opt/osm/.git,type=bind pip install -e /opt/osm -COPY ./web_api/main.py /app/app/main.py +COPY ./web/api/main.py /app/app/main.py CMD ["fastapi", "run", "--host", "0.0.0.0", "--port", "80", "--root-path", "/api"] diff --git a/web_api/main.py b/web/api/main.py similarity index 70% rename from web_api/main.py rename to web/api/main.py index 3025ff36..37729516 100644 --- a/web_api/main.py +++ b/web/api/main.py @@ -47,6 +47,27 @@ async def get_invocation_by_id(id: ObjectId): return invocation +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { 
+ "default": { + "()": "logging.Formatter", + "fmt": "%(levelname)s %(name)s@%(lineno)d %(message)s", + }, + }, + "handlers": { + "default": { + "formatter": "default", + "class": "my_project.ColorStreamHandler", + "stream": "ext://sys.stderr", + }, + }, + "loggers": { + "": {"handlers": ["default"], "level": "TRACE"}, + }, +} + if __name__ == "__main__": import asyncio @@ -54,7 +75,12 @@ async def get_invocation_by_id(id: ObjectId): loop = asyncio.get_event_loop() config = uvicorn.Config( - app=app, host="0.0.0.0", port=80, root_path="/api", loop=loop + app=app, + host="0.0.0.0", + port=80, + root_path="/api", + loop=loop, + log_config=LOGGING_CONFIG, ) server = uvicorn.Server(config) loop.run_until_complete(server.serve()) diff --git a/web_api/dashboard/Dockerfile b/web/dashboard/Dockerfile similarity index 79% rename from web_api/dashboard/Dockerfile rename to web/dashboard/Dockerfile index c2497c07..2707a7d4 100644 --- a/web_api/dashboard/Dockerfile +++ b/web/dashboard/Dockerfile @@ -2,8 +2,8 @@ FROM tiangolo/uvicorn-gunicorn:python3.11 WORKDIR /app -ENV LOCAL_DATA_PATH=/opt/from_mongo.feather -COPY from_mongo.feather /opt/from_mongo.feather +ENV LOCAL_DATA_PATH=/opt/dashboard_data.feather +COPY ./dashboard_data.feather /opt/dashboard_data.feather # Create the environment RUN pip install holoviews panel pymongo odmantic pandas pyarrow pydantic[email] @@ -16,5 +16,5 @@ RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_OSM=${PSEUDO_VERSION} pip install -e /opt RUN --mount=source=.git,target=/opt/osm/.git,type=bind pip install -e /opt/osm # # Copy the project files and install the package -COPY web_api/dashboard/dashboard.py /app +COPY web/dashboard/dashboard.py /app CMD ["python", "dashboard.py"] diff --git a/web_api/dashboard/dashboard.py b/web/dashboard/dashboard.py similarity index 100% rename from web_api/dashboard/dashboard.py rename to web/dashboard/dashboard.py diff --git a/web_api/README.md b/web/deploy/README.md similarity index 100% rename from web_api/README.md 
rename to web/deploy/README.md diff --git a/web_api/compose.override.yaml b/web/deploy/compose.override.yaml similarity index 100% rename from web_api/compose.override.yaml rename to web/deploy/compose.override.yaml diff --git a/web_api/compose.yaml b/web/deploy/compose.yaml similarity index 100% rename from web_api/compose.yaml rename to web/deploy/compose.yaml diff --git a/web_api/deploy.py b/web/deploy/deploy.py similarity index 98% rename from web_api/deploy.py rename to web/deploy/deploy.py index 3c5a688a..9cc6d6ea 100644 --- a/web_api/deploy.py +++ b/web/deploy/deploy.py @@ -47,11 +47,11 @@ def build_and_push_docker_images(): print("Building and pushing Docker images...") run_command( - f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DOCKER_IMAGE_TAG')} -f ./web_api/Dockerfile ." + f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DOCKER_IMAGE_TAG')} -f ./web/api/Dockerfile ." ) run_command(f"docker push {os.getenv('DOCKER_IMAGE_TAG')}") run_command( - f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DASHBOARD_IMAGE_TAG')} -f ./web_api/dashboard/Dockerfile ." + f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DASHBOARD_IMAGE_TAG')} -f ./web/dashboard/Dockerfile ."
) run_command(f"docker push {os.getenv('DASHBOARD_IMAGE_TAG')}") diff --git a/web_api/docker-compose.yaml.j2 b/web/deploy/docker-compose.yaml.j2 similarity index 100% rename from web_api/docker-compose.yaml.j2 rename to web/deploy/docker-compose.yaml.j2 diff --git a/web_api/environment.yaml b/web/deploy/environment.yaml similarity index 100% rename from web_api/environment.yaml rename to web/deploy/environment.yaml diff --git a/web_api/terraform/modules/shared_resources/main.tf b/web/deploy/terraform/modules/shared_resources/main.tf similarity index 100% rename from web_api/terraform/modules/shared_resources/main.tf rename to web/deploy/terraform/modules/shared_resources/main.tf diff --git a/web_api/terraform/staging/main.tf b/web/deploy/terraform/staging/main.tf similarity index 100% rename from web_api/terraform/staging/main.tf rename to web/deploy/terraform/staging/main.tf diff --git a/web_api/terraform/staging/variables.tf b/web/deploy/terraform/staging/variables.tf similarity index 100% rename from web_api/terraform/staging/variables.tf rename to web/deploy/terraform/staging/variables.tf diff --git a/web_api/terraform/state_storage/README.md b/web/deploy/terraform/state_storage/README.md similarity index 100% rename from web_api/terraform/state_storage/README.md rename to web/deploy/terraform/state_storage/README.md diff --git a/web_api/terraform/state_storage/dynamodb-policy.json b/web/deploy/terraform/state_storage/dynamodb-policy.json similarity index 100% rename from web_api/terraform/state_storage/dynamodb-policy.json rename to web/deploy/terraform/state_storage/dynamodb-policy.json diff --git a/web_api/terraform/state_storage/state-storage.tf b/web/deploy/terraform/state_storage/state-storage.tf similarity index 100% rename from web_api/terraform/state_storage/state-storage.tf rename to web/deploy/terraform/state_storage/state-storage.tf diff --git a/web_api/__init__.py b/web_api/__init__.py deleted file mode 100644 index e69de29b..00000000