diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 00000000..2f6d2bcc
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,66 @@
+name: Deploy to AWS EC2 with Terraform
+
+on:
+  workflow_dispatch:
+    inputs:
+      environment:
+        description: 'Deployment environment'
+        required: true
+        default: staging
+        type: choice
+        options:
+          - staging
+  push:
+    tags:
+      - 'v*.*.*' # Matches version tags like v1.0.0
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          install: true
+
+      - name: Cache Docker layers
+        uses: actions/cache@v3
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+
+      - name: Log in to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Build and push Docker image
+        run: |
+          DOCKER_BUILDKIT=1 docker build -t osm_web_api:${{ github.event.inputs.environment || 'production' }} -f ./docker_images/web_api/Dockerfile .
+          docker tag osm_web_api:${{ github.event.inputs.environment || 'production' }} ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }}
+          docker push ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }}
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v2
+
+      - name: Terraform Init
+        run: terraform init -backend-config="path/to/${{ github.event.inputs.environment || 'production' }}/backend-config"
+
+      - name: Terraform Plan
+        run: terraform plan -var-file="terraform/${{ github.event.inputs.environment || 'production' }}.tfvars"
+
+      - name: Terraform Apply
+        if: success()
+        run: terraform apply -var-file="terraform/${{ github.event.inputs.environment || 'production' }}.tfvars" -auto-approve
+
+      - name: Notify Success
+        if: success()
+        run: echo "Deployment to ${{ github.event.inputs.environment || 'production' }} environment was successful."
+
+      - name: Notify Failure
+        if: failure()
+        run: echo "Deployment to ${{ github.event.inputs.environment || 'production' }} environment failed."
diff --git a/.gitignore b/.gitignore
index feb91d9c..e671b5ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ __pycache__/
 *.pyd
 *.env
 *.egg-info/
+*.tfstate
 dist/
 build/
 .tox/
@@ -17,3 +18,6 @@ venv/
 .vscode/settings.json
 .DS_Store
 osm_output
+.terraform
+.terraform.lock.hcl
+.public_dns
diff --git a/compose.override.yaml b/compose.override.yaml
index 91a82271..8cd8cb13 100644
--- a/compose.override.yaml
+++ b/compose.override.yaml
@@ -9,7 +9,6 @@ services:
   osm_web_api:
     environment:
       - MONGODB_URI=mongodb://db:27017/test
-      # - MONGODB_URI=mongodb://mongoadmin:secret@db:27017/osm
     build:
       context: .
dockerfile: ./docker_images/web_api/Dockerfile @@ -23,10 +22,9 @@ services: - db db: + # use old version of mongo to avoid Apple Instruction set error image: mongo:4.4.6 ports: - 27017:27017 environment: - MONGO_INITDB_DATABASE=test - # - MONGO_INITDB_ROOT_USERNAME=mongoadmin - # - MONGO_INITDB_ROOT_PASSWORD=secret diff --git a/docker_images/web_api/Dockerfile b/docker_images/web_api/Dockerfile deleted file mode 100644 index 4ec7a9de..00000000 --- a/docker_images/web_api/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM tiangolo/uvicorn-gunicorn:python3.11 - -WORKDIR /app - -COPY ./docker_images/web_api/requirements.txt /app/app/requirements.txt -RUN pip install --no-cache-dir --upgrade -r /app/app/requirements.txt - -# Consider installing from pypi -RUN mkdir -p /opt/osm -COPY pyproject.toml /opt/osm -COPY osm /opt/osm/osm -COPY .git /opt/osm/.git -RUN pip install /opt/osm - -COPY ./docker_images/web_api/main.py /app/app/main.py diff --git a/docker_images/web_api/compose.yaml b/docker_images/web_api/compose.yaml deleted file mode 100644 index e4dc7156..00000000 --- a/docker_images/web_api/compose.yaml +++ /dev/null @@ -1,10 +0,0 @@ -services: - osm_web_api: - image: osm_web_api - environment: - - MONGODB_URI="mongodb+srv://johnlee:@cluster0.6xo8ws7.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0" - build: - context: ../.. - dockerfile: ./docker_images/web_api/Dockerfile - ports: - - 80:80 diff --git a/docker_images/web_api/requirements.txt b/docker_images/web_api/requirements.txt deleted file mode 100644 index 58f38a89..00000000 --- a/docker_images/web_api/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -fastapi -odmantic diff --git a/docker_images/_entrypoint.sh b/external_components/_entrypoint.sh similarity index 100% rename from docker_images/_entrypoint.sh rename to external_components/_entrypoint.sh diff --git a/docker_images/rtransparent/Dockerfile b/external_components/rtransparent/Dockerfile similarity index 100% rename from docker_images/rtransparent/Dockerfile rename to external_components/rtransparent/Dockerfile diff --git a/docker_images/rtransparent/app.py b/external_components/rtransparent/app.py similarity index 100% rename from docker_images/rtransparent/app.py rename to external_components/rtransparent/app.py diff --git a/docker_images/rtransparent/environment.yaml b/external_components/rtransparent/environment.yaml similarity index 100% rename from docker_images/rtransparent/environment.yaml rename to external_components/rtransparent/environment.yaml diff --git a/osm/_utils.py b/osm/_utils.py index a42f8d66..3bdba06f 100644 --- a/osm/_utils.py +++ b/osm/_utils.py @@ -1,7 +1,11 @@ import argparse import base64 import hashlib +import logging import os +import shlex +import subprocess +import time from pathlib import Path import requests @@ -9,6 +13,7 @@ from osm._version import __version__ DEFAULT_OUTPUT_DIR = "./osm_output" +logger = logging.getLogger(__name__) def _get_metrics_dir(output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path: @@ -34,7 +39,7 @@ def get_compute_context_id(): return hash(f"{os.environ.get('HOSTNAME')}_{os.environ.get('USERNAME')}") -def _upload_data(args, file_in, xml, extracted): +def _upload_data(args, file_in, xml, metrics, components): """ TODO: add in derivatives and components """ @@ -53,7 +58,8 @@ def _upload_data(args, file_in, xml, extracted): "compute_context_id": get_compute_context_id(), "email": args.email, }, - "metrics": extracted, + "metrics": metrics, + "components": components, } # Send POST request to OSM API response = 
requests.put(f"{osm_api}/upload", json=payload) @@ -63,3 +69,48 @@ def _upload_data(args, file_in, xml, extracted): print("Invocation data uploaded successfully") else: print(f"Failed to upload invocation data: \n {response.text}") + + +def wait_for_containers(): + while True: + try: + response = requests.get("http://localhost:8071/health") + if response.status_code == 200: + break + except requests.exceptions.RequestException: + pass + + time.sleep(1) + + +def compose_up(): + cmd = shlex.split("docker-compose up -d --build") + subprocess.run( + cmd, + check=True, + ) + + +def compose_down(): + cmd = shlex.split("docker-compose down") + subprocess.run( + cmd, + check=True, + ) + + +def _setup(args): + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + xml_path = _get_text_dir() / f"{args.uid}.xml" + if args.filepath.name.endswith(".pdf"): + if xml_path.exists(): + raise FileExistsError(xml_path) + metrics_path = _get_metrics_dir() / f"{args.uid}.json" + if metrics_path.exists(): + raise FileExistsError(metrics_path) + if not args.user_managed_compose: + compose_up() + logger.info("Waiting for containers to be ready...") + wait_for_containers() + return xml_path, metrics_path diff --git a/osm/cli.py b/osm/cli.py index c36cb559..527a6872 100644 --- a/osm/cli.py +++ b/osm/cli.py @@ -1,24 +1,17 @@ import argparse -import json -import logging -import shlex -import subprocess -import time -from pathlib import Path -import requests +from osm._utils import DEFAULT_OUTPUT_DIR, _existing_file, _setup, compose_down +from osm.pipeline.core import Pipeline +from osm.pipeline.extractors import RTransparentExtractor +from osm.pipeline.parsers import ScienceBeamParser +from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver, Savers -from osm._utils import ( - DEFAULT_OUTPUT_DIR, - _existing_file, - _get_metrics_dir, - _get_text_dir, - _upload_data, -) -from osm.components.rtransparent import _extract -from osm.components.sciencebeam import _convert - -logger = logging.getLogger(__name__) +PARSERS = { + "sciencebeam": ScienceBeamParser, +} +EXTRACTORS = { + "rtransparent": RTransparentExtractor, +} def parse_args(): @@ -26,7 +19,7 @@ def parse_args(): parser.add_argument( "-f", - "--file", + "--filepath", type=_existing_file, required=True, help="Specify the path to the pdf/xml for processing.", @@ -42,6 +35,20 @@ def parse_args(): default=DEFAULT_OUTPUT_DIR, help="Directory to store output.", ) + parser.add_argument( + "--parser", + choices=PARSERS.keys(), + default=["sciencebeam"], + nargs="+", + help="Select the tool for parsing the input document. Default is 'sciencebeam'.", + ) + parser.add_argument( + "--metrics-type", + choices=EXTRACTORS.keys(), + default=["rtransparent"], + nargs="+", + help="Select the tool for extracting the output metrics. 
Default is 'rtransparent'.", + ) parser.add_argument( "--comment", required=False, @@ -61,71 +68,39 @@ def parse_args(): return parser.parse_args() -def wait_for_containers(): - while True: - try: - response = requests.get("http://localhost:8071/health") - if response.status_code == 200: - break - except requests.exceptions.RequestException: - pass - - time.sleep(1) - - -def compose_up(): - cmd = shlex.split("docker-compose up -d --build") - subprocess.run( - cmd, - check=True, - ) - - -def compose_down(): - cmd = shlex.split("docker-compose down") - subprocess.run( - cmd, - check=True, - ) - - -def _setup(args): - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - xml_out = _get_text_dir() / f"{args.uid}.xml" - if args.file.name.endswith(".pdf"): - if xml_out.exists(): - raise FileExistsError(xml_out) - metrics_out = _get_metrics_dir() / f"{args.uid}.json" - if metrics_out.exists(): - raise FileExistsError(metrics_out) - if not args.user_managed_compose: - compose_up() - logger.info("Waiting for containers to be ready...") - wait_for_containers() - return xml_out, metrics_out - - def main(): args = parse_args() try: - xml_out, metrics_out = _setup(args) - file_in = args.file.read_bytes() - - if args.file.name.endswith(".pdf"): - xml = _convert(file_in) - xml_out.write_bytes(xml) - else: - xml = file_in - extracted = _extract(xml) - metrics_out.write_text(json.dumps(extracted)) - _upload_data(args, file_in, xml, extracted) - + xml_path, metrics_path = _setup(args) + pipeline = Pipeline( + filepath=args.filepath, + xml_path=xml_path, + metrics_path=metrics_path, + parsers=[PARSERS[p] for p in args.parser], + extractors=[EXTRACTORS[m] for m in args.metrics_type], + savers=Savers( + file_saver=FileSaver(), json_saver=JSONSaver(), osm_saver=OSMSaver() + ), + ) + pipeline.run() finally: if not args.user_managed_compose: compose_down() - pass if __name__ == "__main__": main() + +# def main(): +# args = parse_args() +# try: +# pipeline = _setup(args) +# pipeline.parse() +# pipeline.extract() +# pipeline.save() +# xml_path, metrics_path, parser, extractor = _setup(args) +# xml = parser.parse() +# xml_path.write_bytes(xml) +# metrics = _extract(xml) +# metrics_path.write_text(json.dumps(metrics)) +# _upload_data(args, file_in, xml, metrics,components) diff --git a/osm/components/oddpub.py b/osm/components/oddpub.py deleted file mode 100644 index 2934c6c2..00000000 --- a/osm/components/oddpub.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Oddpub is being actively developed where as rtransparent has stagnated. -Oddpub implements parallelism and their interface does not easily allow working -with objects in memory so we will use that to reduce IO overhead. - -The alternative would be to load the pdf file into memory (pdftools::pdf_data -and then pass that into oddpub private functions). This would make it easier to -manage the parallelism, troubleshoot, and define the interface but partially -reinvents the wheel. 
-""" - -import logging -from pathlib import Path - -import psutil -import rpy2.robjects as ro -from rpy2.robjects import pandas2ri -from rpy2.robjects.packages import importr - -from osm.config import osm_config - -logging.basicConfig(level=logging.DEBUG) - -# Adjust the logging level for rpy2 -rpy2_logger = logging.getLogger("rpy2") -rpy2_logger.setLevel(logging.DEBUG) - -oddpub = importr("oddpub") -future = importr("future") -ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")') - - -def oddpub_pdf_conversion( - pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count() -): - future.plan(future.multisession, workers=workers) - oddpub.pdf_convert(str(pdf_dir), str(text_dir)) - - -def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()): - future.plan(future.multisession, workers=workers) - pdf_sentences = oddpub.pdf_load(f"{text_dir}/") - open_data_results = oddpub.open_data_search(pdf_sentences) - with (ro.default_converter + pandas2ri.converter).context(): - metrics = ro.conversion.get_conversion().rpy2py(open_data_results) - - return metrics diff --git a/osm/components/rtransparent.py b/osm/components/rtransparent.py deleted file mode 100644 index 687f4d24..00000000 --- a/osm/components/rtransparent.py +++ /dev/null @@ -1,19 +0,0 @@ -import json - -import requests - - -def _extract(xml: bytes) -> str: - """Extracts metrics from an XML. - - Args: - xml: Raw bytes for an xml file. - """ - headers = {"Content-Type": "application/octet-stream"} - response = requests.post( - "http://localhost:8071/extract-metrics", data=xml, headers=headers - ) - if response.status_code == 200: - return json.loads(response.json())[0] - else: - response.raise_for_status() diff --git a/docker_images/web_api/__init__.py b/osm/pipeline/__init__.py similarity index 100% rename from docker_images/web_api/__init__.py rename to osm/pipeline/__init__.py diff --git a/osm/pipeline/core.py b/osm/pipeline/core.py new file mode 100644 index 00000000..229e80a4 --- /dev/null +++ b/osm/pipeline/core.py @@ -0,0 +1,86 @@ +from abc import ABC, abstractmethod +from typing import Optional + + +# Parser Interface +class Parser(ABC): + @abstractmethod + def parse(self, data: bytes) -> str: + pass + + +# Extractor Interface +class Extractor(ABC): + @abstractmethod + def extract(self, data: str) -> dict: + pass + + +# Saver Interface +class Saver(ABC): + @abstractmethod + def save(self, data: dict): + pass + + +class Savers: + def __init__(self, file_saver: Saver, json_saver: Saver, osm_saver: Saver): + self.file_saver = file_saver + self.json_saver = json_saver + self.osm_saver = osm_saver + + def save_file(self, data: str): + self.file_saver.save(data) + + def save_json(self, data: dict): + self.json_saver.save(data) + + def save_osm(self, data: dict): + self.osm_saver.save(data) + + +class Pipeline: + def __init__( + self, + *, + parsers: list[Parser], + extractors: list[Extractor], + savers: Savers, + filepath: str, + xml_path: Optional[str] = None, + metrics_path: Optional[str] = None, + ): + self.parsers = parsers + self.extractors = extractors + self.savers = savers + self.filepath = filepath + self._file_data = None + self.xml_path = xml_path + self.metrics_path = metrics_path + + def run(self): + for parser in self.parsers: + parsed_data = parser.parse(self.file_data) + if isinstance(parsed_data, str): + self.savers.save_file(parsed_data, self.xml_path) + for extractor in self.extractors: + extracted_metrics = extractor.extract(parsed_data) + self.savers.save_osm( + { 
+ "parser": parser.__class__.__name__, + "extractor": extractor.__class__.__name__, + "metrics": extracted_metrics, + } + ) + self.savers.save_json(extracted_metrics, self.metrics_path) + + @staticmethod + def read_file(filepath: str) -> bytes: + with open(filepath, "rb") as file: + return file.read() + + @property + def file_data(self): + if not self._file_data: + self._file_data = self.read_file(self.filepath) + return self._file_data diff --git a/osm/pipeline/extractors.py b/osm/pipeline/extractors.py new file mode 100644 index 00000000..30b7f6d5 --- /dev/null +++ b/osm/pipeline/extractors.py @@ -0,0 +1,46 @@ +import json + +import requests + + +def _extract(xml: bytes) -> str: + """Extracts metrics from an XML. + + Args: + xml: Raw bytes for an xml file. + """ + headers = {"Content-Type": "application/octet-stream"} + response = requests.post( + "http://localhost:8071/extract-metrics", data=xml, headers=headers + ) + if response.status_code == 200: + return json.loads(response.json())[0] + else: + response.raise_for_status() + + +# import psutil +# # Adjust the logging level for rpy2 +# rpy2_logger = logging.getLogger("rpy2") +# rpy2_logger.setLevel(logging.DEBUG) + +# oddpub = importr("oddpub") +# future = importr("future") +# ro.r(f'Sys.setenv(VROOM_CONNECTION_SIZE = "{osm_config.vroom_connection_size}")') + + +# def oddpub_pdf_conversion( +# pdf_dir: Path, text_dir: Path, workers: int = psutil.cpu_count() +# ): +# future.plan(future.multisession, workers=workers) +# oddpub.pdf_convert(str(pdf_dir), str(text_dir)) + + +# def oddpub_metric_extraction(text_dir: Path, workers: int = psutil.cpu_count()): +# future.plan(future.multisession, workers=workers) +# pdf_sentences = oddpub.pdf_load(f"{text_dir}/") +# open_data_results = oddpub.open_data_search(pdf_sentences) +# with (ro.default_converter + pandas2ri.converter).context(): +# metrics = ro.conversion.get_conversion().rpy2py(open_data_results) + +# return metrics diff --git a/osm/components/sciencebeam.py b/osm/pipeline/parsers.py similarity index 100% rename from osm/components/sciencebeam.py rename to osm/pipeline/parsers.py diff --git a/osm/pipeline/savers.py b/osm/pipeline/savers.py new file mode 100644 index 00000000..9237c7f5 --- /dev/null +++ b/osm/pipeline/savers.py @@ -0,0 +1,29 @@ +import json + +from .core import Saver + + +class FileSaver(Saver): + def save(self, data: str): + with open("output.xml", "w") as file: + file.write(data) + + +class JSONSaver(Saver): + def save(self, data: dict): + with open("output.json", "w") as file: + json.dump(data, file) + + +class OSMSaver(Saver): + def save(self, data: dict): + # Assuming there's a method to post data to an endpoint + response = self.post_to_osm(data) + if response.status_code != 200: + raise Exception("Failed to save metrics to OSM") + + def post_to_osm(self, data: dict): + # Mock implementation: Replace with actual HTTP request + print(f"Posting to OSM: {data}") + # TODO + pass diff --git a/osm/schemas.py b/osm/schemas.py deleted file mode 100644 index 17627a32..00000000 --- a/osm/schemas.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Optional - -from odmantic import EmbeddedModel, Model -from pydantic import EmailStr - -# mriqc applies a unique id to each person so that you can aggregate and detect -# duplication. One can have different views of same work -# (pubmedcentral and nature). Each would have different filenames, different -# provenance, different md5sum. Usually there is a final version of record, -# ideally we would be analysing that. 
Generally for high throughput we analyse -# the open access pubmed central. - - -class Derivative(EmbeddedModel): - """ - Gridfs can avoid issues with size limitations. Each derivative is an output of the - execution of a single container with the “preceding document” or “parent” - referenced (this could be a primary document or another derivative). A primary - document could have several different derivatives (scibeam and rtransparent outputs) - and/or several versions of the same derivative type (scibeam and rtransparent - across different releases or rtransparent or modifications of our docker - image). A text label would be useful here but a docker image id is likely the - sanest way to track derivatives (which would mean that all derivatives must be - computed in a docker container). - """ - - text_label: str - version: str - - -class Component(EmbeddedModel): - name: str - version: str - docker_image: str - docker_image_id: str - - -class Metrics(EmbeddedModel): - """Potentially link to other databases for extra metadata""" - - metrics: dict - - -class Client(EmbeddedModel): - compute_context_id: int - email: Optional[EmailStr] = None - - -class Work(EmbeddedModel): - """ - Unique reference for each publication/study/work. For each “work”, - pmid, doi (normalized), openalex ids are approaches to referencing such a - study uniquely but any one of them may be used by a user. Versioning of the - publications (as in pubmed vs Nature vs addendums) should all be handled - naturally as part of an array of referenced “user input documents” (let’s say - a pdf) provided as part of each "Invocation" or cli call. - """ - - user_defined_id: str - pmid: Optional[str] = None - doi: Optional[str] = None - openalex_id: Optional[str] = None - scopus_id: Optional[str] = None - filename: str - file: Optional[str] = None - content_hash: Optional[str] = None - - -class Invocation(Model): - """ - Approximate document model. This may evolve but provides a starting point - for the Odmantic document model used to interact with mongodb. 
- """ - - osm_version: str - user_comment: Optional[str] - client: Client - work: Work - metrics: dict - # derivatives: list[Derivative] - # components: list[Component] - - -# Rtransparent: Component.construct(name="rtransparent", version="0.13", docker_image="nimh-dsst/rtransparent:0.13", -# docker_image_id="dsjfkldsjflkdsjlf2jkl23j") Derivative.construct(name="rtransparent", version="0.13", -# docker_image="nimh-dsst/rtransparent:0.13", docker_image_id="dsjfkldsjflkdsjlf2jkl23j") ScibeamParser: -# Component.construct(name="scibeam-parser", version="0.5.1", docker_image="elife/scibeam-parser:0.5.1", -# docker_image_id="dsjfkldsjflkdsjlf2jkl23j") diff --git a/osm/schemas/__init__.py b/osm/schemas/__init__.py new file mode 100644 index 00000000..71a2c66b --- /dev/null +++ b/osm/schemas/__init__.py @@ -0,0 +1,6 @@ +from .schemas import Client as Client +from .schemas import Component as Component +from .schemas import Derivative as Derivative +from .schemas import Invocation as Invocation +from .schemas import RtransparentMetrics as RtransparentMetrics +from .schemas import Work as Work diff --git a/osm/schemas/metrics_schemas.py b/osm/schemas/metrics_schemas.py new file mode 100644 index 00000000..28f31186 --- /dev/null +++ b/osm/schemas/metrics_schemas.py @@ -0,0 +1,166 @@ +from typing import Optional + +from odmantic import EmbeddedModel + + +class RtransparentMetrics(EmbeddedModel): + pmcid_pmc: Optional[int] = None + pmid: Optional[int] = None + doi: Optional[str] = None + filename: Optional[str] = None + year: Optional[int] = None + year_epub: Optional[int] = None + year_ppub: Optional[int] = None + journal: Optional[str] = None + publisher: Optional[str] = None + affiliation_country: Optional[str] = None + affiliation_institution: Optional[str] = None + type: Optional[str] = None + is_data_pred: Optional[bool] = None + data_text: Optional[str] = None + is_relevant_data: Optional[bool] = None + com_specific_db: Optional[str] = None + com_general_db: Optional[str] = None + com_github_data: Optional[str] = None + dataset: Optional[str] = None + com_file_formats: Optional[str] = None + com_supplemental_data: Optional[str] = None + com_data_availibility: Optional[str] = None + is_code_pred: Optional[bool] = None + code_text: Optional[str] = None + is_relevant_code: Optional[bool] = None + com_code: Optional[str] = None + com_suppl_code: Optional[str] = None + is_coi_pred: Optional[bool] = None + coi_text: Optional[str] = None + is_coi_pmc_fn: Optional[bool] = None + is_coi_pmc_title: Optional[str] = None + is_relevant_coi: Optional[str] = None + is_relevant_coi_hi: Optional[str] = None + is_relevant_coi_lo: Optional[str] = None + is_explicit_coi: Optional[str] = None + coi_1: Optional[str] = None + coi_2: Optional[str] = None + coi_disclosure_1: Optional[str] = None + commercial_1: Optional[str] = None + benefit_1: Optional[str] = None + consultant_1: Optional[str] = None + grants_1: Optional[str] = None + brief_1: Optional[str] = None + fees_1: Optional[str] = None + consults_1: Optional[str] = None + connect_1: Optional[str] = None + connect_2: Optional[str] = None + commercial_ack_1: Optional[str] = None + rights_1: Optional[str] = None + founder_1: Optional[str] = None + advisor_1: Optional[str] = None + paid_1: Optional[str] = None + board_1: Optional[str] = None + no_coi_1: Optional[str] = None + no_funder_role_1: Optional[str] = None + is_fund_pred: Optional[bool] = None + fund_text: Optional[str] = None + fund_pmc_institute: Optional[str] = None + fund_pmc_source: Optional[str] = 
None + fund_pmc_anysource: Optional[str] = None + is_fund_pmc_group: Optional[bool] = None + is_fund_pmc_title: Optional[str] = None + is_fund_pmc_anysource: Optional[str] = None + is_relevant_fund: Optional[str] = None + is_explicit_fund: Optional[str] = None + support_1: Optional[str] = None + support_3: Optional[str] = None + support_4: Optional[str] = None + support_5: Optional[str] = None + support_6: Optional[str] = None + support_7: Optional[str] = None + support_8: Optional[str] = None + support_9: Optional[str] = None + support_10: Optional[str] = None + developed_1: Optional[str] = None + received_1: Optional[str] = None + received_2: Optional[str] = None + recipient_1: Optional[str] = None + authors_1: Optional[str] = None + authors_2: Optional[str] = None + thank_1: Optional[str] = None + thank_2: Optional[str] = None + fund_1: Optional[str] = None + fund_2: Optional[str] = None + fund_3: Optional[str] = None + supported_1: Optional[str] = None + financial_1: Optional[str] = None + financial_2: Optional[str] = None + financial_3: Optional[str] = None + grant_1: Optional[str] = None + french_1: Optional[str] = None + common_1: Optional[str] = None + common_2: Optional[str] = None + common_3: Optional[str] = None + common_4: Optional[str] = None + common_5: Optional[str] = None + acknow_1: Optional[str] = None + disclosure_1: Optional[str] = None + disclosure_2: Optional[str] = None + fund_ack: Optional[str] = None + project_ack: Optional[str] = None + is_register_pred: Optional[bool] = None + register_text: Optional[str] = None + is_research: Optional[bool] = None + is_review: Optional[bool] = None + is_reg_pmc_title: Optional[bool] = None + is_relevant_reg: Optional[str] = None + is_method: Optional[str] = None + is_NCT: Optional[str] = None + is_explicit_reg: Optional[str] = None + prospero_1: Optional[str] = None + registered_1: Optional[str] = None + registered_2: Optional[str] = None + registered_3: Optional[str] = None + registered_4: Optional[str] = None + registered_5: Optional[str] = None + not_registered_1: Optional[str] = None + registration_1: Optional[str] = None + registration_2: Optional[str] = None + registration_3: Optional[str] = None + registration_4: Optional[str] = None + registry_1: Optional[str] = None + reg_title_1: Optional[str] = None + reg_title_2: Optional[str] = None + reg_title_3: Optional[str] = None + reg_title_4: Optional[str] = None + funded_ct_1: Optional[str] = None + ct_2: Optional[str] = None + ct_3: Optional[str] = None + protocol_1: Optional[str] = None + is_success: Optional[bool] = None + is_art: Optional[str] = None + field: Optional[str] = None + score: Optional[int] = None + jif: Optional[float] = None + eigenfactor_score: Optional[float] = None + n_cite: Optional[int] = None + + +# Tried to define programmatically but both ways seemed to yield a model class without type annotated fields... 
+ +# 1 +# RtransparentMetrics = type( +# "RtransparentMetrics", +# (Model,), +# {n: Optional[t] for n, t in rtransparent_metric_types.items()}, +# ) + +# 2 +# Use Field to explicitly define the fields in the model +# namespace = { +# n: (Optional[t], Field(default=None)) +# for n, t in rtransparent_metric_types.items() +# } +# Dynamically create the Pydantic/ODMantic model +# RtransparentMetrics: Type[Model] = type( +# "RtransparentMetrics", +# (Model,), +# namespace, +# ) diff --git a/osm/schemas/schemas.py b/osm/schemas/schemas.py new file mode 100644 index 00000000..f066cd34 --- /dev/null +++ b/osm/schemas/schemas.py @@ -0,0 +1,57 @@ +from typing import Optional + +from odmantic import EmbeddedModel, Model +from pydantic import EmailStr + +from .metrics_schemas import RtransparentMetrics + + +class Component(EmbeddedModel): + name: str + version: str + docker_image: str + docker_image_id: str + + +class Client(EmbeddedModel): + compute_context_id: int + email: Optional[EmailStr] = None + + +class Work(EmbeddedModel): + """ + Unique reference for each publication/study/work. For each “work”, + pmid, doi (normalized), openalex ids are approaches to referencing such a + study uniquely but any one of them may be used by a user. Versioning of the + publications (as in pubmed vs Nature vs addendums) should all be handled + naturally as part of an array of referenced “user input documents” (let’s say + a pdf) provided as part of each "Invocation" or cli call. + """ + + user_defined_id: str + pmid: Optional[str] = None + doi: Optional[str] = None + openalex_id: Optional[str] = None + scopus_id: Optional[str] = None + filename: str + file: Optional[str] = None + content_hash: Optional[str] = None + + +class Invocation(Model): + """ + Approximate document model. This may evolve but provides a starting point + for the Odmantic document model used to interact with mongodb. + """ + + osm_version: str + user_comment: Optional[str] + client: Client + work: Work + # Potentially link to other databases for extra metadata but for now will just use component outputs + metrics: RtransparentMetrics + components: list[Component] + + +# Rtransparent: Component.construct(name="rtransparent", version="0.13", docker_image="nimh-dsst/rtransparent:0.13", docker_image_id="dsjfkldsjflkdsjlf2jkl23j") +# ScibeamParser: Component.construct(name="scibeam-parser", version="0.5.1", docker_image="elife/scibeam-parser:0.5.1",docker_image_id="dsjfkldsjflkdsjlf2jkl23j") diff --git a/scripts/deployment_diagnostics.sh b/scripts/deployment_diagnostics.sh new file mode 100644 index 00000000..7b116247 --- /dev/null +++ b/scripts/deployment_diagnostics.sh @@ -0,0 +1,34 @@ +# 1. Describe the instance to get details including VPC and Subnet +aws ec2 describe-instances --instance-ids i-03b729b63c679cf2d --query "Reservations[*].Instances[*].{InstanceId:InstanceId, VpcId:VpcId, SubnetId:SubnetId, PublicIpAddress:PublicIpAddress, PublicDnsName:PublicDnsName, PrivateIpAddress:PrivateIpAddress, State:State.Name, SecurityGroups:SecurityGroups, NetworkInterfaces:NetworkInterfaces}" > instance_details.json + +# Extract VPC ID and Subnet ID from the instance details +VPC_ID=$(jq -r '.[0][0].VpcId' instance_details.json) +SUBNET_ID=$(jq -r '.[0][0].SubnetId' instance_details.json) + +# 2. Describe the VPC to get more details +aws ec2 describe-vpcs --vpc-ids $VPC_ID --query "Vpcs[*].{VpcId:VpcId, CidrBlock:CidrBlock, DhcpOptionsId:DhcpOptionsId, State:State}" > vpc_details.json + +# 3. 
Describe the subnet to get more details +aws ec2 describe-subnets --subnet-ids $SUBNET_ID --query "Subnets[*].{SubnetId:SubnetId, VpcId:VpcId, CidrBlock:CidrBlock, MapPublicIpOnLaunch:MapPublicIpOnLaunch, AvailabilityZone:AvailabilityZone, State:State, RouteTableAssociationId:RouteTableAssociationId}" > subnet_details.json + +# Extract Route Table Association ID from the subnet details +ROUTE_TABLE_ASSOC_ID=$(jq -r '.[0].RouteTableAssociationId' subnet_details.json) + +# 4. Describe the route table associated with the subnet +aws ec2 describe-route-tables --filters "Name=association.route-table-association-id,Values=$ROUTE_TABLE_ASSOC_ID" --query "RouteTables[*].{RouteTableId:RouteTableId, VpcId:VpcId, Routes:Routes, Associations:Associations, Tags:Tags}" > route_table_details.json + +# 5. Describe the Internet Gateway +aws ec2 describe-internet-gateways --filters "Name=attachment.vpc-id,Values=$VPC_ID" --query "InternetGateways[*].{InternetGatewayId:InternetGatewayId, Attachments:Attachments, Tags:Tags}" > internet_gateway_details.json + +# 6. Describe DHCP Options associated with the VPC +aws ec2 describe-dhcp-options --dhcp-options-ids $(jq -r '.[0].DhcpOptionsId' vpc_details.json) --query "DhcpOptions[*].{DhcpOptionsId:DhcpOptionsId, DhcpConfigurations:DhcpConfigurations}" > dhcp_options_details.json + +# 7. List all security groups associated with the instance +SECURITY_GROUP_IDS=$(jq -r '.[0][0].SecurityGroups[].GroupId' instance_details.json) +aws ec2 describe-security-groups --group-ids $SECURITY_GROUP_IDS --query "SecurityGroups[*].{GroupId:GroupId, GroupName:GroupName, VpcId:VpcId, Description:Description, IpPermissions:IpPermissions, IpPermissionsEgress:IpPermissionsEgress}" > security_groups_details.json + +# Bundle all JSON files into a single file for sharing etc. 
+tar -czvf aws_details.tar.gz instance_details.json vpc_details.json subnet_details.json route_table_details.json internet_gateway_details.json dhcp_options_details.json security_groups_details.json + +# Clean up individual files (optional) +rm instance_details.json vpc_details.json subnet_details.json route_table_details.json internet_gateway_details.json dhcp_options_details.json security_groups_details.json diff --git a/scripts/diagnose-instance-connectivity.sh b/scripts/diagnose-instance-connectivity.sh new file mode 100644 index 00000000..ed13d4fe --- /dev/null +++ b/scripts/diagnose-instance-connectivity.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +mkdir instance-report +# Check IP Configuration +echo "### IP Configuration ###" +ip addr show > instance-report/ip-configuration.txt + +# Check Default Gateway +echo "### Default Gateway ###" +ip route show > instance-report/default-gateway.txt + +# Check DNS Configuration +echo "### DNS Configuration ###" +cat /etc/resolv.conf > instance-report/dns-configuration.txt + +# Check Current Firewall Rules +echo "### Firewall Rules ###" +sudo iptables -L -v -n > instance-report/firewall-rules.txt + +# Check Network Interface Statistics +echo "### Network Interface Statistics ###" +netstat -i > instance-report/network-interface-statistics.txt + +# Check Active Network Connections +echo "### Active Network Connections ###" +netstat -tuln > instance-report/active-network-connections.txt + +# Check Network Manager Status +echo "### Network Manager Status ###" +systemctl status systemd-networkd > instance-report/network-manager-status.txt + +# Check Cloud-Init Status +echo "### Cloud-Init Status ###" +systemctl status cloud-init > instance-report/cloud-init-status.txt + +# Check System Logs for Network-Related Issues +echo "### System Logs for Network-Related Issues (systemd-networkd) ###" +sudo journalctl -u systemd-networkd > instance-report/systemd-networkd-logs.txt + +echo "### System Logs for Network-Related Issues (cloud-init) ###" +sudo journalctl -u cloud-init > instance-report/cloud-init-logs.txt + +# Check if IPv6 is Disabled +echo "### IPv6 Disabled Status ###" +cat /etc/sysctl.conf | grep net.ipv6.conf.all.disable_ipv6 > instance-report/ipv6-disabled-status.txt +cat /etc/sysctl.conf | grep net.ipv6.conf.default.disable_ipv6 > instance-report/ipv6-default-disabled-status.txt + +# Check HTTP Connectivity Directly +echo "### HTTP Connectivity Check ###" +curl -v http://us-east-1.ec2.archive.ubuntu.com/ubuntu/dists/focal-backports/InRelease > instance-report/apt-repository-curl.txt + +# Check DNS Resolution for the Repository +echo "### DNS Resolution for the Repository ###" +host us-east-1.ec2.archive.ubuntu.com > instance-report/apt-repository-dns.txt +dig us-east-1.ec2.archive.ubuntu.com > instance-report/apt-repository-dig.txt diff --git a/web_api/.env.template b/web_api/.env.template new file mode 100644 index 00000000..8539ff6f --- /dev/null +++ b/web_api/.env.template @@ -0,0 +1,11 @@ +# Used for local development of the web API +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +# CERT_EMAIL="" +# DOCKER_HUB_USERNAME="your-docker-hub-username" +# DOCKER_HUB_ACCESS_TOKEN="your-docker-hub-access-token" +# ENVIRONMENT= # staging or production +# DOCKER_IMAGE_TAG="/osm_web_api: https redirect + - "--entrypoints.web.http.redirections.entryPoint.to=websecure" + - "--entrypoints.web.http.redirections.entryPoint.scheme=https" + - "--entrypoints.web.http.redirections.entrypoint.permanent=true" + ports: + - "80:80" + - "443:443" + - "8080:8080" + volumes: 
+ - "/var/run/docker.sock:/var/run/docker.sock:ro" + - myrttle_letsencrypt:/letsencrypt diff --git a/web_api/dashboard/Dockerfile b/web_api/dashboard/Dockerfile new file mode 100644 index 00000000..c2497c07 --- /dev/null +++ b/web_api/dashboard/Dockerfile @@ -0,0 +1,20 @@ +FROM tiangolo/uvicorn-gunicorn:python3.11 + +WORKDIR /app + +ENV LOCAL_DATA_PATH=/opt/from_mongo.feather +COPY from_mongo.feather /opt/from_mongo.feather + +# Create the environment +RUN pip install holoviews panel pymongo odmantic pandas pyarrow pydantic[email] + +RUN mkdir -p /opt/osm +COPY pyproject.toml /opt/osm +COPY osm /opt/osm/osm +ARG PSEUDO_VERSION=0.0.1 # strongly recommended to update based on git describe +RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_OSM=${PSEUDO_VERSION} pip install -e /opt/osm +RUN --mount=source=.git,target=/opt/osm/.git,type=bind pip install -e /opt/osm + +# # Copy the project files and install the package +COPY web_api/dashboard/dashboard.py /app +CMD ["python", "dashboard.py"] diff --git a/web_api/dashboard/dashboard.py b/web_api/dashboard/dashboard.py new file mode 100644 index 00000000..937cba66 --- /dev/null +++ b/web_api/dashboard/dashboard.py @@ -0,0 +1,136 @@ +import os + +import holoviews as hv +import pandas as pd +import panel as pn +from odmantic import SyncEngine +from pymongo import MongoClient + +from osm import schemas + + +def flatten_dict(d): + """ + Recursively flattens a nested dictionary without prepending parent keys. + + :param d: Dictionary to flatten. + :return: Flattened dictionary. + """ + items = [] + for k, v in d.items(): + if isinstance(v, dict): + # If the value is a dictionary, flatten it without the parent key + items.extend(flatten_dict(v).items()) + else: + items.append((k, v)) + return dict(items) + + +def load_data(): + if "LOCAL_DATA_PATH" in os.environ: + return pd.read_feather(os.environ["LOCAL_DATA_PATH"]) + client = MongoClient(os.environ["MONGODB_URI"]) + engine = SyncEngine(client=client, database="osm") + matches = ( + engine.get_collection(schemas.Invocation) + .aggregate( + [ + { + "$match": { + "osm_version": {"$eq": "0.0.1"}, + # "work.pmid": {"$regex":r"^2"}, + "metrics.year": {"$gt": 2000}, + # "metrics.is_data_pred": {"$eq": True}, + }, + }, + { + "$project": { + # "osm_version": True, + # "user_comment": True, + # "client.compute_context_id": True, + "work.user_defined_id": True, + "metrics.year": True, + "metrics.is_code_pred": True, + "metrics.is_data_pred": True, + "metrics.affiliation_country": True, + "metrics.score": True, + "metrics.eigenfactor_score": True, + "metrics.fund_pmc_anysource": True, + "metrics.fund_pmc_institute": True, + "metrics.fund_pmc_source": True, + "metrics.journal": True, + }, + }, + ] + ) + .__iter__() + ) + return pd.DataFrame(flatten_dict(match) for match in matches) + + +def get_dashboard(): + data_grouped = pn.state.cache["data_grouped"] + + # Create charts + fig_data = hv.Bars( + data_grouped, + kdims=["year"], + vdims=[ + "percent_is_data_pred", + ], + ) + fig_code = hv.Bars( + data_grouped, + kdims=["year"], + vdims=[ + "percent_is_code_pred", + ], + ) + # Layout the dashboard + dashboard = pn.Column( + "# Data and code transparency", + pn.Row(fig_data, fig_code, sizing_mode="stretch_width"), + ) + return dashboard + + +def on_load(): + """ + Add resource intensive things that you only want to run once. 
+ """ + pn.config.browser_info = True + pn.config.notifications = True + pn.state.cache["data"] = load_data() + pn.state.cache["data_grouped"] = ( + pn.state.cache["data"][pn.state.cache["data"]["year"] != 999999] + .groupby("year") + .agg( + percent_is_data_pred=("is_data_pred", lambda x: x.mean() * 100), + percent_is_code_pred=("is_code_pred", lambda x: x.mean() * 100), + avg_score=("score", "mean"), + avg_eigenfactor_score=("eigenfactor_score", "mean"), + ) + .reset_index() + ) + + +if __name__ == "__main__": + # Runs all the things necessary before the server actually starts. + pn.state.onload(on_load) + print("starting dashboard!") + pn.serve( + get_dashboard(), + address="0.0.0.0", + port=8501, + start=True, + location=True, + show=False, + keep_alive=30 * 1000, # 30s + autoreload=True, + admin=True, + profiler="pyinstrument", + allow_websocket_origin=[ + "localhost:8501", + "osm.pythonaisolutions.com", + ], + ) diff --git a/web_api/deploy.py b/web_api/deploy.py new file mode 100644 index 00000000..3c5a688a --- /dev/null +++ b/web_api/deploy.py @@ -0,0 +1,162 @@ +import argparse +import contextlib +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from time import sleep + +from dotenv import load_dotenv +from jinja2 import Template + +# Load environment variables from .env file +load_dotenv() + +# Required environment variables +required_env_vars = [ + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "CERT_EMAIL", + "DASHBOARD_IMAGE_TAG", + "DOCKER_HUB_ACCESS_TOKEN", + "DOCKER_HUB_USERNAME", + "DOCKER_IMAGE_TAG", + "ENVIRONMENT", + "MONGODB_URI", + "SSH_KEY_PATH", +] + +# Check if all required environment variables are set +for var in required_env_vars: + if not os.getenv(var): + raise EnvironmentError(f"Missing required environment variable: {var}") + + +def run_command(command): + result = subprocess.run(command, shell=True) + if result.returncode != 0: + sys.exit(f"Command failed: {command}") + + +def build_and_push_docker_images(): + print("Logging in to Docker Hub...") + run_command( + f"echo {os.getenv('DOCKER_HUB_ACCESS_TOKEN')} | docker login --username {os.getenv('DOCKER_HUB_USERNAME')} --password-stdin" + ) + + print("Building and pushing Docker images...") + run_command( + f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DOCKER_IMAGE_TAG')} -f ./web_api/Dockerfile ." + ) + run_command(f"docker push {os.getenv('DOCKER_IMAGE_TAG')}") + run_command( + f"DOCKER_BUILDKIT=1 docker build -t {os.getenv('DASHBOARD_IMAGE_TAG')} -f ./web_api/dashboard/Dockerfile ." + ) + run_command(f"docker push {os.getenv('DASHBOARD_IMAGE_TAG')}") + + +def deploy_terraform(environment): + print("Deploying using Opentofu...") + terraform_dir = f"web_api/terraform/{environment}" + + with contextlib.chdir(terraform_dir): + run_command("tofu init") + run_command("tofu plan") + run_command("tofu apply -auto-approve") + + public_dns = ( + subprocess.check_output("tofu output -raw public_dns", shell=True) + .decode() + .strip() + ) + + if not public_dns: + sys.exit(f"Public DNS not found for {environment}. 
Exiting...") + + # Write public_dns to a hidden file in the current directory + Path(".public_dns").write_text(public_dns) + + +def create_temp_files(): + compose_template_path = Path("web_api/docker-compose.yaml.j2") + compose_template = Template(compose_template_path.read_text()) + compose_content = compose_template.render( + docker_image_tag=os.getenv("DOCKER_IMAGE_TAG"), + dashboard_image_tag=os.getenv("DASHBOARD_IMAGE_TAG"), + mongodb_uri=os.getenv("MONGODB_URI"), + cert_email=os.getenv("CERT_EMAIL"), + ) + + temp_dir = Path(tempfile.mkdtemp()) + compose_path = temp_dir / "docker-compose.yaml" + + compose_path.write_text(compose_content) + + return compose_path + + +def transfer_and_deploy_files(public_dns, compose_path, attach_to_logs=False): + sleep(5) + print("Transferring Docker Compose files to the remote instance...") + ssh_key_path = os.getenv("SSH_KEY_PATH") + ssh_port = os.getenv("TF_VAR_ssh_port") + + run_command( + f"scp -o StrictHostKeyChecking=no -i {ssh_key_path} {compose_path} ubuntu@{public_dns}:~/docker-compose.yaml" + ) + print("Deploying Docker Compose on the instance...") + run_command( + f"ssh -o StrictHostKeyChecking=no -i {ssh_key_path} ubuntu@{public_dns} -p {ssh_port} 'sudo docker-compose up --remove-orphans -d'" + ) + if attach_to_logs: + run_command( + f"ssh -o StrictHostKeyChecking=no -i {ssh_key_path} ubuntu@{public_dns} -p {ssh_port} 'sudo docker-compose logs -f'" + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Deploy the application.") + parser.add_argument( + "--skip-terraform-deployment", + action="store_true", + help="Skip the Terraform deployment.", + ) + parser.add_argument( + "--skip-docker-rebuild", + action="store_true", + help="Skip rebuilding the Docker image.", + ) + parser.add_argument( + "--attach-to-logs", + action="store_true", + help="Attach to the logs of the deployed containers.", + ) + + return parser.parse_args() + + +def main(args=None): + if args is None: + args = parse_args() + if not args.skip_docker_rebuild: + build_and_push_docker_images() + + compose_path = create_temp_files() + if not args.skip_terraform_deployment: + deploy_terraform(os.getenv("ENVIRONMENT")) + + public_dns_file = Path(".public_dns") + if public_dns_file.exists(): + public_dns = public_dns_file.read_text().strip() + transfer_and_deploy_files(public_dns, compose_path, args.attach_to_logs) + else: + print("Public DNS file not found. 
Exiting...") + + # Clean up temporary files + compose_path.unlink() + print("Cleaned up temporary files.") + + +if __name__ == "__main__": + main() diff --git a/web_api/docker-compose.yaml.j2 b/web_api/docker-compose.yaml.j2 new file mode 100644 index 00000000..aa43b653 --- /dev/null +++ b/web_api/docker-compose.yaml.j2 @@ -0,0 +1,73 @@ +name: osm +services: + web_api: + image: {{ docker_image_tag }} + pull_policy: always + environment: + - MONGODB_URI={{ mongodb_uri }} + working_dir: /app/app + expose: + - "80" + labels: + - traefik.enable=true + - traefik.docker.network=osm_traefik-public + - traefik.http.routers.osm_web_api.rule=Host(`osm.pythonaisolutions.com`) && PathPrefix(`/api`) + - "traefik.http.routers.osm_web_api.entrypoints=web,websecure" + - traefik.http.services.osm_web_api.loadbalancer.server.port=80 + # use the "le" (Let's Encrypt) resolver to get Let's Encrypt certificates + - traefik.http.routers.osm_web_api.tls=true + - traefik.http.routers.osm_web_api.tls.certresolver=le + networks: + - traefik-public + + dashboard: + image: {{ dashboard_image_tag }} + pull_policy: always + environment: + - MONGODB_URI={{ mongodb_uri }} + working_dir: /app + labels: + - traefik.enable=true + - traefik.docker.network=osm_traefik-public + - traefik.http.routers.dashboard.rule=Host(`osm.pythonaisolutions.com`) + - traefik.http.routers.dashboard.entrypoints=web,websecure + # use the "le" (Let's Encrypt) resolver to get Let's Encrypt certificates + - traefik.http.routers.dashboard.tls=true + - traefik.http.routers.dashboard.tls.certresolver=le + - traefik.http.services.dashboard.loadbalancer.server.port=8501 + expose: + - "8501" + + networks: + - traefik-public + reverse_proxy: + image: traefik + restart: always + command: + - --providers.docker=true + - --providers.docker.exposedbydefault=false + - --entrypoints.web.address=:80 + - --entrypoints.websecure.address=:443 + - --entryPoints.web.http.redirections.entryPoint.to=websecure + - "--certificatesresolvers.le.acme.email={{ cert_email }}" + - --certificatesresolvers.le.acme.storage=/certificates/acme.json + {# - "--certificatesresolvers.le.acme.caServer=https://acme-staging-v02.api.letsencrypt.org/directory" #} + - --certificatesresolvers.le.acme.tlschallenge=true + - --log + - --accesslog + - --log.level=DEBUG + ports: + - 80:80 + - 8080:8080 + - 443:443 + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - traefik-public-certificates:/certificates + networks: + - traefik-public + +volumes: + traefik-public-certificates: + +networks: + traefik-public: diff --git a/web_api/environment.yaml b/web_api/environment.yaml new file mode 100644 index 00000000..de5c70df --- /dev/null +++ b/web_api/environment.yaml @@ -0,0 +1,11 @@ +name: deploy +channels: + - conda-forge + - nodefaults +dependencies: + - awscli + - opentofu >= 1.8.1 + - python-dotenv + - jinja2 + - ipython + - pdbpp diff --git a/docker_images/web_api/main.py b/web_api/main.py similarity index 65% rename from docker_images/web_api/main.py rename to web_api/main.py index e613ef82..3025ff36 100644 --- a/docker_images/web_api/main.py +++ b/web_api/main.py @@ -22,19 +22,14 @@ import os import motor.motor_asyncio -from fastapi import FastAPI -from odmantic import AIOEngine +from fastapi import FastAPI, HTTPException +from odmantic import AIOEngine, ObjectId from osm.schemas import Invocation app = FastAPI() - -client = motor.motor_asyncio.AsyncIOMotorClient( - os.environ.get( - "MONGODB_URI", - 
"mongodb+srv://johnlee:@cluster0.6xo8ws7.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0", - ) -) +dburi = os.environ.get("MONGODB_URI", "mongodb://localhost:27017") +client = motor.motor_asyncio.AsyncIOMotorClient(dburi) engine = AIOEngine(client=client, database="test") @@ -44,12 +39,22 @@ async def upload_invocation(invocation: Invocation): return invocation +@app.get("/invocations/{id}", response_model=Invocation) +async def get_invocation_by_id(id: ObjectId): + invocation = await engine.find_one(Invocation, Invocation.id == id) + if invocation is None: + raise HTTPException(404) + return invocation + + if __name__ == "__main__": import asyncio import uvicorn loop = asyncio.get_event_loop() - config = uvicorn.Config(app=app, host="0.0.0.0", port=8000, loop=loop) + config = uvicorn.Config( + app=app, host="0.0.0.0", port=80, root_path="/api", loop=loop + ) server = uvicorn.Server(config) loop.run_until_complete(server.serve()) diff --git a/web_api/terraform/modules/shared_resources/main.tf b/web_api/terraform/modules/shared_resources/main.tf new file mode 100644 index 00000000..b9551f71 --- /dev/null +++ b/web_api/terraform/modules/shared_resources/main.tf @@ -0,0 +1,202 @@ +variable "aws_region" { + description = "AWS region" + default = "us-east-1" +} + + +variable "s3_bucket" { + description = "S3 bucket for Terraform state" + default = "osm-storage" +} + +variable "dynamodb_table" { + description = "DynamoDB table for Terraform state locking" + default = "terraform-locks" +} +variable ssh_port { + description = "Non-standard port for SSH" + default = 22 +} + + +# VPC +resource "aws_vpc" "main" { + cidr_block = "10.0.0.0/16" + enable_dns_hostnames = true + enable_dns_support = true + tags = { + Name = "osm-vpc" + } +} + +# Internet Gateway +resource "aws_internet_gateway" "main" { + vpc_id = aws_vpc.main.id + + tags = { + Name = "osm-internet-gateway" + } +} + +# Route Table +resource "aws_route_table" "main" { + vpc_id = aws_vpc.main.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.main.id + } + route { + ipv6_cidr_block = "::/0" + gateway_id = aws_internet_gateway.main.id + } + tags = { + Name = "osm-route-table" + } +} + +# Network ACL +resource "aws_network_acl" "allow_all" { + vpc_id = aws_vpc.main.id + + tags = { + Name = "allow_all_acl" + } +} + +resource "aws_network_acl_rule" "allow_all_inbound" { + network_acl_id = aws_network_acl.allow_all.id + rule_number = 100 + protocol = "-1" # -1 means all protocols + rule_action = "allow" + egress = false + cidr_block = "0.0.0.0/0" + from_port = 0 + to_port = 65535 +} + +resource "aws_network_acl_rule" "allow_all_outbound" { + network_acl_id = aws_network_acl.allow_all.id + rule_number = 200 + protocol = "-1" # -1 means all protocols + rule_action = "allow" + egress = true + cidr_block = "0.0.0.0/0" + from_port = 0 + to_port = 65535 +} +resource "aws_security_group" "allow_all" { + name = "allow_all_security_group" + description = "Security group that allows all inbound and outbound traffic" + vpc_id = aws_vpc.main.id + + ingress { + from_port = 0 + to_port = 65535 + protocol = "6" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 65535 + protocol = "6" + cidr_blocks = ["0.0.0.0/0"] + } + ingress { + from_port = 0 + to_port = 65535 + protocol = "17" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 65535 + protocol = "17" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { + Name = "allow_all_security_group" + } +} + +resource "aws_vpc_dhcp_options" 
"main" { + domain_name = "compute-1.amazonaws.com" + domain_name_servers = ["AmazonProvidedDNS"] + + tags = { + Name = "osm-dhcp-options" + } +} + +resource "aws_vpc_dhcp_options_association" "main" { + vpc_id = aws_vpc.main.id + dhcp_options_id = aws_vpc_dhcp_options.main.id +} + + +# main Subnet +resource "aws_subnet" "main" { + vpc_id = aws_vpc.main.id + cidr_block = "10.0.1.0/24" + availability_zone = "us-east-1a" + map_public_ip_on_launch = true + + tags = { + Name = "main-subnet" + } +} + +# Route Table Association for main +resource "aws_route_table_association" "main" { + subnet_id = aws_subnet.main.id + route_table_id = aws_route_table.main.id +} + +# Associate the Network ACL with the Subnet +resource "aws_network_acl_association" "main" { + subnet_id = aws_subnet.main.id + network_acl_id = aws_network_acl.allow_all.id +} + + +# Security Group + + +# Data source to find the latest Ubuntu AMI +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"] + } +} + + + + +# Outputs +output "vpc_id" { + value = aws_vpc.main.id +} +output "subnet_id" { + value = aws_subnet.main.id +} +output "security_group_id" { + value = aws_security_group.allow_all.id +} +output "internet_gateway_id" { + value = aws_internet_gateway.main.id +} +output "route_table_id" { + value = aws_route_table.main.id +} +output "aws_network_acl_id" { + value = aws_network_acl.allow_all.id +} +output "ami_id" { + value = data.aws_ami.ubuntu.id +} diff --git a/web_api/terraform/staging/main.tf b/web_api/terraform/staging/main.tf new file mode 100644 index 00000000..72ac28c9 --- /dev/null +++ b/web_api/terraform/staging/main.tf @@ -0,0 +1,87 @@ +provider "aws" { + region = "us-east-1" +} + +terraform { + backend "s3" { + bucket = "osm-terraform-storage" + key = "terraform/staging-state/terraform.tfstate" + region = "us-east-1" + dynamodb_table = "terraform-locks" + } +} + + +module "shared_resources" { + source = "../modules/shared_resources" +} + +# EC2 Instance +resource "aws_instance" "staging" { + ami = module.shared_resources.ami_id + instance_type = var.instance_type + subnet_id = module.shared_resources.subnet_id + key_name = "dsst2023" + vpc_security_group_ids = [module.shared_resources.security_group_id] + associate_public_ip_address = true + + tags = { + Name = "staging-instance" + } + + user_data = <<-EOF + #!/bin/bash + apt-get update -y + apt install -y curl + apt-get install -y docker.io + curl -SL https://github.com/docker/compose/releases/download/v2.29.1/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose + chmod a+x /usr/local/bin/docker-compose + systemctl restart sshd + systemctl start docker + systemctl enable docker + EOF +} + +resource "aws_eip" "staging" { + domain = "vpc" + + tags = { + Name = "staging-elastic-ip" + } +} + +resource "aws_eip_association" "staging" { + instance_id = aws_instance.staging.id + allocation_id = aws_eip.staging.id +} + +output "vpc_id" { + value = module.shared_resources.vpc_id +} +output "internet_gateway_id" { + value = module.shared_resources.internet_gateway_id +} +output "route_table_id" { + value = module.shared_resources.route_table_id +} +output "network_acl_id" { + value = module.shared_resources.aws_network_acl_id +} +output "security_group_id" { + value = module.shared_resources.security_group_id +} +output "subnet_id" { + value = module.shared_resources.subnet_id +} + +output "instance_id" { + value = aws_instance.staging.id +} + 
+output "public_dns" { + value = aws_eip.staging.public_dns +} + +output "public_ip" { + value = aws_eip.staging.public_ip +} diff --git a/web_api/terraform/staging/variables.tf b/web_api/terraform/staging/variables.tf new file mode 100644 index 00000000..eeb2c264 --- /dev/null +++ b/web_api/terraform/staging/variables.tf @@ -0,0 +1,4 @@ +variable "instance_type" { + description = "EC2 instance type" + default = "t3.large" +} diff --git a/web_api/terraform/state_storage/README.md b/web_api/terraform/state_storage/README.md new file mode 100644 index 00000000..9e4ce8d0 --- /dev/null +++ b/web_api/terraform/state_storage/README.md @@ -0,0 +1,16 @@ +Created bucket and table manually: + +``` +aws s3api create-bucket --bucket osm-terraform-storage --region us-east-1 +aws s3api list-buckets +aws s3api list-buckets --region us-east-1 +aws s3api put-bucket-versioning --bucket osm-terraform-storage --versioning-configuration Status=Enabled +aws s3 cp state-storage.tf s3://osm-terraform-storage/test.tf +aws s3 rm s3://osm-terraform-storage --recursive +# Failed: aws dynamodb create-table --table-name terraform-locks --attribute-definitions AttributeName=LockID,AttributeType=S --key-schema AttributeName=LockID,KeyType=HASH --billing-mode PAY_PER_REQUEST --region us-east-1 +# Created dynamodb-policy.json +aws iam create-policy --policy-name DynamoDBFullAccess --policy-document file://dynamodb-policy.json +aws iam attach-user-policy --policy-arn arn:aws:iam::507624629289:policy/DynamoDBFullAccess --user-name osm +aws iam list-attached-user-policies --user-name osm +aws dynamodb create-table --table-name terraform-locks --attribute-definitions AttributeName=LockID,AttributeType=S --key-schema AttributeName=LockID,KeyType=HASH --billing-mode PAY_PER_REQUEST --region us-east-1 +``` diff --git a/web_api/terraform/state_storage/dynamodb-policy.json b/web_api/terraform/state_storage/dynamodb-policy.json new file mode 100644 index 00000000..714b91c4 --- /dev/null +++ b/web_api/terraform/state_storage/dynamodb-policy.json @@ -0,0 +1,29 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DeleteTable", + "dynamodb:DescribeTable", + "dynamodb:ListTables", + "dynamodb:UpdateTable", + "dynamodb:PutItem", + "dynamodb:GetItem", + "dynamodb:DeleteItem", + "dynamodb:Query", + "dynamodb:Scan" + ], + "Resource": "arn:aws:dynamodb:us-east-1:507624629289:table/terraform-locks" + }, + { + "Effect": "Allow", + "Action": [ + "dynamodb:ListTables", + "dynamodb:ListTagsOfResource" + ], + "Resource": "*" + } + ] +} diff --git a/web_api/terraform/state_storage/state-storage.tf b/web_api/terraform/state_storage/state-storage.tf new file mode 100644 index 00000000..0d5877f7 --- /dev/null +++ b/web_api/terraform/state_storage/state-storage.tf @@ -0,0 +1,46 @@ +provider "aws" { + region = "us-east-1" +} + +resource "aws_s3_bucket" "tf_state" { + bucket = "osm-storage" + versioning { + enabled = true + } + server_side_encryption_configuration { + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } + } + lifecycle_rule { + id = "tf_state" + enabled = true + transition { + days = 30 + storage_class = "STANDARD_IA" + } + expiration { + days = 365 + } + } + tags = { + Name = "terraform-state-storage" + } +} + +resource "aws_dynamodb_table" "tf_locks" { + name = "terraform-locks" + billing_mode = "PAY_PER_REQUEST" + hash_key = "LockID" + + attribute { + name = "LockID" + type = "S" + } + + tags = { + Name = "terraform-state-locks" + 
} +}
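Taken together, the new `osm/pipeline/core.py` interfaces are the extension point this change set introduces: an additional document parser or metrics extractor subclasses `Parser` or `Extractor` and is registered in the `PARSERS`/`EXTRACTORS` maps in `osm/cli.py`. A minimal sketch of that pattern, using a hypothetical pass-through parser and a toy word-count extractor (neither class is part of this change set; the names are for illustration only):

```python
from osm.pipeline.core import Extractor, Parser


class NoOpParser(Parser):
    """Hypothetical parser that passes already-parsed XML through unchanged."""

    def parse(self, data: bytes) -> str:
        # The Parser interface receives the raw file bytes and returns text/XML.
        return data.decode("utf-8")


class WordCountExtractor(Extractor):
    """Hypothetical extractor that returns a single toy metric."""

    def extract(self, data: str) -> dict:
        # The Extractor interface receives parsed text and returns a metrics dict.
        return {"word_count": len(data.split())}


# Registering the classes in osm/cli.py would make them selectable from the CLI:
#   PARSERS["noop"] = NoOpParser
#   EXTRACTORS["wordcount"] = WordCountExtractor
# after which "--parser noop --metrics-type wordcount" becomes a valid choice,
# since the argument choices are taken from these mappings.
```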
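On the API side, the restructured schemas define what an upload must look like: the `/upload` endpoint validates the request body against `Invocation`, which now embeds typed `RtransparentMetrics` and a list of `Component`s rather than a free-form metrics dict. A sketch of such a request is below; the base URL and all field values are assumptions for illustration, and in normal use `osm._utils._upload_data` builds and sends an equivalent payload.

```python
import requests

# Shape mirrors osm/schemas/schemas.py: client, work, metrics, components.
payload = {
    "osm_version": "0.0.1",
    "user_comment": None,
    "client": {"compute_context_id": 1, "email": None},
    "work": {"user_defined_id": "pmid:12345678", "filename": "paper.pdf"},
    # Every RtransparentMetrics field is optional, so only the fields that
    # were actually extracted need to be included.
    "metrics": {"year": 2021, "is_data_pred": True},
    "components": [
        {
            "name": "rtransparent",
            "version": "0.13",
            "docker_image": "nimh-dsst/rtransparent:0.13",
            "docker_image_id": "dsjfkldsjflkdsjlf2jkl23j",
        }
    ],
}

# Assumes the web API is reachable locally; _upload_data uses PUT as well.
response = requests.put("http://localhost/upload", json=payload)
response.raise_for_status()
```

The `GET /invocations/{id}` route added in `web_api/main.py` then returns the stored document by its database id.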