From d56cf75c2f766deb99c390a4407b9c41613d87ef Mon Sep 17 00:00:00 2001
From: Nichollette
Date: Wed, 2 Oct 2024 17:28:10 -0400
Subject: [PATCH 01/20] updated backup methods for more user features

---
 discovery/utils/backup.py | 43 +++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/discovery/utils/backup.py b/discovery/utils/backup.py
index 6ad33a98..c4496d8d 100644
--- a/discovery/utils/backup.py
+++ b/discovery/utils/backup.py
@@ -100,32 +100,49 @@ def daily_backup_routine():
         logger.error(str(exc))
 
 
-def backup_from_file(api):
-    """Restore index data from file"""
+def backup_from_file(api, update_schema=True, update_schema_class=True, update_dataset=True):
+    """
+    Restore index data from a file, with an option to update indices.
+
+
+    """
     logger = logging.getLogger("backup_from_file")
     if not api:
-        logger.error("failure to restore from file, no json object passed.")
-    else:
+        logger.error("Failure to restore from file, no JSON object passed.")
+
+    # Reset indices if any index is being updated
+    if update_schema or update_schema_class or update_dataset:
         indices.reset()
-        api_schema = api["discover_schema"]
-        api_schema_class = api["discover_schema_class"]
-        api_dataset = api["discover_dataset"]
+    # Update discover_schema if True
+    if update_schema:
+        api_schema = api["discover_schema"]
         for doc in api_schema["docs"]:
             file = Schema(**doc)
             file.meta.id = doc["_id"]
             file.save()
+    else:
+        logger.info("No discover_schema data found in the API backup")
 
+    # Update discover_schema_class if True
+    if update_schema_class:
+        api_schema_class = api["discover_schema_class"]
         for doc in api_schema_class["docs"]:
             file = SchemaClass(**doc)
             file.save()
-
+    else:
+        logger.info("No discover_schema_class data found in the API backup")
+
+    # Update discover_dataset if True
+    if discover_dataset:
+        api_dataset = api["discover_dataset"]
         for doc in api_dataset["docs"]:
             file = Dataset(**doc)
             file.save()
+    else:
+        logger.info("No discover_dataset data found in the API backup")
 
-
-def restore_from_s3(filename=None, bucket="dde"):
+def restore_from_s3(filename=None, bucket="dde",update_schema=True, update_schema_class=True, update_dataset=True):
 
     s3 = boto3.client("s3")
 
@@ -144,10 +161,10 @@ def restore_from_s3(filename=None, bucket="dde"):
     )
 
     ddeapis = json.loads(obj['Body'].read())
-    backup_from_file(ddeapis)
+    backup_from_file(ddeapis, update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
 
 
-def restore_from_file(filename=None):
+def restore_from_file(filename=None, update_schema=True, update_schema_class=True, update_dataset=True):
     with open(filename) as file:
         ddeapis = json.load(file)
-        backup_from_file(ddeapis)
+        backup_from_file(ddeapis, update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)

From c362e4986627d4523d48f7492190f5c9e3e56f34 Mon Sep 17 00:00:00 2001
From: Nichollette
Date: Thu, 3 Oct 2024 12:09:47 -0400
Subject: [PATCH 02/20] added index reset flexibility

---
 discovery/utils/backup.py  |  9 ++++----
 discovery/utils/indices.py | 46 +++++++++++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/discovery/utils/backup.py b/discovery/utils/backup.py
index c4496d8d..54a98f21 100644
--- a/discovery/utils/backup.py
+++ b/discovery/utils/backup.py
@@ -110,9 +110,8 @@ def backup_from_file(api, update_schema=True, update_schema_class=True, update_d
     if not api:
         logger.error("Failure to restore from file, no JSON object passed.")
 
-    # Reset indices if any index is being updated
-    if update_schema or update_schema_class or update_dataset:
-        indices.reset()
+    # Reset target indices
+    indices.reset(update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
 
     # Update discover_schema if True
     if update_schema:
@@ -134,7 +133,7 @@ def backup_from_file(api, update_schema=True, update_schema_class=True, update_d
         logger.info("No discover_schema_class data found in the API backup")
 
     # Update discover_dataset if True
-    if discover_dataset:
+    if update_dataset:
         api_dataset = api["discover_dataset"]
         for doc in api_dataset["docs"]:
             file = Dataset(**doc)
@@ -158,7 +157,7 @@ def restore_from_s3(filename=None, bucket="dde",update_schema=True, update_schem
     obj = s3.get_object(
         Bucket=bucket,
         Key=filename
-        )
+    )
 
     ddeapis = json.loads(obj['Body'].read())
     backup_from_file(ddeapis, update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
diff --git a/discovery/utils/indices.py b/discovery/utils/indices.py
index 6317d031..0823569e 100644
--- a/discovery/utils/indices.py
+++ b/discovery/utils/indices.py
@@ -26,24 +26,44 @@ def refresh():
     Index(Dataset.Index.name).refresh()
 
 
-def reset():
+def reset(update_schema=True, update_schema_class=True, update_dataset=True):
 
-    index_1 = Index(Schema.Index.name)
-    index_2 = Index(SchemaClass.Index.name)
-    index_3 = Index(Dataset.Index.name)
+    # Reset discover_schema if update_schema is True (default behavior resets all)
+    if update_schema:
+        index_1 = Index(Schema.Index.name)
+        if index_1.exists():
+            index_1.delete()
+        Schema.init()
+
+    # Reset discover_schema_class if update_schema_class is True
+    if update_schema_class:
+        index_2 = Index(SchemaClass.Index.name)
+        if index_2.exists():
+            index_2.delete()
+        SchemaClass.init()
+
+    # Reset discover_dataset if update_dataset is True
+    if update_dataset:
+        index_3 = Index(Dataset.Index.name)
+        if index_3.exists():
+            index_3.delete()
+        Dataset.init()
 
-    if index_1.exists():
-        index_1.delete()
+    # index_1 = Index(Schema.Index.name)
+    # index_2 = Index(SchemaClass.Index.name)
+    # index_3 = Index(Dataset.Index.name)
 
-    if index_2.exists():
-        index_2.delete()
+    # if index_1.exists():
+    #     index_1.delete()
 
-    if index_3.exists():
-        index_3.delete()
+    # if index_2.exists():
+    #     index_2.delete()
 
-    Schema.init()
-    SchemaClass.init()
-    Dataset.init()
+    # if index_3.exists():
+    #     index_3.delete()
+
+    # Schema.init()
+    # SchemaClass.init()
+    # Dataset.init()
 
 
 def save_schema_index_meta(meta):

From 4be00266052bb789db8c9fb67ae7cd25296f5e47 Mon Sep 17 00:00:00 2001
From: Nichollette
Date: Thu, 3 Oct 2024 12:16:56 -0400
Subject: [PATCH 03/20] code clean with black and flake8

---
 discovery/utils/backup.py | 42 +++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/discovery/utils/backup.py b/discovery/utils/backup.py
index 54a98f21..6bef4d98 100644
--- a/discovery/utils/backup.py
+++ b/discovery/utils/backup.py
@@ -103,15 +103,19 @@ def daily_backup_routine():
 
 
 def backup_from_file(api, update_schema=True, update_schema_class=True, update_dataset=True):
     """
     Restore index data from a file, with an option to update indices.
-
-    
+
+
     """
     logger = logging.getLogger("backup_from_file")
     if not api:
         logger.error("Failure to restore from file, no JSON object passed.")
 
     # Reset target indices
-    indices.reset(update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
+    indices.reset(
+        update_schema=update_schema,
+        update_schema_class=update_schema_class,
+        update_dataset=update_dataset,
+    )
 
     # Update discover_schema if True
     if update_schema:
@@ -131,7 +135,7 @@ def backup_from_file(api, update_schema=True, update_schema_class=True, update_d
             file.save()
     else:
         logger.info("No discover_schema_class data found in the API backup")
-    
+
     # Update discover_dataset if True
     if update_dataset:
         api_dataset = api["discover_dataset"]
@@ -141,7 +145,10 @@ def backup_from_file(api, update_schema=True, update_schema_class=True, update_d
     else:
         logger.info("No discover_dataset data found in the API backup")
 
-def restore_from_s3(filename=None, bucket="dde",update_schema=True, update_schema_class=True, update_dataset=True):
+
+def restore_from_s3(
+    filename=None, bucket="dde", update_schema=True, update_schema_class=True, update_dataset=True
+):
 
     s3 = boto3.client("s3")
 
@@ -154,16 +161,25 @@ def restore_from_s3(filename=None, bucket="dde",update_schema=True, update_schem
 
     logging.info("GET s3://%s/%s", bucket, filename)
 
-    obj = s3.get_object(
-        Bucket=bucket,
-        Key=filename
-    )
+    obj = s3.get_object(Bucket=bucket, Key=filename)
 
-    ddeapis = json.loads(obj['Body'].read())
+    ddeapis = json.loads(obj["Body"].read())
-    backup_from_file(ddeapis, update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
+    backup_from_file(
+        ddeapis,
+        update_schema=update_schema,
+        update_schema_class=update_schema_class,
+        update_dataset=update_dataset,
+    )
 
 
-def restore_from_file(filename=None, update_schema=True, update_schema_class=True, update_dataset=True):
+def restore_from_file(
+    filename=None, update_schema=True, update_schema_class=True, update_dataset=True
+):
     with open(filename) as file:
         ddeapis = json.load(file)
-    backup_from_file(ddeapis, update_schema=update_schema, update_schema_class=update_schema_class, update_dataset=update_dataset)
+    backup_from_file(
+        ddeapis,
+        update_schema=update_schema,
+        update_schema_class=update_schema_class,
+        update_dataset=update_dataset,
+    )

From 12142b9809bd140c94ce3096bb55808608a73ae6 Mon Sep 17 00:00:00 2001
From: Nichollette
Date: Tue, 8 Oct 2024 16:26:35 -0400
Subject: [PATCH 04/20] flexibility improvement added to backup function

---
 discovery/utils/backup.py  | 131 +++++++++++++++++++++----------------
 discovery/utils/indices.py |  89 ++++++++++++++-----------
 2 files changed, 125 insertions(+), 95 deletions(-)

diff --git a/discovery/utils/backup.py b/discovery/utils/backup.py
index 6bef4d98..82e229e9 100644
--- a/discovery/utils/backup.py
+++ b/discovery/utils/backup.py
@@ -1,6 +1,7 @@
 import json
 import logging
 from datetime import date, datetime
+from typing import Union, List, Tuple
 
 import boto3
 
@@ -100,56 +101,84 @@ def daily_backup_routine():
         logger.error(str(exc))
 
 
-def backup_from_file(api, update_schema=True, update_schema_class=True, update_dataset=True):
+def backup_from_file(api: dict, indices: Union[str, List[str], Tuple[str, ...]] = "all") -> None:
     """
-    Restore index data from a file, with an option to update indices.
+    Restore index data from a file, with an option to update selected indices
 
+    Parameters:
+    - api: dict - JSON object containing the backup data.
+    - indices: Union[str, List[str], Tuple[str, ...]] - Specifies which indices to update.
+      Accepts 'all' or any combination of ['schema', 'schema_class', 'dataset'].
     """
     logger = logging.getLogger("backup_from_file")
     if not api:
         logger.error("Failure to restore from file, no JSON object passed.")
+        return
+
+    # Validate the 'indices' parameter
+    valid_indices = {"dataset", "schema", "schema_class"}
+    if isinstance(indices, str):
+        if indices != "all":
+            logger.error(f"Invalid string value for 'indices': {indices}. Must be 'all'")
+            return
+    elif isinstance(indices, (list, tuple)):
+        if not all(index in valid_indices for index in indices):
+            # Ensure all elements in the list/tuple are valid *****
+            # explicit information about the invalid elements would be helpful
+            logger.error(f"Invalid list/tuple value for 'indices': {indices}. Must be a subset of {valid_indices}")
+            return
+    else:
+        logger.error(f"Invalid type for 'indices': {type(indices)}. Must be a string, list, or tuple.")
+        return
 
-    # Reset target indices
-    indices.reset(
-        update_schema=update_schema,
-        update_schema_class=update_schema_class,
-        update_dataset=update_dataset,
-    )
+    # Selectively reset indices based on the indices parameter
+    if indices == "all":
+        indices_to_reset = ["schema", "schema_class", "dataset"]
+    else:
+        # Ensure indices is a list or tuple and contains valid entries
+        indices_to_reset = [index for index in valid_indices if index in indices]
 
-    # Update discover_schema if True
-    if update_schema:
-        api_schema = api["discover_schema"]
-        for doc in api_schema["docs"]:
-            file = Schema(**doc)
-            file.meta.id = doc["_id"]
-            file.save()
-    else:
-        logger.info("No discover_schema data found in the API backup")
+    # Reset each relevant index
+    for index in indices_to_reset:
+        indices.reset(index=index)
 
-    # Update discover_schema_class if True
-    if update_schema_class:
-        api_schema_class = api["discover_schema_class"]
-        for doc in api_schema_class["docs"]:
-            file = SchemaClass(**doc)
-            file.save()
-    else:
-        logger.info("No discover_schema_class data found in the API backup")
+    # Reset and update target indices based on the indices parameter
+    if indices == "all" or "schema" in indices:
+        # Update discover_schema
+        if "discover_schema" in api:
+            api_schema = api["discover_schema"]
+            for doc in api_schema["docs"]:
+                file = Schema(**doc)
+                file.meta.id = doc["_id"]
+                file.save()
+            logger.info("The discover_schema index data was updated successfully.")
+        else:
+            logger.info("No discover_schema data found in the API backup")
 
-    # Update discover_dataset if True
-    if update_dataset:
-        api_dataset = api["discover_dataset"]
-        for doc in api_dataset["docs"]:
-            file = Dataset(**doc)
-            file.save()
-    else:
-        logger.info("No discover_dataset data found in the API backup")
+    if indices == "all" or "schema_class" in indices:
+        # Update discover_schema_class
+        if "discover_schema_class" in api:
+            api_schema_class = api["discover_schema_class"]
+            for doc in api_schema_class["docs"]:
+                file = SchemaClass(**doc)
+                file.save()
+            logger.info("The discover_schema_class index data was updated successfully.")
+        else:
+            logger.info("No discover_schema_class data found in the API backup")
+
+    if indices == "all" or "dataset" in indices:
+        # Update discover_dataset
+        if "discover_dataset" in api:
+            api_dataset = api["discover_dataset"]
+            for doc in api_dataset["docs"]:
+                file = Dataset(**doc)
+                file.save()
+            logger.info("The discover_dataset index data was updated successfully.")
+        else:
+            logger.info("No discover_dataset data found in the API backup")
 
 
-def restore_from_s3(
-    filename=None, bucket="dde", update_schema=True, update_schema_class=True, update_dataset=True
-):
+def restore_from_s3(filename: str = None, bucket: str = "dde", indices: Union[str, List[str], Tuple[str, ...]] = "all"):
 
     s3 = boto3.client("s3")
 
@@ -166,22 +195,13 @@ def restore_from_s3(
     logging.info("GET s3://%s/%s", bucket, filename)
 
     obj = s3.get_object(Bucket=bucket, Key=filename)
 
     ddeapis = json.loads(obj["Body"].read())
-    backup_from_file(
-        ddeapis,
-        update_schema=update_schema,
-        update_schema_class=update_schema_class,
-        update_dataset=update_dataset,
-    )
+    backup_from_file(ddeapis, indices=indices)
 
 
-def restore_from_file(
-    filename=None, update_schema=True, update_schema_class=True, update_dataset=True
-):
+def restore_from_file(filename: str = None, indices: Union[str, List[str], Tuple[str, ...]] = "all"):
     with open(filename) as file:
         ddeapis = json.load(file)
-    backup_from_file(
-        ddeapis,
-        update_schema=update_schema,
-        update_schema_class=update_schema_class,
-        update_dataset=update_dataset,
-    )
+    backup_from_file(ddeapis, indices=indices)
diff --git a/discovery/utils/indices.py b/discovery/utils/indices.py
index 0823569e..b5f3d5a5 100644
--- a/discovery/utils/indices.py
+++ b/discovery/utils/indices.py
@@ -1,4 +1,5 @@
 from elasticsearch_dsl import Index
+from typing import Union, List, Tuple
 
 from discovery.model.dataset import Dataset
 from discovery.model.schema import Schema, SchemaClass
@@ -26,44 +27,56 @@ def refresh():
     Index(Dataset.Index.name).refresh()
 
 
-def reset(update_schema=True, update_schema_class=True, update_dataset=True):
-
-    # Reset discover_schema if update_schema is True (default behavior resets all)
-    if update_schema:
-        index_1 = Index(Schema.Index.name)
-        if index_1.exists():
-            index_1.delete()
-        Schema.init()
-
-    # Reset discover_schema_class if update_schema_class is True
-    if update_schema_class:
-        index_2 = Index(SchemaClass.Index.name)
-        if index_2.exists():
-            index_2.delete()
-        SchemaClass.init()
-
-    # Reset discover_dataset if update_dataset is True
-    if update_dataset:
-        index_3 = Index(Dataset.Index.name)
-        if index_3.exists():
-            index_3.delete()
-        Dataset.init()
+def reset(indices: Union[str, List[str], Tuple[str, ...]] = "all") -> None:
+    """
+    Reset selected indices. Default is to reset all indices.
+
+    Parameters:
+    - indices: Union[str, List[str], Tuple[str, ...]] - Specifies which indices to reset.
+      Accepts 'all' or any combination of ["schema", "schema_class", "dataset"].
+    """
+
+    # Define index mapping
+    index_mapping = {
+        "schema": (Schema.Index.name, Schema),
+        "schema_class": (SchemaClass.Index.name, SchemaClass),
+        "dataset": (Dataset.Index.name, Dataset),
+    }
+
+    # Determine which indices to reset
+    if indices == "all":
+        indices_to_reset = ["schema", "schema_class", "dataset"]
+    else:
+        valid_indices = {"schema", "schema_class", "dataset"}
+        indices_to_reset = [index for index in indices if index in valid_indices]
+
+    # Loop through indices and reset them
+    for index_name in indices_to_reset:
+        index_value, model = index_mapping[index_name]
+        index = Index(index_value)
+        if index.exists():
+            index.delete()
+        model.init()
 
-    # index_1 = Index(Schema.Index.name)
-    # index_2 = Index(SchemaClass.Index.name)
-    # index_3 = Index(Dataset.Index.name)
+    # # Reset discover_schema if update_schema is True (default behavior resets all)
+    # if update_schema:
+    #     index_1 = Index(Schema.Index.name)
+    #     if index_1.exists():
+    #         index_1.delete()
+    #     Schema.init()
 
-    # if index_1.exists():
-    #     index_1.delete()
+    # # Reset discover_schema_class if update_schema_class is True
+    # if update_schema_class:
+    #     index_2 = Index(SchemaClass.Index.name)
+    #     if index_2.exists():
+    #         index_2.delete()
+    #     SchemaClass.init()
 
-    # if index_2.exists():
-    #     index_2.delete()
-
-    # if index_3.exists():
-    #     index_3.delete()
-
-    # Schema.init()
-    # SchemaClass.init()
-    # Dataset.init()
+    # # Reset discover_dataset if update_dataset is True
+    # if update_dataset:
+    #     index_3 = Index(Dataset.Index.name)
+    #     if index_3.exists():
+    #         index_3.delete()
+    #     Dataset.init()
 
 
 def save_schema_index_meta(meta):
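A minimal usage sketch of the selective restore API introduced in PATCH 04 above, assuming the module is importable as `discovery.utils.backup`; the bucket layout and backup file names are hypothetical examples, not part of the patches:

```python
from discovery.utils import backup

# Restore every index from the most recent backup object in the "dde" bucket
backup.restore_from_s3(filename=None, bucket="dde", indices="all")

# Restore only the schema-related indices from a local backup file
backup.restore_from_file("dde_backup_20241008.json", indices=("schema", "schema_class"))

# An invalid selector is rejected up front: the error is logged and
# backup_from_file() returns before any index is reset
backup.restore_from_file("dde_backup_20241008.json", indices=["typo_index"])
```
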
From 622ef7deae74010b892111f419fadf2f0dd5fbcb Mon Sep 17 00:00:00 2001
From: Everaldo
Date: Thu, 3 Oct 2024 11:55:57 -0700
Subject: [PATCH 05/20] Add biothings_schema to requirements (#289)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b6d5604d..603d9936 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-# git+https://github.com/biothings/biothings_schema.py@25a3c02#egg=biothings_schema
+git+https://github.com/biothings/biothings_schema.py@25a3c02#egg=biothings_schema
 # git+https://github.com/biothings/biothings.api.git@0.12.x#egg=biothings
 biothings==0.12.5
 #PyLD>=0.7.1

From 1cc764f5e57bfd2d3fc62413bbbb854eda1f6a04 Mon Sep 17 00:00:00 2001
From: Everaldo
Date: Thu, 3 Oct 2024 12:37:39 -0700
Subject: [PATCH 06/20] Implement zip backup. (#287)

* Implement zip backup.

* Remove no backup variable from config.

* Create github action to auto check backup.

* Fix backup file name.

* Improve messages.

* Remove unused code.

* Check backup file size.

* Fix backup error on json dumps.

* Update slack channel.

---
 .github/scripts/check_backup.py    | 96 ++++++++++++++++++++++++++++++
 .github/workflows/check_backup.yml | 35 +++++++++++
 discovery/utils/backup.py          | 62 +++++++++++++++----
 3 files changed, 181 insertions(+), 12 deletions(-)
 create mode 100644 .github/scripts/check_backup.py
 create mode 100644 .github/workflows/check_backup.yml

diff --git a/.github/scripts/check_backup.py b/.github/scripts/check_backup.py
new file mode 100644
index 00000000..314326a2
--- /dev/null
+++ b/.github/scripts/check_backup.py
@@ -0,0 +1,96 @@
+"""
+This script checks if a backup file for the current date exists in a specified S3 bucket.
+If the backup file does not exist, a notification is sent to a Slack channel.
+
+Expected file format in the S3 bucket:
+- The file should be in the folder 'db_backup/' with the following naming pattern:
+  'dde_backup_YYYYMMDD.zip', where YYYYMMDD corresponds to the current date.
+
+Required Environment Variables:
+- AWS_ACCESS_KEY_ID: The AWS access key ID to read the AWS s3 bucket.
+- AWS_SECRET_ACCESS_KEY: The AWS secret access key to read the AWS s3 bucket.
+- BACKUP_BUCKET_NAME: The name of the AWS S3 bucket where backups are stored.
+- S3_FOLDER: The folder path within the S3 bucket where backups are stored (e.g., 'db_backup/').
+- AWS_REGION: The AWS region where the S3 bucket is located.
+- SLACK_CHANNEL: The Slack channel where notifications should be sent (e.g., '#observability-test').
+- SLACK_WEBHOOK_URL: The Slack Webhook URL used to send the notification.
+
+Functionality:
+1. The script uses the AWS SDK (boto3) to check for the existence of the backup file in the specified S3 bucket.
+2. If the file is found, it logs that no action is needed.
+3. If the file is not found, it sends a notification to the configured Slack channel.
+
+Dependencies:
+- boto3: For interacting with AWS S3.
+- requests: For sending HTTP POST requests to Slack.
+
+"""
+
+import boto3
+import botocore
+import os
+import requests
+
+from datetime import datetime
+
+
+def send_slack_notification(message):
+
+    print(f" └─ {message}")
+
+    # Create the payload for Slack
+    slack_data = {
+        "channel": os.getenv("SLACK_CHANNEL"),
+        "username": "DDE",
+        "icon_emoji": ":thumbsdown:",
+        "text": message,
+    }
+
+    try:
+        print(" └─ Sending Slack notification.")
+        response = requests.post(os.getenv("SLACK_WEBHOOK_URL"), json=slack_data, timeout=10)
+        if response.status_code == 200:
+            print(" └─ Slack notification sent successfully.")
+        else:
+            print(f" └─ Failed to send message to Slack: {response.status_code}, {response.text}")
+    except requests.exceptions.Timeout as e:
+        print(" └─ Request timed out to Slack WebHook URL.")
+        raise e
+    except requests.exceptions.RequestException as e:
+        print(f" └─ Failed to send Slack notification. Error: {str(e)}")
+        raise e
+
+
+def check_backup_file():
+
+    # Create the expected file name
+    today_date = datetime.today().strftime("%Y%m%d")
+    expected_file = f"{os.getenv('S3_FOLDER')}dde_backup_{today_date}.zip"
+
+    # Create the S3 client
+    s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION"))
+
+    # Try to fetch the file metadata
+    try:
+        response = s3_client.head_object(Bucket=os.getenv("BACKUP_BUCKET_NAME"), Key=expected_file)
+        print(f" └─ Backup file {expected_file} exists!")
+
+        # Get the file size in bytes
+        file_size = response['ContentLength']
+
+        # Check if the file is larger than 1MB
+        if file_size > 1048576:  # 1MB in bytes
+            print(f" └─ Backup file is larger than 1MB! Size: {file_size} bytes.")
+            print(" └─ Nothing to do!")
+        else:
+            message = f":alert: The backup file {expected_file} is smaller than 1MB!"
+            send_slack_notification(message)
+
+    except botocore.exceptions.ClientError as e:
+        print(e)
+        message = f":alert: The backup file {expected_file} was NOT created today!"
+        send_slack_notification(message)
+
+
+if __name__ == "__main__":
+    check_backup_file()
diff --git a/.github/workflows/check_backup.yml b/.github/workflows/check_backup.yml
new file mode 100644
index 00000000..521435a6
--- /dev/null
+++ b/.github/workflows/check_backup.yml
@@ -0,0 +1,35 @@
+name: Check S3 Backup and Notify Slack
+
+on:
+  workflow_dispatch: # Allows manual trigger from GitHub Actions UI
+  schedule:
+    - cron: '0 13 * * *' # 5:00 AM PST (UTC-8)
+
+  jobs:
+  check-backup:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Install boto3 (AWS SDK for Python)
+        run: |
+          python -m pip install --upgrade pip
+          pip install boto3 requests
+
+      - name: Check if backup exists in S3
+        run: python .github/scripts/check_backup.py
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_REGION: ${{ secrets.AWS_REGION }}
+          BACKUP_BUCKET_NAME: "${{ secrets.BACKUP_BUCKET_NAME }}"
+          S3_FOLDER: "db_backup/"
+          SLACK_CHANNEL: "#cd2h"
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/discovery/utils/backup.py b/discovery/utils/backup.py
index 82e229e9..5ea0246e 100644
--- a/discovery/utils/backup.py
+++ b/discovery/utils/backup.py
@@ -1,5 +1,7 @@
 import json
 import logging
+import zipfile
+import io
 from datetime import date, datetime
 from typing import Union, List, Tuple
 
 import boto3
@@ -19,15 +21,23 @@ def json_serial(obj):
     raise TypeError("Type %s not serializable" % type(obj))
 
 
-def _default_filename():
-    return "dde_backup_" + datetime.today().strftime("%Y%m%d") + ".json"
+def _default_filename(extension=".json"):
+    return "dde_backup_" + datetime.today().strftime("%Y%m%d") + extension
 
 
-def save_to_s3(data, filename=None, bucket="dde"):
-    filename = filename or _default_filename()
-    s3 = boto3.client("s3")
+def save_to_s3(data, filename=None, bucket="dde", format="zip"):
+    filename = filename or _default_filename(f".{format}")
+    s3 = boto3.resource("s3")
     obj_key = f"db_backup/{filename}"
-    s3.put_object(Bucket=bucket, Key=obj_key, Body=json.dumps(data, indent=2, default=json_serial))
+    if format == "zip":
+        with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as zfile:
+            json_data = json.dumps(data, indent=2, default=json_serial)
+            zfile.writestr(filename.replace(".zip", ".json"), json_data)
+        logging.info(f"Uploading {filename} to AWS S3")
+        s3.Bucket(bucket).upload_file(Filename=filename, Key=obj_key)
+    else:
+        logging.info(f"Uploading {filename} to AWS S3")
+        s3.Bucket(bucket).put_object(Key=obj_key, Body=json.dumps(data, indent=2))
     return obj_key
 
@@ -77,7 +87,7 @@ def backup_schema_class(outfile=None):
     return backup_es(SchemaClass, outfile=outfile)
 
 
-def daily_backup_routine():
+def daily_backup_routine(format="zip"):
     logger = logging.getLogger("daily_backup")
     data = {}
     try:
@@ -95,10 +105,11 @@ def daily_backup_routine():
         data.update(_d)
 
         logger.info("Saving to S3 bucket...")
-        s3_obj = save_to_s3(data)
+        s3_obj = save_to_s3(data, format=format)
         logger.info("Done. [%s]", s3_obj)
     except Exception as exc:
         logger.error(str(exc))
+        logger.error("Stack trace:", exc_info=True)
 
 
 def backup_from_file(api: dict, indices: Union[str, List[str], Tuple[str, ...]] = "all") -> None:
@@ -192,11 +203,38 @@ def restore_from_s3(filename: str = None, bucket: str = "dde", indices: Union[st
 
     obj = s3.get_object(Bucket=bucket, Key=filename)
 
-    ddeapis = json.loads(obj["Body"].read())
+    filename = filename.replace("db_backup/", "")
+
+    if filename.endswith(".zip"):
+        file_content = obj["Body"].read()
+        with zipfile.ZipFile(io.BytesIO(file_content)) as zfile:
+            # Search for a JSON file inside the ZIP
+            json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
+            if not json_file:
+                raise ValueError("No JSON file found inside the ZIP archive.")
+            with zfile.open(json_file) as json_data:
+                ddeapis = json.load(json_data)
+    elif filename.endswith(".json"):
+        ddeapis = json.loads(obj["Body"].read())
+    else:
+        raise Exception("Unsupported backup file type!")
+
     backup_from_file(ddeapis, indices=indices)
 
 
 def restore_from_file(filename: str = None, indices: Union[str, List[str], Tuple[str, ...]] = "all"):
-    with open(filename) as file:
-        ddeapis = json.load(file)
-    backup_from_file(ddeapis, indices=indices)
+    if filename.endswith(".zip"):
+        with zipfile.ZipFile(filename, 'r') as zfile:
+            # Search for a JSON file inside the ZIP
+            json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
+            if not json_file:
+                raise ValueError("No JSON file found inside the ZIP archive.")
+            with zfile.open(json_file) as json_data:
+                ddeapis = json.load(json_data)
+    elif filename.endswith(".json"):
+        with open(filename) as file:
+            ddeapis = json.load(file)
+    else:
+        raise Exception("Unsupported backup file type!")
+
+    backup_from_file(ddeapis, indices=indices)

From 610446ebe991b0bf370b095b370fc917212893ca Mon Sep 17 00:00:00 2001
From: Everaldo
Date: Fri, 4 Oct 2024 10:02:22 -0700
Subject: [PATCH 07/20] Fix indentation in github action: backup (#290)

---
 .github/workflows/check_backup.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check_backup.yml b/.github/workflows/check_backup.yml
index 521435a6..2cd4f5bf 100644
--- a/.github/workflows/check_backup.yml
+++ b/.github/workflows/check_backup.yml
@@ -5,7 +5,7 @@ on:
   schedule:
     - cron: '0 13 * * *' # 5:00 AM PST (UTC-8)
 
-  jobs:
+jobs:
   check-backup:
     runs-on: ubuntu-latest
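A short round-trip sketch of the zip backup format from PATCH 06, using only the standard library in the same way save_to_s3() writes the archive and restore_from_file() reads it back; the payload and file name here are made up for illustration:

```python
import json
import zipfile

data = {"discover_schema": {"docs": []}}  # hypothetical backup payload
filename = "dde_backup_20241003.zip"

# Write: one JSON member inside a deflate-compressed archive
with zipfile.ZipFile(filename, "w", zipfile.ZIP_DEFLATED) as zfile:
    zfile.writestr(filename.replace(".zip", ".json"), json.dumps(data, indent=2))

# Read: locate the first .json member and load it
with zipfile.ZipFile(filename, "r") as zfile:
    json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
    with zfile.open(json_file) as fh:
        restored = json.load(fh)

assert restored == data
```
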
From 642aac633a0f3fa81a72c33a16d2bcae8aa65082 Mon Sep 17 00:00:00 2001
From: Chunlei Wu
Date: Sat, 5 Oct 2024 21:14:20 -0700
Subject: [PATCH 08/20] build: :construction_worker: updated Docker setup

---
 Dockerfile         | 10 +++---
 docker-compose.yml | 83 +++++++++++++++++++++++++++-------------------
 2 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 7425d9cc..2933e8ab 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,13 +2,15 @@ FROM ubuntu
 
 RUN apt update -o Acquire::Check-Date=false && \
     apt upgrade -y && \
-    apt install -y libcurl4-openssl-dev libssl-dev build-essential python3 python3-pip vim git && \
-    git clone https://github.com/biothings/discovery-app.git && \
-    pip3 install -r /discovery-app/requirements.txt
+    apt install -y libcurl4-openssl-dev libssl-dev build-essential python3 python3-pip vim git
+RUN git clone https://github.com/biothings/discovery-app.git
+RUN python3 -m venv /discovery-app/.venv && \
+    /discovery-app/.venv/bin/pip install -r /discovery-app/requirements.txt
 
 
 COPY ./config_key.py /discovery-app/config_key.py
 WORKDIR /discovery-app
 EXPOSE 8000
 
 RUN git pull
-ENTRYPOINT ["python3", "index.py", "--debug"]
\ No newline at end of file
+ENTRYPOINT ["/discovery-app/.venv/bin/python", "index.py"]
diff --git a/docker-compose.yml b/docker-compose.yml
index 6ffaaefb..710fee9c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,31 +1,52 @@
-version: "3.7"
-services:
-
-  web:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    environment:
-      - ES_INDEX=es
-    depends_on:
-      - es
-    ports:
-      - "8000:8000"
-    restart: on-failure
-    networks:
-      - net0
-
-  es:
-    image: elasticsearch:7.3.1
-    environment:
-      - discovery.type=single-node
-      - http.cors.enabled=true
-      - http.cors.allow-origin=/https?:\/\/localhost(:[0-9]+)?/
-    ports:
-      - "9200:9200"
-    networks:
-      - net0
-
-networks:
-  net0:
-    driver: bridge
+services:
+  nginx:
+    image: nginx:1.27.2
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./nginx_conf.d:/etc/nginx/conf.d
+    depends_on:
+      - api
+      - webapp
+    networks:
+      - net0
+
+  api:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    environment:
+      - ES_HOST=http://es:9200
+    depends_on:
+      - es
+    ports:
+      - "8000"
+    restart: on-failure
+    networks:
+      - net0
+
+  es:
+    image: elasticsearch:8.15.2
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - xpack.security.http.ssl.enabled=false
+      - http.cors.enabled=true
+      - http.cors.allow-origin=/https?:\/\/localhost(:[0-9]+)?/
+    ports:
+      - "9200"
+    networks:
+      - net0
+
+  webapp:
+    build:
+      dockerfile: nuxt-app/Dockerfile
+    ports:
+      - "3000"
+    restart: on-failure
+    networks:
+      - net0
+
+networks:
+  net0:
+    driver: bridge

From 2e060d3018d1bdabd5dc05bdde5cccb1ab018494 Mon Sep 17 00:00:00 2001
From: Chunlei Wu
Date: Mon, 7 Oct 2024 21:05:21 -0700
Subject: [PATCH 09/20] build: :construction_worker: updated docker-compose
 setup

---
 docker/Dockerfile_api                  | 19 +++++++++
 docker/Dockerfile_webapp               | 17 +++++++
 docker/config_key_example.py           |  3 ++
 docker/docker-compose.yml              | 53 ++++++++++++++++++++++
 docker/nginx_conf.d/discovery-app.conf | 30 +++++++++++++
 5 files changed, 122 insertions(+)
 create mode 100644 docker/Dockerfile_api
 create mode 100644 docker/Dockerfile_webapp
 create mode 100644 docker/config_key_example.py
 create mode 100644 docker/docker-compose.yml
 create mode 100644 docker/nginx_conf.d/discovery-app.conf

diff --git a/docker/Dockerfile_api b/docker/Dockerfile_api
new file mode 100644
index 00000000..2708be61
--- /dev/null
+++ b/docker/Dockerfile_api
@@ -0,0 +1,19 @@
+FROM python:3.10-alpine
+
+RUN apk update && \
+    apk upgrade && \
+    apk --no-cache add git gcc musl-dev libcurl curl-dev
+
+RUN git clone https://github.com/biothings/discovery-app.git
+RUN pip install pip wheel -U && \
+    pip install -r /discovery-app/requirements.txt && \
+    pip cache purge && \
+    apk del gcc musl-dev curl-dev && apk cache clean
+
+
+COPY ./config_key_example.py /discovery-app/config_key.py
+WORKDIR /discovery-app
+EXPOSE 8000
+
+ENTRYPOINT ["python", "index.py"]
diff --git a/docker/Dockerfile_webapp b/docker/Dockerfile_webapp
new file mode 100644
index 00000000..35e7fafb
--- /dev/null
+++ b/docker/Dockerfile_webapp
@@ -0,0 +1,17 @@
+FROM node:18-alpine
+
+RUN apk update && \
+    apk upgrade && \
+    apk --no-cache add git
+
+RUN git clone https://github.com/biothings/discovery-app.git
+WORKDIR /discovery-app/nuxt-app
+RUN npm install && \
+    NODE_OPTIONS="--max-old-space-size=1512" npm run build && \
+    npm cache clean --force && \
+    rm -rf ./node_modules package-lock.json .cache
+
+ENV PORT=3000
+EXPOSE 3000
+
+ENTRYPOINT ["node", ".output/server/index.mjs"]
diff --git a/docker/config_key_example.py b/docker/config_key_example.py
new file mode 100644
index 00000000..b87667bb
--- /dev/null
+++ b/docker/config_key_example.py
@@ -0,0 +1,3 @@
+COOKIE_SECRET = ''
+GITHUB_CLIENT_ID = ''
+GITHUB_CLIENT_SECRET = ''
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
new file mode 100644
index 00000000..012063ca
--- /dev/null
+++ b/docker/docker-compose.yml
@@ -0,0 +1,53 @@
+name: discovery-app
+services:
+  nginx:
+    image: nginx:stable-alpine
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./nginx_conf.d:/etc/nginx/conf.d
+    depends_on:
+      - api
+      - webapp
+    networks:
+      - net0
+
+  api:
+    build:
+      context: .
+      dockerfile: Dockerfile_api
+    environment:
+      - ES_HOST=http://es:9200
+    depends_on:
+      - es
+    ports:
+      - "8000"
+    restart: on-failure
+    networks:
+      - net0
+
+  es:
+    image: elasticsearch:8.15.2
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - xpack.security.http.ssl.enabled=false
+      - http.cors.enabled=true
+      - http.cors.allow-origin=/https?:\/\/localhost(:[0-9]+)?/
+    ports:
+      - "9200"
+    networks:
+      - net0
+
+  webapp:
+    build:
+      dockerfile: Dockerfile_webapp
+    ports:
+      - "3000"
+    restart: on-failure
+    networks:
+      - net0
+
+networks:
+  net0:
+    driver: bridge
diff --git a/docker/nginx_conf.d/discovery-app.conf b/docker/nginx_conf.d/discovery-app.conf
new file mode 100644
index 00000000..5944de4c
--- /dev/null
+++ b/docker/nginx_conf.d/discovery-app.conf
@@ -0,0 +1,30 @@
+# include /etc/nginx/proxy_params;
+proxy_set_header Host $http_host;
+proxy_set_header X-Real-IP $remote_addr;
+proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+proxy_set_header X-Forwarded-Proto $scheme;
+
+upstream discovery_api {
+    server api:8000;
+}
+
+upstream discovery_frontend {
+    server webapp:3000;
+}
+
+server {
+    server_name localhost;
+    listen 8000;
+
+    location ~ ^/(sitemap.xml) {
+        proxy_pass http://discovery_frontend;
+    }
+
+    location ~ ^/(api|user|logout|sitemap|oauth|saml) {
+        proxy_pass http://discovery_api;
+    }
+
+    location / {
+        proxy_pass http://discovery_frontend;
+    }
+}

From afa72b4c67acb6962ef479bbdcf819c6719bbd99 Mon Sep 17 00:00:00 2001
From: Chunlei Wu
Date: Mon, 7 Oct 2024 21:11:37 -0700
Subject: [PATCH 10/20] chore: :memo: updated README.md for docker compose
 setup

---
 Dockerfile         | 16 --------------
 README.md          | 15 ++++++-------
 docker-compose.yml | 52 ----------------------------------------
 3 files changed, 8 insertions(+), 75 deletions(-)
 delete mode 100644 Dockerfile
 delete mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 2933e8ab..00000000
--- a/Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM ubuntu
-
-RUN apt update -o Acquire::Check-Date=false && \
-    apt upgrade -y && \
-    apt install -y libcurl4-openssl-dev libssl-dev build-essential python3 python3-pip vim git
-RUN git clone https://github.com/biothings/discovery-app.git
-RUN python3 -m venv /discovery-app/.venv && \
-    /discovery-app/.venv/bin/pip install -r /discovery-app/requirements.txt
-
-
-COPY ./config_key.py /discovery-app/config_key.py
-WORKDIR /discovery-app
-EXPOSE 8000
-
-RUN git pull
-ENTRYPOINT ["/discovery-app/.venv/bin/python", "index.py"]
diff --git a/README.md b/README.md
index e0548f43..fe2a8c85 100644
--- a/README.md
+++ b/README.md
@@ -52,33 +52,34 @@ You should now be able to access the homepage at
 
 ## Run in Docker
 
-First refer to step 5 above to setup the credentials required to run the program.
+First refer to step 6 above to setup the credentials required to run the program.
+
 The following commands should be issued under the first level project folder.
 
-Make sure you have port `8000` and `9200` not in use when starting containers.
+Make sure you have port `8000` not in use when starting containers.
 
 ### Build
 
 ```bash
-docker-compose up --detach
+cd docker
+docker compose up --detach
 ```
 
 ### Stop and restart
 
 ```bash
-docker-compose stop
-docker-compose start
+docker compose stop
+docker compose start
 ```
 
 ### Update codebase
 
 ```bash
-docker-compose exec web git pull
+docker compose exec web git pull
 ```
 
 ### Remove containers
 
 ```bash
-docker-compose down
+docker compose down
 ```
 
 # Related Projects
diff --git a/docker-compose.yml b/docker-compose.yml
deleted file mode 100644
index 710fee9c..00000000
--- a/docker-compose.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-services:
-  nginx:
-    image: nginx:1.27.2
-    ports:
-      - "8000:8000"
-    volumes:
-      - ./nginx_conf.d:/etc/nginx/conf.d
-    depends_on:
-      - api
-      - webapp
-    networks:
-      - net0
-
-  api:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    environment:
-      - ES_HOST=http://es:9200
-    depends_on:
-      - es
-    ports:
-      - "8000"
-    restart: on-failure
-    networks:
-      - net0
-
-  es:
-    image: elasticsearch:8.15.2
-    environment:
-      - discovery.type=single-node
-      - xpack.security.enabled=false
-      - xpack.security.http.ssl.enabled=false
-      - http.cors.enabled=true
-      - http.cors.allow-origin=/https?:\/\/localhost(:[0-9]+)?/
-    ports:
-      - "9200"
-    networks:
-      - net0
-
-  webapp:
-    build:
-      dockerfile: nuxt-app/Dockerfile
-    ports:
-      - "3000"
-    restart: on-failure
-    networks:
-      - net0
-
-networks:
-  net0:
-    driver: bridge

From 8f6ce47c273e9c1c2393247c1bd09dc26bcde6dc Mon Sep 17 00:00:00 2001
From: Marco Cano
Date: Mon, 7 Oct 2024 16:13:38 -0700
Subject: [PATCH 11/20] fix: :hammer: query phrase match priority, hierarchy
 view, prefix included in results

---
 discovery/pipeline.py                       | 9 ++++++---
 nuxt-app/components/SchemaRegistryItem.vue  | 3 ++-
 nuxt-app/pages/markup-generator/index.vue   | 2 +-
 nuxt-app/pages/ns/[namespace]/[[query]].vue | 2 +-
 nuxt-app/pages/registry/index.vue           | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/discovery/pipeline.py b/discovery/pipeline.py
index 9ad96e30..a8fb3ccd 100644
--- a/discovery/pipeline.py
+++ b/discovery/pipeline.py
@@ -28,10 +28,13 @@ def default_string_query(self, q, options):
             "query": {
                 "dis_max": {
                     "queries": [
-                        {"term": {"_id": {"value": q, "boost": 15.0}}},
-                        {"term": {"label.raw": {"value": q, "boost": 10.0}}},
+                        # Prioritize phrase matches with a higher boost
+                        {"match_phrase": {"label": {"query": q, "boost": 20.0, "case_insensitive": True}}},
+                        {"match_phrase": {"name": {"query": q, "boost": 15.0, "case_insensitive": True}}},
+                        # Fallback to term and match queries with lower priority
+                        {"term": {"_id": {"value": q, "boost": 10.0}}},
+                        {"term": {"label.raw": {"value": q, "boost": 8.0}}},
                         {"term": {"_meta.username": {"value": q}}},
-                        {"term": {"name": {"value": q}}},
                         {"match": {"parent_classes": {"query": q}}},
                         {"prefix": {"label": {"value": q}}},
                         {"query_string": {"query": q}},
diff --git a/nuxt-app/components/SchemaRegistryItem.vue b/nuxt-app/components/SchemaRegistryItem.vue
index 98f35c95..e572cdb7 100644
--- a/nuxt-app/components/SchemaRegistryItem.vue
+++ b/nuxt-app/components/SchemaRegistryItem.vue
@@ -23,7 +23,8 @@
- {{ item.label }} + {{ item.namespace }}:{{ item.label }}

diff --git a/nuxt-app/pages/markup-generator/index.vue b/nuxt-app/pages/markup-generator/index.vue index b6db96cb..04ebf869 100644 --- a/nuxt-app/pages/markup-generator/index.vue +++ b/nuxt-app/pages/markup-generator/index.vue @@ -141,7 +141,7 @@ onMounted(() => { });
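A hedged sketch of exercising the reprioritized dis_max query from PATCH 11 directly against Elasticsearch with the official Python client; the index name is a guess, and the case_insensitive flags are omitted here since analyzed match_phrase queries are case-insensitive by default:

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
q = "Dataset"

# Mirrors the reordered clause list: phrase matches outrank exact terms,
# which outrank the prefix and free-form fallbacks
query = {
    "dis_max": {
        "queries": [
            {"match_phrase": {"label": {"query": q, "boost": 20.0}}},
            {"match_phrase": {"name": {"query": q, "boost": 15.0}}},
            {"term": {"_id": {"value": q, "boost": 10.0}}},
            {"term": {"label.raw": {"value": q, "boost": 8.0}}},
            {"prefix": {"label": {"value": q}}},
            {"query_string": {"query": q}},
        ]
    }
}

hits = es.search(index="discover_schema_class", query=query)["hits"]["hits"]
for hit in hits[:5]:
    print(hit["_score"], hit["_id"])
```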