From 85fde3e3742db5ada0cf19d3324ff36e5b94adfd Mon Sep 17 00:00:00 2001
From: Keyur Shah
Date: Fri, 11 Oct 2024 15:29:00 -0700
Subject: [PATCH] Create colab notebook demonstrating loading data into a custom DC. (#287)

---
 ...Your_Data_Commons_Load_Data_Workflow.ipynb | 324 ++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb

diff --git a/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb b/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb
new file mode 100644
index 0000000..e79189e
--- /dev/null
+++ b/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb
@@ -0,0 +1,324 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "mount_file_id": "1-mn6074ewnVaAekb0_WXtns6IdNK3HGu",
+      "authorship_tag": "ABX9TyOVQ2i3HxmNED6HbFpIulxM",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Copyright 2024 Google LLC.\n",
+        "SPDX-License-Identifier: Apache-2.0"
+      ],
+      "metadata": {
+        "id": "j0scfBZnWZTL"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Y8Tm9pY0iYiZ"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "# Install dependencies.\n",
+        "# Note: %%capture must be the first line of the cell for it to suppress pip output.\n",
+        "!pip install google-cloud-storage\n",
+        "!pip install google-cloud-run"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# User parameters\n",
+        "\n",
+        "# Replace the values in this section with your own.\n",
+        "# Running without replacing them will likely fail because you won't have permissions for these resources.\n",
+        "\n",
+        "# Path to your local folder with files to be uploaded to GCS.\n",
+        "# Note that Colab works best with Google Drive, so upload your data to Drive and use that path here.\n",
+        "# This is only needed when working within Colab; standalone scripts can use local folder paths directly.\n",
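+        "# For example, a standalone script might point this at a plain local\n",
+        "# directory instead (hypothetical path, not part of this workflow):\n",
+        "# google_drive_folder_path = '/home/me/demo-data'\n",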
+        "google_drive_folder_path = '/content/drive/MyDrive/demo-data'\n",
+        "\n",
+        "# Your GCP project ID.\n",
+        "# This notebook uses this project ID for all GCP resources - GCS, Cloud SQL, Cloud Run.\n",
+        "# If your resources are spread across different projects, specify the different project IDs accordingly.\n",
+        "gcp_project_id = 'datcom-website-dev'\n",
+        "\n",
+        "# The GCS bucket where your data will be uploaded.\n",
+        "gcs_bucket_name = 'customdc-data'\n",
+        "# The folder under the GCS bucket where your data will be uploaded.\n",
+        "gcs_folder_path = 'load-data-workflow-demo'\n",
+        "\n",
+        "# The name of the Cloud SQL DB where your data will be imported.\n",
+        "# Note that DB names can contain only alphanumeric characters;\n",
+        "# anything else fails with obscure error messages.\n",
+        "cloud_sql_db_name = 'demodb'\n",
+        "\n",
+        "# The region where your Cloud Run jobs and services are deployed.\n",
+        "cloud_run_region = 'us-central1'\n",
+        "\n",
+        "# Cloud Run resources.\n",
+        "# This notebook does not create a new job or service; it runs an existing job and deploys to an existing service.\n",
+        "# You can also create them programmatically, but that requires different APIs, as sketched below.\n",
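+        "#\n",
+        "# A minimal, untested sketch of programmatic job creation with the\n",
+        "# google-cloud-run client. The image name is a placeholder, not part of\n",
+        "# this workflow:\n",
+        "#\n",
+        "#   jobs_client = run_v2.JobsClient()\n",
+        "#   jobs_client.create_job(request=run_v2.CreateJobRequest(\n",
+        "#       parent=f'projects/{gcp_project_id}/locations/{cloud_run_region}',\n",
+        "#       job_id=load_data_cloud_run_job_name,\n",
+        "#       job=run_v2.Job(template=run_v2.ExecutionTemplate(\n",
+        "#           template=run_v2.TaskTemplate(containers=[\n",
+        "#               run_v2.Container(image='gcr.io/your-project/your-job-image')])))))\n",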
+        "\n",
+        "# Name of the Cloud Run job used for loading data.\n",
+        "load_data_cloud_run_job_name = 'demo-job'\n",
+        "# Name of the Cloud Run service for your Data Commons instance.\n",
+        "datacommons_cloud_run_service_name = 'demo-service'\n",
+        "# The name of the docker image that will be deployed to the Cloud Run service.\n",
+        "datacommons_service_image = 'gcr.io/datcom-ci/datacommons-services:latest'\n",
+        "\n",
+        "full_gcs_path = f'gs://{gcs_bucket_name}/{gcs_folder_path}'\n",
+        "full_gcs_path"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "id": "34H15wBAi5G3",
+        "outputId": "3a87c029-2351-459a-d60d-fbfc0d644e97"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'gs://customdc-data/load-data-workflow-demo'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 21
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Imports\n",
+        "\n",
+        "import os\n",
+        "import glob\n",
+        "from datetime import datetime\n",
+        "from google.colab import auth\n",
+        "from google.colab import drive\n",
+        "from google.cloud import storage\n",
+        "from google.cloud import run_v2"
+      ],
+      "metadata": {
+        "id": "Hcz3Nhg1lKcn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Authenticate the user, mount Google Drive, and instantiate GCP clients\n",
+        "auth.authenticate_user()\n",
+        "\n",
+        "drive.mount('/content/drive')\n",
+        "\n",
+        "gcs_client = storage.Client(project=gcp_project_id)\n",
+        "gcs_bucket = gcs_client.bucket(gcs_bucket_name)"
+      ],
+      "metadata": {
+        "id": "-FvqSgDdlOTo",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "4b5d2b0c-d116-48f1-b809-966d08c6d5d8"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Workflow functions\n",
+        "\n",
+        "def upload_local_directory_to_gcs():\n",
+        "  assert os.path.isdir(google_drive_folder_path), f'Not a directory: {google_drive_folder_path}'\n",
+        "  # recursive=True makes '/**' match nested files as well as top-level ones.\n",
+        "  for local_file in glob.glob(google_drive_folder_path + '/**', recursive=True):\n",
+        "    if os.path.isfile(local_file):\n",
+        "      remote_path = os.path.join(gcs_folder_path, local_file[1 + len(google_drive_folder_path):])\n",
+        "      blob = gcs_bucket.blob(remote_path)\n",
+        "      blob.upload_from_filename(local_file)\n",
+        "      print(f'Uploaded {local_file} to gs://{gcs_bucket_name}/{remote_path}')\n",
+        "\n",
+        "def run_load_data_job():\n",
+        "  client = run_v2.JobsClient()\n",
+        "\n",
+        "  job_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/jobs/{load_data_cloud_run_job_name}'\n",
+        "\n",
+        "  env_vars = [\n",
+        "      run_v2.EnvVar(name='RUN_TIMESTAMP', value=str(datetime.now())),\n",
+        "      run_v2.EnvVar(name='INPUT_DIR', value=full_gcs_path),\n",
+        "      run_v2.EnvVar(name='OUTPUT_DIR', value=full_gcs_path),\n",
+        "      run_v2.EnvVar(name='DB_NAME', value=cloud_sql_db_name)\n",
+        "  ]\n",
+        "\n",
+        "  request = run_v2.RunJobRequest(\n",
+        "      name=job_path,\n",
+        "      etag='*',  # Use * for the latest revision.\n",
+        "      validate_only=False,\n",
+        "      overrides=run_v2.RunJobRequest.Overrides(\n",
+        "          task_count=1,\n",
+        "          container_overrides=[\n",
+        "              run_v2.RunJobRequest.Overrides.ContainerOverride(env=env_vars)\n",
+        "          ],\n",
+        "      ),\n",
+        "  )\n",
+        "\n",
+        "  operation = client.run_job(request=request)\n",
+        "\n",
+        "  logs_url = f'https://console.cloud.google.com/run/jobs/details/{cloud_run_region}/{load_data_cloud_run_job_name}/logs?project={gcp_project_id}'\n",
+        "\n",
+        "  print('Waiting for load data job to complete. This will take a few minutes...')\n",
+        "  print(f'You can monitor the logs at: {logs_url}')\n",
+        "  response = operation.result()\n",
+        "  print(f'Load data job completed: {response.name}')\n",
+        "\n",
+        "def deploy_datacommons_service():\n",
+        "  client = run_v2.ServicesClient()\n",
+        "\n",
+        "  service_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/services/{datacommons_cloud_run_service_name}'\n",
+        "\n",
+        "  env = {\n",
+        "      'DEPLOY_TIMESTAMP': str(datetime.now()),\n",
+        "      'INPUT_DIR': full_gcs_path,\n",
+        "      'OUTPUT_DIR': full_gcs_path,\n",
+        "      'DB_NAME': cloud_sql_db_name\n",
+        "  }\n",
+        "\n",
+        "  service = client.get_service(name=service_path)\n",
+        "  container = service.template.containers[0]\n",
+        "  # Deploy the configured service image.\n",
+        "  container.image = datacommons_service_image\n",
+        "\n",
+        "  # Overridden env vars first, then any existing vars not overridden above.\n",
+        "  env_vars = []\n",
+        "  for var_name, var_value in env.items():\n",
+        "    env_vars.append(run_v2.EnvVar(name=var_name, value=var_value))\n",
+        "  for env_var in container.env:\n",
+        "    if env_var.name not in env:\n",
+        "      env_vars.append(env_var)\n",
+        "\n",
+        "  container.env = env_vars\n",
+        "  service.template.containers = [container]\n",
+        "\n",
+        "  request = run_v2.UpdateServiceRequest(\n",
+        "      service=service,\n",
+        "      validate_only=False,\n",
+        "      allow_missing=False,\n",
+        "  )\n",
+        "\n",
+        "  operation = client.update_service(request=request)\n",
+        "\n",
+        "  logs_url = f'https://console.cloud.google.com/run/detail/{cloud_run_region}/{datacommons_cloud_run_service_name}/logs?project={gcp_project_id}'\n",
+        "\n",
+        "  print('Waiting for service to get deployed. This will take a few minutes...')\n",
+        "  print(f'You can monitor the logs at: {logs_url}')\n",
+        "  response = operation.result()\n",
+        "  print(f'Service deployed: {response.name}')\n",
+        "  print(f'Service URL: {response.urls[-1]}')\n",
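+        "\n",
+        "# Untested sketch of an optional post-deploy sanity check. It assumes the\n",
+        "# service allows unauthenticated requests; adjust for your setup.\n",
+        "# def check_service(service_url):\n",
+        "#   import requests\n",
+        "#   resp = requests.get(service_url)\n",
+        "#   print(f'GET {service_url} -> {resp.status_code}')"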
+      ],
+      "metadata": {
+        "id": "GDUiNaWLmGEf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Execute workflow\n",
+        "\n",
+        "print(\"\\n==== STEP 1 of 3: Uploading data to GCS ====\\n\")\n",
+        "upload_local_directory_to_gcs()\n",
+        "\n",
+        "print(\"\\n==== STEP 2 of 3: Loading data in your datacommons store ====\\n\")\n",
+        "run_load_data_job()\n",
+        "\n",
+        "print(\"\\n==== STEP 3 of 3: Deploying your datacommons service ====\\n\")\n",
+        "deploy_datacommons_service()\n",
+        "\n",
+        "print(\"\\n==== DONE ====\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "agD4W8TfnHh5",
+        "outputId": "3ad21d6b-e489-4533-8fc6-fd50006bdaf1"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "==== STEP 1 of 3: Uploading data to GCS ====\n",
+            "\n",
+            "Uploaded /content/drive/MyDrive/demo-data/gender_wage_gap.csv to gs://customdc-data/load-data-workflow-demo/gender_wage_gap.csv\n",
+            "Uploaded /content/drive/MyDrive/demo-data/config.json to gs://customdc-data/load-data-workflow-demo/config.json\n",
+            "Uploaded /content/drive/MyDrive/demo-data/average_annual_wage.csv to gs://customdc-data/load-data-workflow-demo/average_annual_wage.csv\n",
+            "\n",
+            "==== STEP 2 of 3: Loading data in your datacommons store ====\n",
+            "\n",
+            "Waiting for load data job to complete. This will take a few minutes...\n",
+            "You can monitor the logs at: https://console.cloud.google.com/run/jobs/details/us-central1/demo-job/logs?project=datcom-website-dev\n",
+            "Load data job completed: projects/datcom-website-dev/locations/us-central1/jobs/demo-job/executions/demo-job-2fhrs\n",
+            "\n",
+            "==== STEP 3 of 3: Deploying your datacommons service ====\n",
+            "\n",
+            "Waiting for service to get deployed. This will take a few minutes...\n",
+            "You can monitor the logs at: https://console.cloud.google.com/run/detail/us-central1/demo-service/logs?project=datcom-website-dev\n",
+            "Service deployed: projects/datcom-website-dev/locations/us-central1/services/demo-service\n",
+            "Service URL: https://demo-service-kqb7thiuka-uc.a.run.app\n",
+            "\n",
+            "==== DONE ====\n"
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file