From 85fde3e3742db5ada0cf19d3324ff36e5b94adfd Mon Sep 17 00:00:00 2001
From: Keyur Shah
Date: Fri, 11 Oct 2024 15:29:00 -0700
Subject: [PATCH] Create colab notebook demonstrating loading data into a custom DC. (#287)

---
 ...Your_Data_Commons_Load_Data_Workflow.ipynb | 324 ++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb

diff --git a/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb b/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb
new file mode 100644
index 0000000..e79189e
--- /dev/null
+++ b/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb
@@ -0,0 +1,324 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "mount_file_id": "1-mn6074ewnVaAekb0_WXtns6IdNK3HGu",
+      "authorship_tag": "ABX9TyOVQ2i3HxmNED6HbFpIulxM",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Copyright 2024 Google LLC.\n",
+        "SPDX-License-Identifier: Apache-2.0"
+      ],
+      "metadata": {
+        "id": "j0scfBZnWZTL"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Y8Tm9pY0iYiZ"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "# Install dependencies.\n",
+        "# Note: %%capture must be the first line of the cell for it to suppress pip output.\n",
+        "!pip install google-cloud-storage\n",
+        "!pip install google-cloud-run"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# User parameters\n",
+        "\n",
+        "# Replace the values in this section with your own.\n",
+        "# Running without replacing them will likely fail because you won't have permissions for these resources.\n",
+        "\n",
+        "# Path to your local folder with files to be uploaded to GCS.\n",
+        "# Note that Colab works best with Google Drive, so upload your data to Drive and use that path here.\n",
+        "# This is only needed when working within Colab; standalone scripts can use local folder paths directly.\n",
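+        "# For example, a standalone script might point this at a plain local\n",
+        "# directory instead (hypothetical path, not part of this workflow):\n",
+        "# google_drive_folder_path = '/home/me/demo-data'\n",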
+        "google_drive_folder_path = '/content/drive/MyDrive/demo-data'\n",
+        "\n",
+        "# Your GCP project ID.\n",
+        "# This notebook uses this project ID for all GCP resources - GCS, Cloud SQL, Cloud Run.\n",
+        "# If your resources are spread across different projects, specify the different project IDs accordingly.\n",
+        "gcp_project_id = 'datcom-website-dev'\n",
+        "\n",
+        "# The GCS bucket where your data will be uploaded.\n",
+        "gcs_bucket_name = 'customdc-data'\n",
+        "# The folder under the GCS bucket where your data will be uploaded.\n",
+        "gcs_folder_path = 'load-data-workflow-demo'\n",
+        "\n",
+        "# The name of the Cloud SQL DB where your data will be imported.\n",
+        "# Note that DB names can contain only alphanumeric characters;\n",
+        "# anything else fails with obscure error messages.\n",
+        "cloud_sql_db_name = 'demodb'\n",
+        "\n",
+        "# The region where your Cloud Run jobs and services are deployed.\n",
+        "cloud_run_region = 'us-central1'\n",
+        "\n",
+        "# Cloud Run resources.\n",
+        "# This notebook does not create a new job or service; it runs an existing job and deploys to an existing service.\n",
+        "# You can also create them programmatically, but that requires different APIs, as sketched below.\n",
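+        "#\n",
+        "# A minimal, untested sketch of programmatic job creation with the\n",
+        "# google-cloud-run client. The image name is a placeholder, not part of\n",
+        "# this workflow:\n",
+        "#\n",
+        "#   jobs_client = run_v2.JobsClient()\n",
+        "#   jobs_client.create_job(request=run_v2.CreateJobRequest(\n",
+        "#       parent=f'projects/{gcp_project_id}/locations/{cloud_run_region}',\n",
+        "#       job_id=load_data_cloud_run_job_name,\n",
+        "#       job=run_v2.Job(template=run_v2.ExecutionTemplate(\n",
+        "#           template=run_v2.TaskTemplate(containers=[\n",
+        "#               run_v2.Container(image='gcr.io/your-project/your-job-image')])))))\n",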
+        "\n",
+        "# Name of the Cloud Run job used for loading data.\n",
+        "load_data_cloud_run_job_name = 'demo-job'\n",
+        "# Name of the Cloud Run service for your Data Commons instance.\n",
+        "datacommons_cloud_run_service_name = 'demo-service'\n",
+        "# The name of the docker image that will be deployed to the Cloud Run service.\n",
+        "datacommons_service_image = 'gcr.io/datcom-ci/datacommons-services:latest'\n",
+        "\n",
+        "full_gcs_path = f'gs://{gcs_bucket_name}/{gcs_folder_path}'\n",
+        "full_gcs_path"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "id": "34H15wBAi5G3",
+        "outputId": "3a87c029-2351-459a-d60d-fbfc0d644e97"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'gs://customdc-data/load-data-workflow-demo'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 21
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Imports\n",
+        "\n",
+        "import os\n",
+        "import glob\n",
+        "from datetime import datetime\n",
+        "from google.colab import auth\n",
+        "from google.colab import drive\n",
+        "from google.cloud import storage\n",
+        "from google.cloud import run_v2"
+      ],
+      "metadata": {
+        "id": "Hcz3Nhg1lKcn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Authenticate the user, mount Google Drive, and instantiate GCP clients\n",
+        "auth.authenticate_user()\n",
+        "\n",
+        "drive.mount('/content/drive')\n",
+        "\n",
+        "gcs_client = storage.Client(project=gcp_project_id)\n",
+        "gcs_bucket = gcs_client.bucket(gcs_bucket_name)"
+      ],
+      "metadata": {
+        "id": "-FvqSgDdlOTo",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "4b5d2b0c-d116-48f1-b809-966d08c6d5d8"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Workflow functions\n",
+        "\n",
+        "def upload_local_directory_to_gcs():\n",
+        "  assert os.path.isdir(google_drive_folder_path), f'Not a directory: {google_drive_folder_path}'\n",
+        "  # recursive=True makes '/**' match nested files as well as top-level ones.\n",
+        "  for local_file in glob.glob(google_drive_folder_path + '/**', recursive=True):\n",
+        "    if os.path.isfile(local_file):\n",
+        "      remote_path = os.path.join(gcs_folder_path, local_file[1 + len(google_drive_folder_path):])\n",
+        "      blob = gcs_bucket.blob(remote_path)\n",
+        "      blob.upload_from_filename(local_file)\n",
+        "      print(f'Uploaded {local_file} to gs://{gcs_bucket_name}/{remote_path}')\n",
+        "\n",
+        "def run_load_data_job():\n",
+        "  client = run_v2.JobsClient()\n",
+        "\n",
+        "  job_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/jobs/{load_data_cloud_run_job_name}'\n",
+        "\n",
+        "  env_vars = [\n",
+        "      run_v2.EnvVar(name='RUN_TIMESTAMP', value=str(datetime.now())),\n",
+        "      run_v2.EnvVar(name='INPUT_DIR', value=full_gcs_path),\n",
+        "      run_v2.EnvVar(name='OUTPUT_DIR', value=full_gcs_path),\n",
+        "      run_v2.EnvVar(name='DB_NAME', value=cloud_sql_db_name)\n",
+        "  ]\n",
+        "\n",
+        "  request = run_v2.RunJobRequest(\n",
+        "      name=job_path,\n",
+        "      etag='*',  # Use * for the latest revision.\n",
+        "      validate_only=False,\n",
+        "      overrides=run_v2.RunJobRequest.Overrides(\n",
+        "          task_count=1,\n",
+        "          container_overrides=[\n",
+        "              run_v2.RunJobRequest.Overrides.ContainerOverride(env=env_vars)\n",
+        "          ],\n",
+        "      ),\n",
+        "  )\n",
+        "\n",
+        "  operation = client.run_job(request=request)\n",
+        "\n",
+        "  logs_url = f'https://console.cloud.google.com/run/jobs/details/{cloud_run_region}/{load_data_cloud_run_job_name}/logs?project={gcp_project_id}'\n",
+        "\n",
+        "  print('Waiting for load data job to complete. This will take a few minutes...')\n",
+        "  print(f'You can monitor the logs at: {logs_url}')\n",
+        "  response = operation.result()\n",
+        "  print(f'Load data job completed: {response.name}')\n",
+        "\n",
+        "def deploy_datacommons_service():\n",
+        "  client = run_v2.ServicesClient()\n",
+        "\n",
+        "  service_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/services/{datacommons_cloud_run_service_name}'\n",
+        "\n",
+        "  env = {\n",
+        "      'DEPLOY_TIMESTAMP': str(datetime.now()),\n",
+        "      'INPUT_DIR': full_gcs_path,\n",
+        "      'OUTPUT_DIR': full_gcs_path,\n",
+        "      'DB_NAME': cloud_sql_db_name\n",
+        "  }\n",
+        "\n",
+        "  service = client.get_service(name=service_path)\n",
+        "  container = service.template.containers[0]\n",
+        "  # Deploy the configured service image.\n",
+        "  container.image = datacommons_service_image\n",
+        "\n",
+        "  # Overridden env vars first, then any existing vars not overridden above.\n",
+        "  env_vars = []\n",
+        "  for var_name, var_value in env.items():\n",
+        "    env_vars.append(run_v2.EnvVar(name=var_name, value=var_value))\n",
+        "  for env_var in container.env:\n",
+        "    if env_var.name not in env:\n",
+        "      env_vars.append(env_var)\n",
+        "\n",
+        "  container.env = env_vars\n",
+        "  service.template.containers = [container]\n",
+        "\n",
+        "  request = run_v2.UpdateServiceRequest(\n",
+        "      service=service,\n",
+        "      validate_only=False,\n",
+        "      allow_missing=False,\n",
+        "  )\n",
+        "\n",
+        "  operation = client.update_service(request=request)\n",
+        "\n",
+        "  logs_url = f'https://console.cloud.google.com/run/detail/{cloud_run_region}/{datacommons_cloud_run_service_name}/logs?project={gcp_project_id}'\n",
+        "\n",
+        "  print('Waiting for service to get deployed. This will take a few minutes...')\n",
+        "  print(f'You can monitor the logs at: {logs_url}')\n",
+        "  response = operation.result()\n",
+        "  print(f'Service deployed: {response.name}')\n",
+        "  print(f'Service URL: {response.urls[-1]}')\n",
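+        "\n",
+        "# Untested sketch of an optional post-deploy sanity check. It assumes the\n",
+        "# service allows unauthenticated requests; adjust for your setup.\n",
+        "# def check_service(service_url):\n",
+        "#   import requests\n",
+        "#   resp = requests.get(service_url)\n",
+        "#   print(f'GET {service_url} -> {resp.status_code}')"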
+      ],
+      "metadata": {
+        "id": "GDUiNaWLmGEf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Execute workflow\n",
+        "\n",
+        "print(\"\\n==== STEP 1 of 3: Uploading data to GCS ====\\n\")\n",
+        "upload_local_directory_to_gcs()\n",
+        "\n",
+        "print(\"\\n==== STEP 2 of 3: Loading data in your datacommons store ====\\n\")\n",
+        "run_load_data_job()\n",
+        "\n",
+        "print(\"\\n==== STEP 3 of 3: Deploying your datacommons service ====\\n\")\n",
+        "deploy_datacommons_service()\n",
+        "\n",
+        "print(\"\\n==== DONE ====\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "agD4W8TfnHh5",
+        "outputId": "3ad21d6b-e489-4533-8fc6-fd50006bdaf1"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "==== STEP 1 of 3: Uploading data to GCS ====\n",
+            "\n",
+            "Uploaded /content/drive/MyDrive/demo-data/gender_wage_gap.csv to gs://customdc-data/load-data-workflow-demo/gender_wage_gap.csv\n",
+            "Uploaded /content/drive/MyDrive/demo-data/config.json to gs://customdc-data/load-data-workflow-demo/config.json\n",
+            "Uploaded /content/drive/MyDrive/demo-data/average_annual_wage.csv to gs://customdc-data/load-data-workflow-demo/average_annual_wage.csv\n",
+            "\n",
+            "==== STEP 2 of 3: Loading data in your datacommons store ====\n",
+            "\n",
+            "Waiting for load data job to complete. This will take a few minutes...\n",
+            "You can monitor the logs at: https://console.cloud.google.com/run/jobs/details/us-central1/demo-job/logs?project=datcom-website-dev\n",
+            "Load data job completed: projects/datcom-website-dev/locations/us-central1/jobs/demo-job/executions/demo-job-2fhrs\n",
+            "\n",
+            "==== STEP 3 of 3: Deploying your datacommons service ====\n",
+            "\n",
+            "Waiting for service to get deployed. This will take a few minutes...\n",
+            "You can monitor the logs at: https://console.cloud.google.com/run/detail/us-central1/demo-service/logs?project=datcom-website-dev\n",
+            "Service deployed: projects/datcom-website-dev/locations/us-central1/services/demo-service\n",
+            "Service URL: https://demo-service-kqb7thiuka-uc.a.run.app\n",
+            "\n",
+            "==== DONE ====\n"
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file