Create colab notebook demonstrating loading data into a custom DC. (#287)
keyurva authored Oct 11, 2024
1 parent cd7d72d commit 85fde3e
Showing 1 changed file with 324 additions and 0 deletions.
324 changes: 324 additions & 0 deletions notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb
@@ -0,0 +1,324 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"mount_file_id": "1-mn6074ewnVaAekb0_WXtns6IdNK3HGu",
"authorship_tag": "ABX9TyOVQ2i3HxmNED6HbFpIulxM",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/datacommonsorg/tools/blob/master/notebooks/Your_Data_Commons_Load_Data_Workflow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"Copyright 2024 Google LLC.\n",
"SPDX-License-Identifier: Apache-2.0"
],
"metadata": {
"id": "j0scfBZnWZTL"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Y8Tm9pY0iYiZ"
},
"outputs": [],
"source": [
"# Install dependencies\n",
"\n",
"%%capture\n",
"!pip install google-cloud-storage\n",
"!pip install google-cloud-run"
]
},
{
"cell_type": "code",
"source": [
"# User parameters\n",
"\n",
"# Replace the values in this section with yours.\n",
"# Running without replacement can lead to errors since you won't have permissions to those resources.\n",
"\n",
"# Path to your local folder with files to be uploaded to GCS.\n",
"# Note that colab works best with Google drive, so upload your data to a drive and use that path here.\n",
"# This is only needed when working within colab. With standalone scripts, you can obviously use local folder paths.\n",
"google_drive_folder_path = '/content/drive/MyDrive/demo-data'\n",
"\n",
"# Your GCP project ID.\n",
"# This notebook uses this project ID for all GCP resources - GCS, Cloud SQL, Cloud Run.\n",
"# If your resources are spread across different projects, you will need to specify different project IDs accordingly.\n",
"gcp_project_id = 'datcom-website-dev'\n",
"\n",
"# The GCS bucket where your data will be uploaded.\n",
"gcs_bucket_name = 'customdc-data'\n",
"# The folder under the GCP bucket where your data will be uploaded.\n",
"gcs_folder_path = 'load-data-workflow-demo'\n",
"\n",
"# The name of your Cloud SQL DB where your data will be imported.\n",
"# Note that DB names can only have alpha-numeric characters.\n",
"# Otherwise it fails with obscure messages.\n",
"cloud_sql_db_name = 'demodb'\n",
"\n",
"# The region where your cloud run jobs and services are deployed.\n",
"cloud_run_region = 'us-central1'\n",
"\n",
"# Cloud Run resources.\n",
"# The script does not create a new job or service but runs and deploys existing ones respectively.\n",
"# You can programmatically creates them as well but will need to use different APIs to do so.\n",
"\n",
"# Name of the cloud run job used for loading data.\n",
"load_data_cloud_run_job_name = 'demo-job'\n",
"# Name of the cloud run service for your datacommons instance.\n",
"datacommons_cloud_run_service_name = 'demo-service'\n",
"# The name of the docker image that will be deployed to the cloud run service.\n",
"datacommons_service_image = 'gcr.io/datcom-ci/datacommons-services:latest'\n",
"\n",
"full_gcs_path = f'gs://{gcs_bucket_name}/{gcs_folder_path}'\n",
"full_gcs_path"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "34H15wBAi5G3",
"outputId": "3a87c029-2351-459a-d60d-fbfc0d644e97"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'gs://customdc-data/load-data-workflow-demo'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"source": [
"# Imports\n",
"\n",
"import os\n",
"import glob\n",
"from datetime import datetime\n",
"from google.colab import auth\n",
"from google.cloud import storage\n",
"from google.cloud import run_v2\n",
"from google.colab import drive"
],
"metadata": {
"id": "Hcz3Nhg1lKcn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Authenticate user, mount google drive and instantiate GCP objects\n",
"auth.authenticate_user()\n",
"\n",
"drive.mount('/content/drive')\n",
"\n",
"gcs_client = storage.Client(project=gcp_project_id)\n",
"gcs_bucket = gcs_client.bucket(gcs_bucket_name)"
],
"metadata": {
"id": "-FvqSgDdlOTo",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4b5d2b0c-d116-48f1-b809-966d08c6d5d8"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Workflow functions\n",
"\n",
"def upload_local_directory_to_gcs():\n",
" assert os.path.isdir(google_drive_folder_path), f'Not a directory: {google_drive_folder_path}'\n",
" for local_file in glob.glob(google_drive_folder_path + '/**'):\n",
" if os.path.isfile(local_file):\n",
" remote_path = os.path.join(gcs_folder_path, local_file[1 + len(google_drive_folder_path):])\n",
" blob = gcs_bucket.blob(remote_path)\n",
" blob.upload_from_filename(local_file)\n",
" print(f'Uploaded {local_file} to gs://{gcs_bucket_name}/{remote_path}')\n",
"\n",
"def run_load_data_job():\n",
" client = run_v2.JobsClient()\n",
"\n",
" job_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/jobs/{load_data_cloud_run_job_name}'\n",
"\n",
" env_vars = [\n",
" run_v2.EnvVar(name='RUN_TIMESTAMP', value=str(datetime.now())),\n",
" run_v2.EnvVar(name='INPUT_DIR', value=full_gcs_path),\n",
" run_v2.EnvVar(name='OUTPUT_DIR', value=full_gcs_path),\n",
" run_v2.EnvVar(name='DB_NAME', value=cloud_sql_db_name)\n",
" ]\n",
"\n",
" request = run_v2.RunJobRequest(\n",
" name=job_path,\n",
" etag='*', # Use * for latest revision\n",
" validate_only=False,\n",
" overrides=run_v2.RunJobRequest.Overrides(\n",
" task_count=1,\n",
" container_overrides=[\n",
" run_v2.RunJobRequest.Overrides.ContainerOverride(env=env_vars)\n",
" ],\n",
" ),\n",
" )\n",
"\n",
" operation = client.run_job(request=request)\n",
"\n",
" logs_url = f'https://console.cloud.google.com/run/jobs/details/{cloud_run_region}/{load_data_cloud_run_job_name}/logs?project={gcp_project_id}'\n",
"\n",
" print('Waiting for load data job to complete. This will take a few minutes...')\n",
" print(f'You can monitor the logs at: {logs_url}')\n",
" response = operation.result()\n",
" print(f'Load data job completed: {response.name}')\n",
"\n",
"def deploy_datacommons_service():\n",
" client = run_v2.ServicesClient()\n",
"\n",
" service_path = f'projects/{gcp_project_id}/locations/{cloud_run_region}/services/{datacommons_cloud_run_service_name}'\n",
"\n",
" env = {\n",
" 'DEPLOY_TIMESTAMP': str(datetime.now()),\n",
" 'INPUT_DIR': full_gcs_path,\n",
" 'OUTPUT_DIR': full_gcs_path,\n",
" 'DB_NAME': cloud_sql_db_name\n",
" }\n",
"\n",
" service = client.get_service(name=service_path)\n",
" container = service.template.containers[0]\n",
" env_vars = []\n",
"\n",
" for var_name, var_value in env.items():\n",
" env_vars.append(run_v2.EnvVar(name=var_name, value=var_value))\n",
"\n",
" for env_var in container.env:\n",
" var_name = env_var.name\n",
" if var_name not in env:\n",
" env_vars.append(env_var)\n",
"\n",
" container.env = env_vars\n",
" service.template.containers = [container]\n",
"\n",
" request = run_v2.UpdateServiceRequest(\n",
" service=service,\n",
" validate_only=False,\n",
" allow_missing=False,\n",
" )\n",
"\n",
" operation = client.update_service(request=request)\n",
"\n",
" logs_url = f'https://console.cloud.google.com/run/detail/{cloud_run_region}/{datacommons_cloud_run_service_name}/logs?project={gcp_project_id}'\n",
"\n",
" print('Waiting for service to get deployed. This will take a few minutes...')\n",
" print(f'You can monitor the logs at: {logs_url}')\n",
" response = operation.result()\n",
" print(f'Service deployed: {response.name}')\n",
" print(f'Service URL: {response.urls[-1]}')"
],
"metadata": {
"id": "GDUiNaWLmGEf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Execute workflow\n",
"\n",
"print(\"\\n==== STEP 1 of 3: Uploading data to GCS ====\\n\")\n",
"upload_local_directory_to_gcs()\n",
"\n",
"print(\"\\n==== STEP 2 of 3: Loading data in your datacommons store ====\\n\")\n",
"run_load_data_job()\n",
"\n",
"print(\"\\n==== STEP 3 of 3: Deploying your datacommons service ====\\n\")\n",
"deploy_datacommons_service()\n",
"\n",
"print(\"\\n==== DONE ====\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "agD4W8TfnHh5",
"outputId": "3ad21d6b-e489-4533-8fc6-fd50006bdaf1"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"==== STEP 1 of 3: Uploading data to GCS ====\n",
"\n",
"Uploaded /content/drive/MyDrive/demo-data/gender_wage_gap.csv to gs://customdc-data/load-data-workflow-demo/gender_wage_gap.csv\n",
"Uploaded /content/drive/MyDrive/demo-data/config.json to gs://customdc-data/load-data-workflow-demo/config.json\n",
"Uploaded /content/drive/MyDrive/demo-data/average_annual_wage.csv to gs://customdc-data/load-data-workflow-demo/average_annual_wage.csv\n",
"\n",
"==== STEP 2 of 3: Loading data in your datacommons store ====\n",
"\n",
"Waiting for load data job to complete. This will take a few minutes...\n",
"You can monitor the logs at: https://console.cloud.google.com/run/jobs/details/us-central1/demo-job/logs?project=datcom-website-dev\n",
"Load data job completed: projects/datcom-website-dev/locations/us-central1/jobs/demo-job/executions/demo-job-2fhrs\n",
"\n",
"==== STEP 3 of 3: Deploying your datacommons service ====\n",
"\n",
"Waiting for service to get deployed. This will take a few minutes...\n",
"You can monitor the logs at: https://console.cloud.google.com/run/detail/us-central1/demo-service/logs?project=datcom-website-dev\n",
"Service deployed: projects/datcom-website-dev/locations/us-central1/services/demo-service\n",
"Service URL: https://demo-service-kqb7thiuka-uc.a.run.app\n",
"\n",
"==== DONE ====\n"
]
}
]
}
]
}
