From d3167433377f976751f78a1ce548e83e5fbaba7a Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Fri, 20 Oct 2023 14:33:46 -0400 Subject: [PATCH 01/10] Introduce GCP_OUTPUT env variable. --- docker/gcp_pudl_etl.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index 2829576c9e..8820fa8832 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -2,6 +2,8 @@ # This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance. # This script won't work locally because it needs adequate GCP permissions. +: "${GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}" + set -x function send_slack_msg() { @@ -41,7 +43,6 @@ function run_pudl_etl() { function shutdown_vm() { # Copy the outputs to the GCS bucket - gsutil -m cp -r $PUDL_OUTPUT "gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF" upload_file_to_slack $LOGFILE "pudl_etl logs for $ACTION_SHA-$GITHUB_REF:" @@ -54,6 +55,11 @@ function shutdown_vm() { curl -X POST -H "Content-Length: 0" -H "Authorization: Bearer ${ACCESS_TOKEN}" https://compute.googleapis.com/compute/v1/projects/catalyst-cooperative-pudl/zones/$GCE_INSTANCE_ZONE/instances/$GCE_INSTANCE/stop } +function copy_outputs_to_gcs() { + echo "Copying outputs to GCP bucket $GCS_OUTPUT" + gsutil -m cp -r $PUDL_OUTPUT $GCS_OUTPUT +} + function copy_outputs_to_distribution_bucket() { echo "Copying outputs to GCP distribution bucket" gsutil -m -u $GCP_BILLING_PROJECT cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$GITHUB_REF" @@ -89,8 +95,12 @@ run_pudl_etl 2>&1 | tee $LOGFILE # Notify slack if the etl succeeded. if [[ ${PIPESTATUS[0]} == 0 ]]; then notify_slack "success" + copy_outputs_to_gcs # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag + # TODO: this behavior should be controlled by on/off switch here and this logic + # should be moved to the triggering github action. Having it here feels + # fragmented. if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then copy_outputs_to_distribution_bucket fi From c6ad30654606646453944c1333ba0a4420457598 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Fri, 20 Oct 2023 15:24:59 -0400 Subject: [PATCH 02/10] Rename variable to PUDL_GCS_OUTPUT --- docker/gcp_pudl_etl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index 8820fa8832..a6eeefb482 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -2,7 +2,7 @@ # This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance. # This script won't work locally because it needs adequate GCP permissions. -: "${GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}" +: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}" set -x @@ -56,8 +56,8 @@ function shutdown_vm() { } function copy_outputs_to_gcs() { - echo "Copying outputs to GCP bucket $GCS_OUTPUT" - gsutil -m cp -r $PUDL_OUTPUT $GCS_OUTPUT + echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT" + gsutil -m cp -r $PUDL_OUTPUT ${PUDL_GCS_OUTPUT} } function copy_outputs_to_distribution_bucket() { From ada7664afe235bcdc149b3e6e378c18370821291 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Fri, 20 Oct 2023 20:27:32 -0400 Subject: [PATCH 03/10] Publish nightly builds as YYYY-MM-DD-hhmm-${sha}-$ --- .github/workflows/build-deploy-pudl.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml index 6bdf750377..3c39223ae0 100644 --- a/.github/workflows/build-deploy-pudl.yml +++ b/.github/workflows/build-deploy-pudl.yml @@ -83,6 +83,11 @@ jobs: - name: Set up Cloud SDK uses: google-github-actions/setup-gcloud@v1 + - name: Determine commit information + run: |- + echo "COMMIT_BRANCH=$(gitrev-parse --abbrev-ref HEAD)" >> $GITHUB_ENV + echo "COMMIT_TIME=$(git log -1 --format=%cd --date=format:%Y-%m-%d-%H%M)" >> $GITHUB_ENV + # Deploy PUDL image to GCE - name: Deploy run: |- @@ -110,6 +115,7 @@ jobs: --container-env AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \ --container-env AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \ --container-env AWS_DEFAULT_REGION=${{ secrets.AWS_DEFAULT_REGION }} \ + --container-env PUDL_GCS_OUTPUT=gs://nightly-build-outputs.catalyst.coop/${{ env.COMMIT_TIME }}-${{ env.ACTION_SHA}-${{ env.COMMIT_BRANCH }} # Start the VM - name: Start the deploy-pudl-vm From f066781c7950e173402b4b19cf67cc1d4bced86d Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 6 Nov 2023 17:11:46 -0700 Subject: [PATCH 04/10] Use short shas for output directories. --- .github/workflows/build-deploy-pudl.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml index 7ce636d8e7..483a2dda7c 100644 --- a/.github/workflows/build-deploy-pudl.yml +++ b/.github/workflows/build-deploy-pudl.yml @@ -35,6 +35,7 @@ jobs: - name: Get HEAD of the branch (main or dev) run: | echo "ACTION_SHA=$(git rev-parse HEAD)" >> $GITHUB_ENV + echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV - name: Print action vars run: | @@ -123,7 +124,7 @@ jobs: --container-env DAGSTER_PG_HOST="104.154.182.24" \ --container-env DAGSTER_PG_DB="dagster-storage" \ --container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \ - --container-env PUDL_GCS_OUTPUT=${{ env.GCS_OUTPUT_BUCKET }}/${{ env.COMMIT_TIME }}-${{ env.ACTION_SHA }}-${{ env.COMMIT_BRANCH }} + --container-env PUDL_GCS_OUTPUT=${{ env.GCS_OUTPUT_BUCKET }}/${{ env.COMMIT_TIME }}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }} # Start the VM - name: Start the deploy-pudl-vm @@ -134,6 +135,6 @@ jobs: uses: slackapi/slack-github-action@v1.24.0 with: channel-id: "C03FHB9N0PQ" - slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.COMMIT_TIME}}-${{ env.ACTION_SHA }}-${{ env.COMMIT_BRANCH }}" + slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.COMMIT_TIME}}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }}" env: SLACK_BOT_TOKEN: ${{ secrets.PUDL_DEPLOY_SLACK_TOKEN }} From 6a47bd3bdfdeddec8ebc3c91c2d4da4ae3dda9db Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 27 Nov 2023 23:35:47 +0000 Subject: [PATCH 05/10] Move gcs output after the notifications --- docker/gcp_pudl_etl.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index 6b18b7e805..b8fe6cd05f 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -91,8 +91,6 @@ function notify_slack() { # 2>&1 redirects stderr to stdout. run_pudl_etl 2>&1 | tee $LOGFILE -copy_outputs_to_gcs - # Notify slack if the etl succeeded. if [[ ${PIPESTATUS[0]} == 0 ]]; then notify_slack "success" @@ -114,4 +112,5 @@ else notify_slack "failure" fi +copy_outputs_to_gcs shutdown_vm From cc0bbccd530d30ce1450fa58a470255f50f2823d Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 27 Nov 2023 23:39:05 +0000 Subject: [PATCH 06/10] Create success file in output when ETL is okay --- docker/gcp_pudl_etl.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index b8fe6cd05f..4257dc7529 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -38,6 +38,7 @@ function run_pudl_etl() { --gcs-cache-path=gs://internal-zenodo-cache.catalyst.coop \ --etl-settings=$PUDL_SETTINGS_YML \ --live-dbs test + && touch ${PUDL_OUTPUT}/success } function shutdown_vm() { From 3758dbca26a4a9cf40b1142ed80d4f1f64883dd9 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 27 Nov 2023 23:43:22 +0000 Subject: [PATCH 07/10] Fix path to etl config. --- .github/workflows/build-deploy-pudl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml index cdb3207686..b1b35d0e56 100644 --- a/.github/workflows/build-deploy-pudl.yml +++ b/.github/workflows/build-deploy-pudl.yml @@ -125,7 +125,7 @@ jobs: --container-env DAGSTER_PG_HOST="104.154.182.24" \ --container-env DAGSTER_PG_DB="dagster-storage" \ --container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \ - --container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \ + --container-env PUDL_SETTINGS_YML="/home/mambauser/src/pudl/package_data/settings/etl_full.yml" \ --container-env PUDL_GCS_OUTPUT=${{ env.GCS_OUTPUT_BUCKET }}/${{ env.COMMIT_TIME }}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }} # Start the VM From cdecdde9d0d1cb1843787c0a5a46b9ba25a9b0f1 Mon Sep 17 00:00:00 2001 From: rousik Date: Fri, 1 Dec 2023 18:41:41 +0000 Subject: [PATCH 08/10] Update conda-lock.yml and rendered conda environment files. --- environments/conda-linux-64.lock.yml | 2 +- environments/conda-lock.yml | 24 ++++++++++++------------ environments/conda-osx-64.lock.yml | 2 +- environments/conda-osx-arm64.lock.yml | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/environments/conda-linux-64.lock.yml b/environments/conda-linux-64.lock.yml index 2e9ca2e95f..d3798403b8 100644 --- a/environments/conda-linux-64.lock.yml +++ b/environments/conda-linux-64.lock.yml @@ -441,7 +441,7 @@ dependencies: - rich=13.7.0=pyhd8ed1ab_0 - sqlalchemy=2.0.23=py311h459d7ec_0 - stack_data=0.6.2=pyhd8ed1ab_0 - - starlette=0.32.0.post1=pyhd8ed1ab_0 + - starlette=0.33.0=pyhd8ed1ab_0 - tiledb=2.16.3=h8c794c1_3 - ukkonen=1.0.1=py311h9547e67_4 - uvicorn=0.24.0.post1=py311h38be061_0 diff --git a/environments/conda-lock.yml b/environments/conda-lock.yml index 036acbad90..6bb24c94d9 100644 --- a/environments/conda-lock.yml +++ b/environments/conda-lock.yml @@ -21167,45 +21167,45 @@ package: category: main optional: false - name: starlette - version: 0.32.0.post1 + version: 0.33.0 manager: conda platform: linux-64 dependencies: anyio: <5,>=3.4.0 python: ">=3.8" typing_extensions: ">=3.10.0" - url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.32.0.post1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.33.0-pyhd8ed1ab_0.conda hash: - md5: 9aa6d56db739eee2ff473becbe178fd1 - sha256: 9692b83467670b473dc71137376f735249ef2ee6eeefce9068b0dec94810c24c + md5: 55027cf7f50803f0f5ece8b661eff47b + sha256: 3923f4c3e31d8c3a9c574779585137ff834a6108558a8956ef93022d4fcb37a8 category: dev optional: true - name: starlette - version: 0.32.0.post1 + version: 0.33.0 manager: conda platform: osx-64 dependencies: python: ">=3.8" typing_extensions: ">=3.10.0" anyio: <5,>=3.4.0 - url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.32.0.post1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.33.0-pyhd8ed1ab_0.conda hash: - md5: 9aa6d56db739eee2ff473becbe178fd1 - sha256: 9692b83467670b473dc71137376f735249ef2ee6eeefce9068b0dec94810c24c + md5: 55027cf7f50803f0f5ece8b661eff47b + sha256: 3923f4c3e31d8c3a9c574779585137ff834a6108558a8956ef93022d4fcb37a8 category: dev optional: true - name: starlette - version: 0.32.0.post1 + version: 0.33.0 manager: conda platform: osx-arm64 dependencies: python: ">=3.8" typing_extensions: ">=3.10.0" anyio: <5,>=3.4.0 - url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.32.0.post1-pyhd8ed1ab_0.conda + url: https://conda.anaconda.org/conda-forge/noarch/starlette-0.33.0-pyhd8ed1ab_0.conda hash: - md5: 9aa6d56db739eee2ff473becbe178fd1 - sha256: 9692b83467670b473dc71137376f735249ef2ee6eeefce9068b0dec94810c24c + md5: 55027cf7f50803f0f5ece8b661eff47b + sha256: 3923f4c3e31d8c3a9c574779585137ff834a6108558a8956ef93022d4fcb37a8 category: dev optional: true - name: stevedore diff --git a/environments/conda-osx-64.lock.yml b/environments/conda-osx-64.lock.yml index c31e0a024d..832b19005b 100644 --- a/environments/conda-osx-64.lock.yml +++ b/environments/conda-osx-64.lock.yml @@ -421,7 +421,7 @@ dependencies: - rich=13.7.0=pyhd8ed1ab_0 - sqlalchemy=2.0.23=py311he705e18_0 - stack_data=0.6.2=pyhd8ed1ab_0 - - starlette=0.32.0.post1=pyhd8ed1ab_0 + - starlette=0.33.0=pyhd8ed1ab_0 - tiledb=2.16.3=hd3a41d5_3 - ukkonen=1.0.1=py311h5fe6e05_4 - uvicorn=0.24.0.post1=py311h6eed73b_0 diff --git a/environments/conda-osx-arm64.lock.yml b/environments/conda-osx-arm64.lock.yml index 36b5d249ca..e28348c9e8 100644 --- a/environments/conda-osx-arm64.lock.yml +++ b/environments/conda-osx-arm64.lock.yml @@ -421,7 +421,7 @@ dependencies: - rich=13.7.0=pyhd8ed1ab_0 - sqlalchemy=2.0.23=py311h05b510d_0 - stack_data=0.6.2=pyhd8ed1ab_0 - - starlette=0.32.0.post1=pyhd8ed1ab_0 + - starlette=0.33.0=pyhd8ed1ab_0 - tiledb=2.16.3=he15c4da_3 - ukkonen=1.0.1=py311he4fd1f5_4 - uvicorn=0.24.0.post1=py311h267d04e_0 From df8347f9d9d69a562d2ec8bb2a910e0aadd1d95e Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Fri, 1 Dec 2023 13:40:20 -0700 Subject: [PATCH 09/10] Move copy before publishing and wipe success file. --- docker/gcp_pudl_etl.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index b4cd7a108e..6a194be935 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -64,6 +64,7 @@ function shutdown_vm() { function copy_outputs_to_gcs() { echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT" gsutil -m cp -r $PUDL_OUTPUT ${PUDL_GCS_OUTPUT} + rm ${PUDL_OUTPUT}/success } function copy_outputs_to_distribution_bucket() { @@ -100,6 +101,8 @@ run_pudl_etl 2>&1 | tee $LOGFILE ETL_SUCCESS=${PIPESTATUS[0]} +copy_outputs_to_gcs + # if pipeline is successful, distribute + publish datasette if [[ $ETL_SUCCESS == 0 ]]; then # Deploy the updated data to datasette @@ -134,5 +137,4 @@ else notify_slack "failure" fi -copy_outputs_to_gcs shutdown_vm From a519e97d9eae30b7212cf073194731815adfd52f Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 1 Dec 2023 15:00:59 -0600 Subject: [PATCH 10/10] Explain some arcane bash magic with a comment. --- docker/gcp_pudl_etl.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index 6a194be935..8364a47a5a 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -2,6 +2,7 @@ # This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance. # This script won't work locally because it needs adequate GCP permissions. +# Set PUDL_GCS_OUTPUT *only* if it is currently unset : "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}" set -x