Skip to content

Commit

Permalink
Improve flexibility for publishing options (#2964)
Browse files Browse the repository at this point in the history
Improve flexibility for publishing options

Merging this in today so we can see how it goes with tonight's build, and so I can integrate the changes with #3086.
  • Loading branch information
rousik authored Dec 3, 2023
1 parent 6f284f3 commit 85c2ea4
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 8 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ env:
GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule
GCE_INSTANCE: pudl-deployment-tag # This is changed to pudl-deployment-dev if running on a schedule
GCE_INSTANCE_ZONE: ${{ secrets.GCE_INSTANCE_ZONE }}
GCS_OUTPUT_BUCKET: gs://nightly-build-outputs.catalyst.coop

jobs:
build_and_deploy_pudl:
Expand All @@ -34,6 +35,7 @@ jobs:
# Record the full and abbreviated commit hashes of the checked-out HEAD in
# $GITHUB_ENV as ACTION_SHA and SHORT_SHA.
- name: Get HEAD of the branch (main or dev)
run: |
echo "ACTION_SHA=$(git rev-parse HEAD)" >> $GITHUB_ENV
echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
- name: Print action vars
run: |
Expand Down Expand Up @@ -83,6 +85,11 @@ jobs:
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v1

# Record the checked-out branch name and the commit timestamp of HEAD
# (formatted YYYY-MM-DD-HHMM) in $GITHUB_ENV as COMMIT_BRANCH and COMMIT_TIME.
# Both values are used to build the PUDL_GCS_OUTPUT path and the Slack message.
- name: Determine commit information
  run: |-
    echo "COMMIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)" >> "$GITHUB_ENV"
    echo "COMMIT_TIME=$(git log -1 --format=%cd --date=format:%Y-%m-%d-%H%M)" >> "$GITHUB_ENV"
# Deploy PUDL image to GCE
- name: Deploy
env:
Expand Down Expand Up @@ -119,6 +126,7 @@ jobs:
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
--container-env PUDL_SETTINGS_YML="/home/mambauser/src/pudl/package_data/settings/etl_full.yml" \
--container-env PUDL_GCS_OUTPUT=${{ env.GCS_OUTPUT_BUCKET }}/${{ env.COMMIT_TIME }}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }}
# Start the VM
- name: Start the deploy-pudl-vm
Expand All @@ -129,6 +137,6 @@ jobs:
uses: slackapi/[email protected]
with:
channel-id: "C03FHB9N0PQ"
slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.ACTION_SHA }}-${{ env.GITHUB_REF }}"
slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.COMMIT_TIME}}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.PUDL_DEPLOY_SLACK_TOKEN }}
28 changes: 21 additions & 7 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
# This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance.
# This script won't work locally because it needs adequate GCP permissions.

# Set PUDL_GCS_OUTPUT *only* if it is currently unset or empty
: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}"

set -x

function send_slack_msg() {
Expand All @@ -27,26 +30,26 @@ function run_pudl_etl() {
--loglevel DEBUG \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
--workers 8 \
$PUDL_SETTINGS_YML && \
pudl_etl \
$PUDL_SETTINGS_YML \
&& pudl_etl \
--loglevel DEBUG \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
$PUDL_SETTINGS_YML && \
pytest \
$PUDL_SETTINGS_YML \
&& pytest \
-n auto \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
--etl-settings $PUDL_SETTINGS_YML \
--live-dbs test/integration test/unit && \
pytest \
--live-dbs test/integration test/unit \
&& pytest \
-n auto \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
--etl-settings $PUDL_SETTINGS_YML \
--live-dbs test/validate \
&& touch ${PUDL_OUTPUT}/success
}

function shutdown_vm() {
# Copy the outputs to the GCS bucket
gsutil -m cp -r $PUDL_OUTPUT "gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF"

upload_file_to_slack $LOGFILE "pudl_etl logs for $ACTION_SHA-$GITHUB_REF:"

Expand All @@ -59,6 +62,12 @@ function shutdown_vm() {
curl -X POST -H "Content-Length: 0" -H "Authorization: Bearer ${ACCESS_TOKEN}" https://compute.googleapis.com/compute/v1/projects/catalyst-cooperative-pudl/zones/$GCE_INSTANCE_ZONE/instances/$GCE_INSTANCE/stop
}

function copy_outputs_to_gcs() {
    # Upload the PUDL output directory to the GCS location in PUDL_GCS_OUTPUT,
    # then delete the local "success" marker file.
    #
    # Globals:
    #   PUDL_OUTPUT      (read) local directory holding the build outputs
    #   PUDL_GCS_OUTPUT  (read) destination gs:// URL
    # Returns:
    #   The exit status of the gsutil upload (0 on success).
    local upload_status=0

    echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT"

    # Quote both paths so whitespace or glob characters cannot split or expand
    # them into extra arguments.
    gsutil -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT" || upload_status=$?

    # The marker is absent whenever the ETL failed, so use -f to avoid a
    # spurious error in that case.
    # NOTE(review): presumably the marker is removed so that later copies of
    # "$PUDL_OUTPUT/*" do not publish it -- confirm.
    rm -f -- "${PUDL_OUTPUT}/success"

    # Report the upload result rather than rm's, so a failed copy is visible
    # to callers that check the status.
    return "$upload_status"
}

function copy_outputs_to_distribution_bucket() {
echo "Copying outputs to GCP distribution bucket"
gsutil -m -u $GCP_BILLING_PROJECT cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$GITHUB_REF"
Expand Down Expand Up @@ -93,6 +102,8 @@ run_pudl_etl 2>&1 | tee $LOGFILE

ETL_SUCCESS=${PIPESTATUS[0]}

copy_outputs_to_gcs

# if pipeline is successful, distribute + publish datasette
if [[ $ETL_SUCCESS == 0 ]]; then
# Deploy the updated data to datasette
Expand All @@ -109,6 +120,9 @@ if [[ $ETL_SUCCESS == 0 ]]; then
ETL_SUCCESS=${PIPESTATUS[0]}

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
# TODO: this behavior should be controlled by an on/off switch here, and the
# decision logic should be moved to the triggering GitHub action. Having it
# here feels fragmented.
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
ETL_SUCCESS=${PIPESTATUS[0]}
Expand Down

0 comments on commit 85c2ea4

Please sign in to comment.