diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml
index e6a6ab38c2..b1b35d0e56 100644
--- a/.github/workflows/build-deploy-pudl.yml
+++ b/.github/workflows/build-deploy-pudl.yml
@@ -12,6 +12,7 @@ env:
   GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule
   GCE_INSTANCE: pudl-deployment-tag # This is changed to pudl-deployment-dev if running on a schedule
   GCE_INSTANCE_ZONE: ${{ secrets.GCE_INSTANCE_ZONE }}
+  GCS_OUTPUT_BUCKET: gs://nightly-build-outputs.catalyst.coop
 
 jobs:
   build_and_deploy_pudl:
@@ -34,6 +35,7 @@ jobs:
       - name: Get HEAD of the branch (main or dev)
         run: |
           echo "ACTION_SHA=$(git rev-parse HEAD)" >> $GITHUB_ENV
+          echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
 
       - name: Print action vars
         run: |
@@ -83,6 +85,13 @@ jobs:
       - name: Set up Cloud SDK
         uses: google-github-actions/setup-gcloud@v1
 
+      # COMMIT_BRANCH and COMMIT_TIME are combined with SHORT_SHA to name this run's GCS output path.
+      # NOTE(review): on a detached HEAD (e.g. a tag checkout) --abbrev-ref prints "HEAD" - confirm.
+      - name: Determine commit information
+        run: |-
+          echo "COMMIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_ENV
+          echo "COMMIT_TIME=$(git log -1 --format=%cd --date=format:%Y-%m-%d-%H%M)" >> $GITHUB_ENV
+
       # Deploy PUDL image to GCE
       - name: Deploy
         env:
@@ -119,6 +128,7 @@ jobs:
             --container-env DAGSTER_PG_DB="dagster-storage" \
             --container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
             --container-env PUDL_SETTINGS_YML="/home/mambauser/src/pudl/package_data/settings/etl_full.yml" \
+            --container-env PUDL_GCS_OUTPUT="${{ env.GCS_OUTPUT_BUCKET }}/${{ env.COMMIT_TIME }}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }}"
 
       # Start the VM
       - name: Start the deploy-pudl-vm
@@ -129,6 +139,6 @@ jobs:
         uses: slackapi/slack-github-action@v1.24.0
         with:
           channel-id: "C03FHB9N0PQ"
-          slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.ACTION_SHA }}-${{ env.GITHUB_REF }}"
+          slack-message: "build-deploy-pudl status: ${{ job.status }}\n${{ env.COMMIT_TIME }}-${{ env.SHORT_SHA }}-${{ env.COMMIT_BRANCH }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.PUDL_DEPLOY_SLACK_TOKEN }}
diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index 84e46b4b24..8364a47a5a 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -2,6 +2,9 @@
 # This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance.
 # This script won't work locally because it needs adequate GCP permissions.
 
+# Default PUDL_GCS_OUTPUT *only* if it is currently unset or empty
+: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF}"
+
 set -x
 
 function send_slack_msg() {
@@ -27,26 +30,25 @@ function run_pudl_etl() {
         --loglevel DEBUG \
         --gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
         --workers 8 \
-        $PUDL_SETTINGS_YML && \
-    pudl_etl \
+        $PUDL_SETTINGS_YML \
+    && pudl_etl \
         --loglevel DEBUG \
         --gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
-        $PUDL_SETTINGS_YML && \
-    pytest \
+        $PUDL_SETTINGS_YML \
+    && pytest \
         -n auto \
         --gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
         --etl-settings $PUDL_SETTINGS_YML \
-        --live-dbs test/integration test/unit && \
-    pytest \
+        --live-dbs test/integration test/unit \
+    && pytest \
         -n auto \
         --gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
         --etl-settings $PUDL_SETTINGS_YML \
-        --live-dbs test/validate
+        --live-dbs test/validate \
+    && touch "${PUDL_OUTPUT}/success"
 }
 
 function shutdown_vm() {
-    # Copy the outputs to the GCS bucket
-    gsutil -m cp -r $PUDL_OUTPUT "gs://nightly-build-outputs.catalyst.coop/$ACTION_SHA-$GITHUB_REF"
 
     upload_file_to_slack $LOGFILE "pudl_etl logs for $ACTION_SHA-$GITHUB_REF:"
 
@@ -59,6 +61,14 @@ function shutdown_vm() {
     curl -X POST -H "Content-Length: 0" -H "Authorization: Bearer ${ACCESS_TOKEN}" https://compute.googleapis.com/compute/v1/projects/catalyst-cooperative-pudl/zones/$GCE_INSTANCE_ZONE/instances/$GCE_INSTANCE/stop
 }
 
+function copy_outputs_to_gcs() {
+    # Recursively copy $PUDL_OUTPUT to $PUDL_GCS_OUTPUT, then drop the local success marker.
+    echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT"
+    gsutil -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT"
+    # The marker is only created when every ETL/test stage passed; -f keeps failed runs quiet.
+    rm -f "${PUDL_OUTPUT}/success"
+}
+
 function copy_outputs_to_distribution_bucket() {
     echo "Copying outputs to GCP distribution bucket"
     gsutil -m -u $GCP_BILLING_PROJECT cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$GITHUB_REF"
@@ -93,6 +103,9 @@ run_pudl_etl 2>&1 | tee $LOGFILE
 
 ETL_SUCCESS=${PIPESTATUS[0]}
 
+# Always upload outputs to GCS, even when the ETL failed
+copy_outputs_to_gcs
+
 # if pipeline is successful, distribute + publish datasette
 if [[ $ETL_SUCCESS == 0 ]]; then
     # Deploy the updated data to datasette
@@ -109,6 +122,9 @@ if [[ $ETL_SUCCESS == 0 ]]; then
     ETL_SUCCESS=${PIPESTATUS[0]}
 
     # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
+    # TODO: this behavior should be controlled by an on/off switch here, and the
+    # trigger/branch logic should move to the triggering GitHub Action. Having it
+    # here feels fragmented.
     if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
         copy_outputs_to_distribution_bucket
         ETL_SUCCESS=${PIPESTATUS[0]}