Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add automation for public build #13

Merged
merged 13 commits into from
Sep 28, 2024
Merged
17 changes: 17 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Dependabot configuration file
# <https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file>
#
# Each ecosystem is checked on a scheduled interval defined below. To trigger
# a check manually, go to
#
# https://github.com/nextstrain/wnv/network/updates
#
# and look for a "Check for updates" button. You may need to click around a
# bit first.
---
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
16 changes: 16 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: CI

on:
push:
branches:
- main
pull_request:
workflow_dispatch:
# Routinely check that we continue to work in the face of external changes.
schedule:
# Every day at 18:37 UTC / 10:37 Seattle (winter) / 11:37 Seattle (summer)
- cron: "37 18 * * *"

jobs:
ci:
uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
102 changes: 102 additions & 0 deletions .github/workflows/ingest-to-phylogenetic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name: Ingest to phylogenetic

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
schedule:
# Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings.
#
# Note the actual runs might be late.
# Numerous people were confused, about that, including me:
# - https://github.community/t/scheduled-action-running-consistently-late/138025/11
# - https://github.com/github/docs/issues/3059
#
# Note, '*' is a special character in YAML, so you have to quote this string.
#
# Docs:
# - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
#
# Tool that deciphers this particular format of crontab string:
# - https://crontab.guru/
#
# Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the East Coast.
# We were running into invalid zip archive errors at 9am PDT, so hoping an hour
# delay will lower the error frequency
- cron: '0 17 * * *'

workflow_dispatch:
inputs:
ingest_image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
phylogenetic_image:
description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
required: false

jobs:
ingest:
permissions:
id-token: write
uses: ./.github/workflows/ingest.yaml
secrets: inherit
with:
image: ${{ inputs.ingest_image }}

# Check if ingest results include new data by checking for the cache
# of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3)
# GitHub will remove any cache entries that have not been accessed in over 7 days,
# so if the workflow has not been run over 7 days then it will trigger phylogenetic.
check-new-data:
needs: [ingest]
runs-on: ubuntu-latest
outputs:
cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
steps:
- name: Get sha256sum
id: get-sha256sum
env:
AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
run: |
s3_urls=(
"s3://nextstrain-data/files/workflows/wnv/all/metadata.tsv.zst"
"s3://nextstrain-data/files/workflows/wnv/all/sequences.fasta.zst"
)

# Code below is modified from ingest/upload-to-s3
# https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29

no_hash=0000000000000000000000000000000000000000000000000000000000000000

for s3_url in "${s3_urls[@]}"; do
s3path="${s3_url#s3://}"
bucket="${s3path%%/*}"
key="${s3path#*/}"

s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
echo "${s3_hash}" | tee -a ingest-output-sha256sum
done

- name: Check cache
id: check-cache
uses: actions/cache@v4
with:
path: ingest-output-sha256sum
key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
lookup-only: true

phylogenetic:
needs: [check-new-data]
if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
permissions:
id-token: write
uses: ./.github/workflows/phylogenetic.yaml
secrets: inherit
with:
image: ${{ inputs.phylogenetic_image }}
80 changes: 80 additions & 0 deletions .github/workflows/ingest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Ingest

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
workflow_call:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string

workflow_dispatch:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string
trial_name:
description: |
Trial name for outputs.
If not set, outputs will overwrite files at s3://nextstrain-data/files/workflows/wnv/
If set, outputs will be uploaded to s3://nextstrain-data/files/workflows/wnv/trials/<trial_name>/
required: false
type: string

jobs:
set_config_overrides:
runs-on: ubuntu-latest
steps:
- id: config
name: Set config overrides
env:
TRIAL_NAME: ${{ inputs.trial_name }}
run: |
config=""
if [[ "$TRIAL_NAME" ]]; then
config+="--config"
config+=" s3_dst='s3://nextstrain-data/files/workflows/wnv/trials/"$TRIAL_NAME"'"
fi

echo "config=$config" >> "$GITHUB_OUTPUT"
outputs:
config_overrides: ${{ steps.config.outputs.config }}

ingest:
needs: [set_config_overrides]
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
env: |
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
run: |
nextstrain build \
ingest \
upload_all \
--configfile build-configs/nextstrain-automation/config.yaml \
$CONFIG_OVERRIDES
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: ingest-build-output
artifact-paths: |
ingest/results/
ingest/benchmarks/
ingest/logs/
ingest/.snakemake/log/
107 changes: 107 additions & 0 deletions .github/workflows/phylogenetic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
name: Phylogenetic

defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

on:
workflow_call:
inputs:
image:
description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
required: false
type: string

workflow_dispatch:
inputs:
image:
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
required: false
type: string
trial_name:
description: |
Trial name for deploying builds.
If not set, builds will overwrite existing builds at s3://nextstrain-data/wnv*
If set, builds will be deployed to s3://nextstrain-staging/wnv_trials_<trial_name>_*
required: false
type: string
sequences_url:
description: |
URL for the sequences.fasta.zst file
If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
required: false
type: string
metadata_url:
description: |
URL for the metadata.tsv.zst file
If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
required: false
type: string

jobs:
set_config_overrides:
runs-on: ubuntu-latest
steps:
- id: config
name: Set config overrides
env:
TRIAL_NAME: ${{ inputs.trial_name }}
SEQUENCES_URL: ${{ inputs.sequences_url }}
METADATA_URL: ${{ inputs.metadata_url }}
run: |
config=""

if [[ "$TRIAL_NAME" ]]; then
config+=" deploy_url='s3://nextstrain-staging/wnv_trials_"$TRIAL_NAME"_'"
fi

if [[ "$SEQUENCES_URL" ]]; then
config+=" sequences_url='"$SEQUENCES_URL"'"
fi

if [[ "$METADATA_URL" ]]; then
config+=" metadata_url='"$METADATA_URL"'"
fi

if [[ $config ]]; then
config="--config $config"
fi

echo "config=$config" >> "$GITHUB_OUTPUT"
outputs:
config_overrides: ${{ steps.config.outputs.config }}

phylogenetic:
needs: [set_config_overrides]
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
env: |
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
run: |
nextstrain build \
phylogenetic \
deploy_all \
--configfile build-configs/nextstrain-automation/config.yaml \
$CONFIG_OVERRIDES
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: phylogenetic-build-output
artifact-paths: |
phylogenetic/auspice/
phylogenetic/results/
phylogenetic/benchmarks/
phylogenetic/logs/
phylogenetic/.snakemake/log/
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ data/
results/
logs/
benchmarks/
*RAxML*

# Sensitive environment variables
environment*
Expand Down
8 changes: 2 additions & 6 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,12 @@ workdir: workflow.current_basedir
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),
sequences="results/sequences.fasta",
metadata="results/metadata.tsv",

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,9 @@ cloudfront_domain: "data.nextstrain.org"

# Nextstrain AWS S3 Bucket with pathogen prefix
# Replace <pathogen> with the pathogen repo name.
s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>"
s3_dst: "s3://nextstrain-data/files/workflows/wnv"

# Mapping of files to upload
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.zst: results/sequences.fasta
alignments.fasta.zst: results/alignment.fasta
translations.zip: results/translations.zip
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ rule upload_to_s3:
cloudfront_domain=config["cloudfront_domain"],
shell:
"""
./vendored/upload-to-s3 \
./scripts/upload-to-s3 \
{params.quiet} \
{input.file_to_upload:q} \
{params.s3_dst:q}/{wildcards.remote_file:q} \
Expand Down
Loading