nextstrain · j23414 · Sep 28, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,17 @@
+# Dependabot configuration file
+# <https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file>
+#
+# Each ecosystem is checked on a scheduled interval defined below.  To trigger
+# a check manually, go to
+#
+#   https://github.com/nextstrain/wnv/network/updates
+#
+# and look for a "Check for updates" button.  You may need to click around a
+# bit first.
+---
+# Version of the Dependabot configuration format.
+version: 2
+updates:
+  # Keep the actions referenced by this repo's workflows up to date.
+  - package-ecosystem: 'github-actions'
+    directory: '/'
+    schedule:
+      interval: 'weekly'
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -0,0 +1,16 @@
+name: CI
+
+on:
+  # Pushes to `main`.
+  push:
+    branches: [main]
+  # Any pull request, plus manually triggered runs.
+  pull_request:
+  workflow_dispatch:
+  # Scheduled runs catch breakage caused by changes outside this repo.
+  schedule:
+    # Daily at 18:37 UTC, i.e. 10:37 in Seattle during winter and 11:37
+    # during summer.
+    - cron: '37 18 * * *'
+
+jobs:
+  ci:
+    # Reusable CI workflow from the nextstrain/.github repo.
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,102 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # Shell used by every `run` step in this workflow.
+    #
+    # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  schedule:
+    # Cron times are in UTC, which is 1 or 2 hours behind CET depending on
+    # daylight saving time.
+    #
+    # Scheduled runs may start later than requested; that has confused
+    # numerous people, including me:
+    #  - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    #  - https://github.com/github/docs/issues/3059
+    #
+    # '*' is a special character in YAML, so the cron string must be quoted.
+    #
+    # Docs:
+    #  - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    #  - https://crontab.guru/
+    #
+    # Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the
+    # East Coast. We were running into invalid zip archive errors at 9am PDT,
+    # so we hope the one-hour delay lowers the error frequency.
+    - cron: "0 17 * * *"
+
+  # Manual runs may override the container image of either sub-workflow.
+  workflow_dispatch:
+    inputs:
+      ingest_image:
+        description: >-
+          Specific container image to use for ingest workflow
+          (will override the default of "nextstrain build")
+        required: false
+      phylogenetic_image:
+        description: >-
+          Specific container image to use for phylogenetic workflow
+          (will override the default of "nextstrain build")
+        required: false
+
+jobs:
+  ingest:
+    # Run this repo's ingest workflow as a reusable workflow, passing along
+    # the optional image override and this workflow's secrets.
+    uses: ./.github/workflows/ingest.yaml
+    with:
+      image: ${{ inputs.ingest_image }}
+    secrets: inherit
+    permissions:
+      id-token: write
+
+  # Check if ingest results include new data by checking for the cache
+  # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3)
+  # GitHub will remove any cache entries that have not been accessed in over 7 days,
+  # so if the workflow has not been run over 7 days then it will trigger phylogenetic.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      # 'true' when the cache step found an entry for the current hashes
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        env:
+          AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
+        run: |
+          # Keep these in sync with the ingest upload destination: `s3_dst`
+          # plus the remote names in `files_to_upload` from
+          # ingest/build-configs/nextstrain-automation/config.yaml.
+          s3_urls=(
+            "s3://nextstrain-data/files/workflows/wnv/metadata.tsv.zst"
+            "s3://nextstrain-data/files/workflows/wnv/sequences.fasta.zst"
+          )
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+
+          # Placeholder recorded when the head-object call fails
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            # Split "s3://<bucket>/<key>" into its bucket and key parts
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            # One hash per line; the hash of this file forms the cache key below
+            echo "${s3_hash}" | tee -a ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          # Only test whether an entry exists for this key; don't download it
+          lookup-only: true
+
+  phylogenetic:
+    # Rebuild only when check-new-data did not report a cache hit.
+    needs:
+      - check-new-data
+    if: needs.check-new-data.outputs.cache-hit != 'true'
+    uses: ./.github/workflows/phylogenetic.yaml
+    with:
+      image: ${{ inputs.phylogenetic_image }}
+    secrets: inherit
+    permissions:
+      id-token: write
diff --git a/.github/workflows/ingest.yaml b/.github/workflows/ingest.yaml
@@ -0,0 +1,80 @@
+name: Ingest
+
+defaults:
+  run:
+    # Shell used by every `run` step in this workflow.
+    #
+    # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  # Called from other workflows; only the container image can be overridden.
+  workflow_call:
+    inputs:
+      image:
+        description: >-
+          Specific container image to use for ingest workflow
+          (will override the default of "nextstrain build")
+        type: string
+        required: false
+
+  # Manual runs; a trial name redirects the uploaded outputs.
+  workflow_dispatch:
+    inputs:
+      image:
+        description: >-
+          Specific container image to use for ingest workflow
+          (will override the default of "nextstrain build")
+        type: string
+        required: false
+      trial_name:
+        description: |
+          Trial name for outputs.
+          If not set, outputs will overwrite files at s3://nextstrain-data/files/workflows/wnv/
+          If set, outputs will be uploaded to s3://nextstrain-data/files/workflows/wnv/trials/<trial_name>/
+        type: string
+        required: false
+
+jobs:
+  set_config_overrides:
+    runs-on: ubuntu-latest
+    outputs:
+      config_overrides: ${{ steps.config.outputs.config }}
+    steps:
+      - name: Set config overrides
+        id: config
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+        run: |
+          # Extra options for the build; empty unless a trial name was given.
+          config=""
+          if [[ -n "$TRIAL_NAME" ]]; then
+            # Send uploads to a per-trial prefix instead of the default location.
+            config="--config s3_dst='s3://nextstrain-data/files/workflows/wnv/trials/${TRIAL_NAME}'"
+          fi
+
+          printf 'config=%s\n' "$config" >> "$GITHUB_OUTPUT"
+
+  ingest:
+    needs:
+      - set_config_overrides
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    permissions:
+      id-token: write
+    with:
+      # Use the default docker runtime for now.  Moving to AWS Batch remains
+      # an option if we need more resources or a run exceeds the GH Action
+      # limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
+      run: |
+        nextstrain build \
+          ingest \
+            upload_all \
+            --configfile build-configs/nextstrain-automation/config.yaml \
+            $CONFIG_OVERRIDES
+      # A distinct artifact name keeps the ingest build outputs apart from
+      # the phylogenetic build outputs.
+      artifact-name: ingest-build-output
+      artifact-paths: |
+        ingest/results/
+        ingest/benchmarks/
+        ingest/logs/
+        ingest/.snakemake/log/
diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml
@@ -0,0 +1,107 @@
+name: Phylogenetic
+
+defaults:
+  run:
+    # Shell used by every `run` step in this workflow.
+    #
+    # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  # Called from other workflows; only the container image can be overridden.
+  workflow_call:
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+
+  # Manual runs; every input is optional.
+  workflow_dispatch:
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+      trial_name:
+        description: |
+          Trial name for deploying builds.
+          If not set, builds will overwrite existing builds at s3://nextstrain-data/wnv*
+          If set, builds will be deployed to s3://nextstrain-staging/wnv_trials_<trial_name>_*
+        required: false
+        type: string
+      sequences_url:
+        description: |
+          URL for the sequences.fasta.zst file
+          If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+      metadata_url:
+        description: |
+          URL for the metadata.tsv.zst file
+          If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+
+jobs:
+  set_config_overrides:
+    runs-on: ubuntu-latest
+    outputs:
+      config_overrides: ${{ steps.config.outputs.config }}
+    steps:
+      - name: Set config overrides
+        id: config
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+          SEQUENCES_URL: ${{ inputs.sequences_url }}
+          METADATA_URL: ${{ inputs.metadata_url }}
+        run: |
+          # Append one key='value' override for each input that was provided.
+          config=""
+
+          if [[ -n "$TRIAL_NAME" ]]; then
+            config="${config} deploy_url='s3://nextstrain-staging/wnv_trials_${TRIAL_NAME}_'"
+          fi
+
+          if [[ -n "$SEQUENCES_URL" ]]; then
+            config="${config} sequences_url='${SEQUENCES_URL}'"
+          fi
+
+          if [[ -n "$METADATA_URL" ]]; then
+            config="${config} metadata_url='${METADATA_URL}'"
+          fi
+
+          # Only emit the --config flag when there is something to override.
+          if [[ -n "$config" ]]; then
+            config="--config ${config}"
+          fi
+
+          printf 'config=%s\n' "$config" >> "$GITHUB_OUTPUT"
+
+  phylogenetic:
+    needs:
+      - set_config_overrides
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    permissions:
+      id-token: write
+    with:
+      # Use the default docker runtime for now.  Moving to AWS Batch remains
+      # an option if we need more resources or a run exceeds the GH Action
+      # limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml \
+            $CONFIG_OVERRIDES
+      # A distinct artifact name keeps the phylogenetic build outputs apart
+      # from the ingest build outputs.
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ data/
 results/
 logs/
 benchmarks/
+*RAxML*
 
 # Sensitive environment variables
 environment*

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -4,16 +4,12 @@ workdir: workflow.current_basedir
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "defaults/config.yaml"
 
-serotypes = ["all"]
-wildcard_constraints:
-    serotype = "|".join(serotypes)
-
 # This is the default rule that Snakemake will run when there are no specified targets.
 # The default output of the ingest workflow is usually the curated metadata and sequences.
 rule all:
     input:
-        sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
-        metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),
+        sequences="results/sequences.fasta",
+        metadata="results/metadata.tsv",
 
 # Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
 # If there are build-specific customizations, they should be added with the

diff --git a/ingest/build-configs/README.md → ...d-configs/nextstrain-automation/README.md b/ingest/build-configs/README.md → ...d-configs/nextstrain-automation/README.md
diff --git a/ingest/build-configs/config.yaml → ...configs/nextstrain-automation/config.yaml b/ingest/build-configs/config.yaml → ...configs/nextstrain-automation/config.yaml
@@ -12,12 +12,9 @@ cloudfront_domain: "data.nextstrain.org"
 
 # Nextstrain AWS S3 Bucket with pathogen prefix
 # Replace <pathogen> with the pathogen repo name.
-s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>"
+s3_dst: "s3://nextstrain-data/files/workflows/wnv"
 
 # Mapping of files to upload
 files_to_upload:
-  ncbi.ndjson.zst: data/ncbi.ndjson
   metadata.tsv.zst: results/metadata.tsv
   sequences.fasta.zst: results/sequences.fasta
-  alignments.fasta.zst: results/alignment.fasta
-  translations.zip: results/translations.zip
diff --git a/ingest/build-configs/upload.smk → ...-configs/nextstrain-automation/upload.smk b/ingest/build-configs/upload.smk → ...-configs/nextstrain-automation/upload.smk
@@ -29,7 +29,7 @@ rule upload_to_s3:
         cloudfront_domain=config["cloudfront_domain"],
     shell:
         """
-        ./vendored/upload-to-s3 \
+        ./scripts/upload-to-s3 \
             {params.quiet} \
             {input.file_to_upload:q} \
             {params.s3_dst:q}/{wildcards.remote_file:q} \