
Commit
Merge pull request #234 from nextstrain/update-ingest-to-template
Update ingest to pathogen-repo-template
joverlee521 authored Feb 6, 2024
2 parents 82a480a + edc904b commit d4eb28b
Showing 18 changed files with 136 additions and 98 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/fetch-and-ingest-branch.yaml
@@ -50,5 +50,6 @@ jobs:
             --env SLACK_TOKEN \
             --env SLACK_CHANNELS \
             ingest \
-            --configfiles config/config.yaml config/optional.yaml \
-            --config trigger_rebuild=False send_slack_notifications=True upload="$UPLOAD_CONFIG"
+            nextstrain_automation \
+            --configfiles build-configs/nextstrain-automation/config.yaml \
+            --config trigger_rebuild=False send_slack_notifications=True upload="$UPLOAD_CONFIG"
3 changes: 2 additions & 1 deletion .github/workflows/fetch-and-ingest.yaml
@@ -56,4 +56,5 @@ jobs:
             --env SLACK_CHANNELS \
             --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
             ingest \
-            --configfiles config/config.yaml config/optional.yaml
+            nextstrain_automation \
+            --configfiles build-configs/nextstrain-automation/config.yaml \
8 changes: 4 additions & 4 deletions ingest/README.md
@@ -31,7 +31,7 @@ This will produce two files (within the `ingest` directory):
 Run the complete ingest pipeline and upload results to AWS S3 with
 
 ```sh
-nextstrain build . --configfiles config/config.yaml config/optional.yaml
+nextstrain build . --configfiles build-configs/nextstrain-automation/config.yaml
 ```
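With the template layout, the Snakefile always loads `defaults/config.yaml` via its `configfile:` directive, and `--configfiles` layers the automation config on top, overriding keys where both define them. A minimal sketch of that layering, assuming paths relative to the `ingest` directory; the `overlay` helper here is illustrative only (Snakemake uses its own internal config merge):

```python
import yaml  # PyYAML


def overlay(base: dict, override: dict) -> dict:
    """Recursively lay `override` onto `base`, keeping base keys not overridden."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = overlay(merged[key], value)
        else:
            merged[key] = value
    return merged


with open("defaults/config.yaml") as f:  # loaded by the Snakefile's configfile: directive
    config = yaml.safe_load(f)

with open("build-configs/nextstrain-automation/config.yaml") as f:  # passed via --configfiles
    config = overlay(config, yaml.safe_load(f))

# The overlay contributes automation-only keys such as custom_rules and upload.
print(sorted(config))
```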

### Adding new sequences not from GenBank
@@ -57,12 +57,12 @@ Do the following to include sequences from static FASTA files.
    !ingest/data/{file-name}.ndjson
    ```
 
-3. Add the `file-name` (without the `.ndjson` extension) as a source to `ingest/config/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
+3. Add the `file-name` (without the `.ndjson` extension) as a source to `defaults/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
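For instance, dropping a hypothetical `ingest/data/local-lab.ndjson` into place and listing `local-lab` under `sources` would pull it in alongside GenBank. A rough sketch of the concatenation this implies (file layout assumed from step 2 above; this is not the pipeline's actual fetch code):

```python
import json

# Mirrors a hypothetical `sources: ['genbank', 'local-lab']` in defaults/config.yaml.
sources = ["genbank", "local-lab"]

records = []
for source in sources:
    # One JSON record per line, e.g. data/local-lab.ndjson from step 2.
    with open(f"data/{source}.ndjson") as f:
        records.extend(json.loads(line) for line in f if line.strip())

# Every record, whatever its source, then flows through the same curate rules.
print(f"{len(records)} records ready for curation")
```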

 ## Configuration
 
-Configuration takes place in `config/config.yaml` by default.
-Optional configs for uploading files and Slack notifications are in `config/optional.yaml`.
+Configuration takes place in `defaults/config.yaml` by default.
+Optional configs for uploading files and Slack notifications are in `build-configs/nextstrain-automation/config.yaml`.

### Environment Variables

70 changes: 11 additions & 59 deletions ingest/Snakefile
@@ -4,72 +4,24 @@
 min_version(
     "7.7.0"
 )  # Snakemake 7.7.0 introduced `retries` directive used in fetch-sequences

-if not config:
-
-    configfile: "config/config.yaml"
-
-
-send_slack_notifications = config.get("send_slack_notifications", False)
-
-
-def _get_all_targets(wildcards):
-    # Default targets are the metadata TSV and sequences FASTA files
-    all_targets = ["results/sequences.fasta", "results/metadata.tsv"]
-
-    # Add additional targets based on upload config
-    upload_config = config.get("upload", {})
-
-    for target, params in upload_config.items():
-        files_to_upload = params.get("files_to_upload", {})
-
-        if not params.get("dst"):
-            print(
-                f"Skipping file upload for {target!r} because the destination was not defined."
-            )
-        else:
-            all_targets.extend(
-                expand(
-                    [f"data/upload/{target}/{{remote_file_name}}.done"],
-                    zip,
-                    remote_file_name=files_to_upload.keys(),
-                )
-            )
-
-    # Add additional targets for Nextstrain's internal Slack notifications
-    if send_slack_notifications:
-        all_targets.extend(
-            [
-                "data/notify/genbank-record-change.done",
-                "data/notify/metadata-diff.done",
-            ]
-        )
-
-    if config.get("trigger_rebuild", False):
-        all_targets.append("data/trigger/rebuild.done")
-
-    return all_targets
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "defaults/config.yaml"


+# This is the default rule that Snakemake will run when there are no specified targets.
 rule all:
     input:
-        _get_all_targets,
+        "results/sequences.fasta",
+        "results/metadata.tsv",


-include: "workflow/snakemake_rules/fetch_sequences.smk"
-include: "workflow/snakemake_rules/transform.smk"
-include: "workflow/snakemake_rules/nextclade.smk"
+include: "rules/fetch_from_ncbi.smk"
+include: "rules/curate.smk"
+include: "rules/nextclade.smk"


-if config.get("upload", False):
-
-    include: "workflow/snakemake_rules/upload.smk"
-
-
-if send_slack_notifications:
-
-    include: "workflow/snakemake_rules/slack_notifications.smk"
-
-
-if config.get("trigger_rebuild", False):
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:

-    include: "workflow/snakemake_rules/trigger_rebuild.smk"
+        include: rule_file
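Worth noting how the pieces now fit together: `rule all` only asks for the two base files, while upload, notification, and rebuild targets move behind the `nextstrain_automation` rule defined in the included custom rules file (see its diff below). The closing `custom_rules` loop is plain Python that Snakemake executes at parse time; a standalone sketch of its effect, with the Snakemake-only `include:` statement replaced by a `print` so the snippet runs anywhere:

```python
# Sketch of the Snakefile's custom_rules hook, runnable outside Snakemake.
# After --configfiles layers the automation overlay onto defaults/config.yaml,
# the merged config carries this key (value taken from this PR's overlay config):
config = {
    "custom_rules": [
        "build-configs/nextstrain-automation/nextstrain_automation.smk",
    ],
}

if "custom_rules" in config:
    for rule_file in config["custom_rules"]:
        # The real Snakefile does `include: rule_file` here, which parses the
        # file and adds its rules (e.g. nextstrain_automation) to the workflow.
        print(f"would include: {rule_file}")
```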
ingest/config/optional.yaml → ingest/build-configs/nextstrain-automation/config.yaml
@@ -1,4 +1,10 @@
-# Optional configs used by Nextstrain team
+# Default configs used by Nextstrain automation
 
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - build-configs/nextstrain-automation/nextstrain_automation.smk
+
 # Params for uploads
 upload:
   # Upload params for AWS S3
ingest/build-configs/nextstrain-automation/nextstrain_automation.smk (new file)
@@ -0,0 +1,67 @@
"""
These custom rules handle the automation for Nextstrain builds that
include
- Slack notifications
- Uploads to AWS S3
- Triggering downstream workflows
"""

send_slack_notifications = config.get("send_slack_notifications", False)


def _get_all_targets(wildcards):
    # Default targets are the metadata TSV and sequences FASTA files
    all_targets = ["results/sequences.fasta", "results/metadata.tsv"]

    # Add additional targets based on upload config
    upload_config = config.get("upload", {})

    for target, params in upload_config.items():
        files_to_upload = params.get("files_to_upload", {})

        if not params.get("dst"):
            print(
                f"Skipping file upload for {target!r} because the destination was not defined."
            )
        else:
            all_targets.extend(
                expand(
                    [f"data/upload/{target}/{{remote_file_name}}.done"],
                    zip,
                    remote_file_name=files_to_upload.keys(),
                )
            )

    # Add additional targets for Nextstrain's internal Slack notifications
    if send_slack_notifications:
        all_targets.extend(
            [
                "data/notify/genbank-record-change.done",
                "data/notify/metadata-diff.done",
            ]
        )

    if config.get("trigger_rebuild", False):
        all_targets.append("data/trigger/rebuild.done")

    return all_targets


rule nextstrain_automation:
    input:
        _get_all_targets,


if config.get("upload", False):

    include: "upload.smk"


if send_slack_notifications:

    include: "slack_notifications.smk"


if config.get("trigger_rebuild", False):

    include: "trigger_rebuild.smk"
File renamed without changes.
File renamed without changes.
18 changes: 11 additions & 7 deletions ingest/config/config.yaml → ingest/defaults/config.yaml
@@ -3,14 +3,18 @@ sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '10244'
 # Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+ncbi_field_map: 'defaults/ncbi-dataset-field-map.tsv'
 
-# Params for the transform rule
-transform:
+# Params for the curate rule
+curate:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map:
+    collected: date
+    submitted: date_submitted
+    genbank_accession: accession
+    submitting_organization: institution
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
@@ -43,9 +47,9 @@ transform:
   geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
   # Local geolocation rules that are only applicable to mpox data
   # Local rules can overwrite the general geolocation rules provided above
-  local_geolocation_rules: 'source-data/geolocation-rules.tsv'
+  local_geolocation_rules: 'defaults/geolocation-rules.tsv'
   # User annotations file
-  annotations: 'source-data/annotations.tsv'
+  annotations: 'defaults/annotations.tsv'
   # ID field used to merge annotations
   annotations_id: 'accession'
   # Field to use as the sequence ID in the FASTA file
@@ -76,4 +80,4 @@ nextclade:
   # Field to use as the sequence ID in the Nextclade file
   id_field: 'seqName'
   # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file
-  field_map: 'source-data/nextclade-field-map.tsv'
+  field_map: 'defaults/nextclade-field-map.tsv'
File renamed without changes.
File renamed without changes.
File renamed without changes.
ingest/workflow/snakemake_rules/transform.smk → ingest/rules/curate.smk
@@ -1,5 +1,5 @@
 """
-This part of the workflow handles transforming the data into standardized
+This part of the workflow handles curating the data into standardized
 formats and expects input file
     sequences_ndjson = "data/sequences.ndjson"
@@ -9,15 +9,15 @@ This will produce output files as
     metadata = "data/metadata_raw.tsv"
     sequences = "results/sequences.fasta"

-Parameters are expected to be defined in `config.transform`.
+Parameters are expected to be defined in `config.curate`.
 """


 rule fetch_general_geolocation_rules:
     output:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
     params:
-        geolocation_rules_url=config["transform"]["geolocation_rules_url"],
+        geolocation_rules_url=config["curate"]["geolocation_rules_url"],
     shell:
         """
         curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
@@ -27,7 +27,7 @@ rule fetch_general_geolocation_rules:
 rule concat_geolocation_rules:
     input:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
-        local_geolocation_rules=config["transform"]["local_geolocation_rules"],
+        local_geolocation_rules=config["curate"]["local_geolocation_rules"],
     output:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
     shell:
@@ -36,32 +36,39 @@ rule concat_geolocation_rules:
         """


-rule transform:
+def format_field_map(field_map: dict[str, str]) -> str:
+    """
+    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.
+    """
+    return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])
+
+
+rule curate:
     input:
         sequences_ndjson="data/sequences.ndjson",
         all_geolocation_rules="data/all-geolocation-rules.tsv",
-        annotations=config["transform"]["annotations"],
+        annotations=config["curate"]["annotations"],
     output:
         metadata="data/metadata_raw.tsv",
         sequences="results/sequences.fasta",
     log:
-        "logs/transform.txt",
+        "logs/curate.txt",
     params:
-        field_map=config["transform"]["field_map"],
-        strain_regex=config["transform"]["strain_regex"],
-        strain_backup_fields=config["transform"]["strain_backup_fields"],
-        date_fields=config["transform"]["date_fields"],
-        expected_date_formats=config["transform"]["expected_date_formats"],
-        articles=config["transform"]["titlecase"]["articles"],
-        abbreviations=config["transform"]["titlecase"]["abbreviations"],
-        titlecase_fields=config["transform"]["titlecase"]["fields"],
-        authors_field=config["transform"]["authors_field"],
-        authors_default_value=config["transform"]["authors_default_value"],
-        abbr_authors_field=config["transform"]["abbr_authors_field"],
-        annotations_id=config["transform"]["annotations_id"],
-        metadata_columns=config["transform"]["metadata_columns"],
-        id_field=config["transform"]["id_field"],
-        sequence_field=config["transform"]["sequence_field"],
+        field_map=format_field_map(config["curate"]["field_map"]),
+        strain_regex=config["curate"]["strain_regex"],
+        strain_backup_fields=config["curate"]["strain_backup_fields"],
+        date_fields=config["curate"]["date_fields"],
+        expected_date_formats=config["curate"]["expected_date_formats"],
+        articles=config["curate"]["titlecase"]["articles"],
+        abbreviations=config["curate"]["titlecase"]["abbreviations"],
+        titlecase_fields=config["curate"]["titlecase"]["fields"],
+        authors_field=config["curate"]["authors_field"],
+        authors_default_value=config["curate"]["authors_default_value"],
+        abbr_authors_field=config["curate"]["abbr_authors_field"],
+        annotations_id=config["curate"]["annotations_id"],
+        metadata_columns=config["curate"]["metadata_columns"],
+        id_field=config["curate"]["id_field"],
+        sequence_field=config["curate"]["sequence_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
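The new `format_field_map` helper exists because `field_map` changed from a list of `old=new` strings to a YAML mapping (see the `defaults/config.yaml` diff above), while the shell command still interpolates it as space-separated `"key"="value"` pairs, per the helper's docstring. A quick check of what it produces for this PR's field map:

```python
def format_field_map(field_map: dict[str, str]) -> str:
    """Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands."""
    return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])


# The field_map from defaults/config.yaml in this PR:
field_map = {
    "collected": "date",
    "submitted": "date_submitted",
    "genbank_accession": "accession",
    "submitting_organization": "institution",
}

print(format_field_map(field_map))
# "collected"="date" "submitted"="date_submitted" "genbank_accession"="accession" "submitting_organization"="institution"
```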
File renamed without changes.
ingest/workflow/snakemake_rules/nextclade.smk → ingest/rules/nextclade.smk
@@ -60,7 +60,7 @@ rule join_metadata_clades:
     output:
         metadata="results/metadata.tsv",
     params:
-        id_field=config["transform"]["id_field"],
+        id_field=config["curate"]["id_field"],
         nextclade_id_field=config["nextclade"]["id_field"],
     shell:
         """
2 changes: 1 addition & 1 deletion phylogenetic/config/description.md
@@ -18,7 +18,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpo
 
 #### Underlying data
 We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/),
-using an NCBI Taxonomy ID defined in [ingest/config/config.yaml](https://github.com/nextstrain/mpox/blob/master/ingest/config/config.yaml), as starting point for these analyses.
+using an NCBI Taxonomy ID defined in [ingest/defaults/config.yaml](https://github.com/nextstrain/mpox/blob/master/ingest/defaults/config.yaml), as starting point for these analyses.
 Curated sequences and metadata are available as flat files at:
 - [data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz)
 - [data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz)
