
Commit
Merge pull request #234 from nextstrain/update-ingest-to-template
Update ingest to pathogen-repo-template
joverlee521 authored Feb 6, 2024
2 parents 82a480a + edc904b commit d4eb28b
Showing 18 changed files with 136 additions and 98 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/fetch-and-ingest-branch.yaml
@@ -50,5 +50,6 @@ jobs:
             --env SLACK_TOKEN \
             --env SLACK_CHANNELS \
             ingest \
-            --configfiles config/config.yaml config/optional.yaml \
-            --config trigger_rebuild=False send_slack_notifications=True upload="$UPLOAD_CONFIG"
+            nextstrain_automation \
+            --configfiles build-configs/nextstrain-automation/config.yaml \
+            --config trigger_rebuild=False send_slack_notifications=True upload="$UPLOAD_CONFIG"
3 changes: 2 additions & 1 deletion .github/workflows/fetch-and-ingest.yaml
@@ -56,4 +56,5 @@ jobs:
             --env SLACK_CHANNELS \
             --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
             ingest \
-            --configfiles config/config.yaml config/optional.yaml
+            nextstrain_automation \
+            --configfiles build-configs/nextstrain-automation/config.yaml \
8 changes: 4 additions & 4 deletions ingest/README.md
@@ -31,7 +31,7 @@ This will produce two files (within the `ingest` directory):
 Run the complete ingest pipeline and upload results to AWS S3 with
 
 ```sh
-nextstrain build . --configfiles config/config.yaml config/optional.yaml
+nextstrain build . --configfiles build-configs/nextstrain-automation/config.yaml
 ```
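With the template layout, the Snakefile always loads `defaults/config.yaml` via its `configfile:` directive, and `--configfiles` layers the automation config on top, overriding keys where both define them. A minimal sketch of that layering, assuming paths relative to the `ingest` directory; the `overlay` helper here is illustrative only (Snakemake uses its own internal config merge):

```python
import yaml  # PyYAML


def overlay(base: dict, override: dict) -> dict:
    """Recursively lay `override` onto `base`, keeping base keys not overridden."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = overlay(merged[key], value)
        else:
            merged[key] = value
    return merged


with open("defaults/config.yaml") as f:  # loaded by the Snakefile's configfile: directive
    config = yaml.safe_load(f)

with open("build-configs/nextstrain-automation/config.yaml") as f:  # passed via --configfiles
    config = overlay(config, yaml.safe_load(f))

# The overlay contributes automation-only keys such as custom_rules and upload.
print(sorted(config))
```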

### Adding new sequences not from GenBank
@@ -57,12 +57,12 @@ Do the following to include sequences from static FASTA files.
    !ingest/data/{file-name}.ndjson
    ```
 
-3. Add the `file-name` (without the `.ndjson` extension) as a source to `ingest/config/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
+3. Add the `file-name` (without the `.ndjson` extension) as a source to `defaults/config.yaml`. This will tell the ingest pipeline to concatenate the records to the GenBank sequences and run them through the same transform pipeline.
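For instance, dropping a hypothetical `ingest/data/local-lab.ndjson` into place and listing `local-lab` under `sources` would pull it in alongside GenBank. A rough sketch of the concatenation this implies (file layout assumed from step 2 above; this is not the pipeline's actual fetch code):

```python
import json

# Mirrors a hypothetical `sources: ['genbank', 'local-lab']` in defaults/config.yaml.
sources = ["genbank", "local-lab"]

records = []
for source in sources:
    # One JSON record per line, e.g. data/local-lab.ndjson from step 2.
    with open(f"data/{source}.ndjson") as f:
        records.extend(json.loads(line) for line in f if line.strip())

# Every record, whatever its source, then flows through the same curate rules.
print(f"{len(records)} records ready for curation")
```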

 ## Configuration
 
-Configuration takes place in `config/config.yaml` by default.
-Optional configs for uploading files and Slack notifications are in `config/optional.yaml`.
+Configuration takes place in `defaults/config.yaml` by default.
+Optional configs for uploading files and Slack notifications are in `build-configs/nextstrain-automation/config.yaml`.

### Environment Variables

70 changes: 11 additions & 59 deletions ingest/Snakefile
@@ -4,72 +4,24 @@
 min_version(
     "7.7.0"
 )  # Snakemake 7.7.0 introduced `retries` directive used in fetch-sequences

-if not config:
-
-    configfile: "config/config.yaml"
-
-
-send_slack_notifications = config.get("send_slack_notifications", False)
-
-
-def _get_all_targets(wildcards):
-    # Default targets are the metadata TSV and sequences FASTA files
-    all_targets = ["results/sequences.fasta", "results/metadata.tsv"]
-
-    # Add additional targets based on upload config
-    upload_config = config.get("upload", {})
-
-    for target, params in upload_config.items():
-        files_to_upload = params.get("files_to_upload", {})
-
-        if not params.get("dst"):
-            print(
-                f"Skipping file upload for {target!r} because the destination was not defined."
-            )
-        else:
-            all_targets.extend(
-                expand(
-                    [f"data/upload/{target}/{{remote_file_name}}.done"],
-                    zip,
-                    remote_file_name=files_to_upload.keys(),
-                )
-            )
-
-    # Add additional targets for Nextstrain's internal Slack notifications
-    if send_slack_notifications:
-        all_targets.extend(
-            [
-                "data/notify/genbank-record-change.done",
-                "data/notify/metadata-diff.done",
-            ]
-        )
-
-    if config.get("trigger_rebuild", False):
-        all_targets.append("data/trigger/rebuild.done")
-
-    return all_targets
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "defaults/config.yaml"


+# This is the default rule that Snakemake will run when there are no specified targets.
 rule all:
     input:
-        _get_all_targets,
+        "results/sequences.fasta",
+        "results/metadata.tsv",


-include: "workflow/snakemake_rules/fetch_sequences.smk"
-include: "workflow/snakemake_rules/transform.smk"
-include: "workflow/snakemake_rules/nextclade.smk"
+include: "rules/fetch_from_ncbi.smk"
+include: "rules/curate.smk"
+include: "rules/nextclade.smk"


-if config.get("upload", False):
-
-    include: "workflow/snakemake_rules/upload.smk"
-
-
-if send_slack_notifications:
-
-    include: "workflow/snakemake_rules/slack_notifications.smk"
-
-
-if config.get("trigger_rebuild", False):
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:

-    include: "workflow/snakemake_rules/trigger_rebuild.smk"
+        include: rule_file
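Worth noting how the pieces now fit together: `rule all` only asks for the two base files, while upload, notification, and rebuild targets move behind the `nextstrain_automation` rule defined in the included custom rules file (see its diff below). The closing `custom_rules` loop is plain Python that Snakemake executes at parse time; a standalone sketch of its effect, with the Snakemake-only `include:` statement replaced by a `print` so the snippet runs anywhere:

```python
# Sketch of the Snakefile's custom_rules hook, runnable outside Snakemake.
# After --configfiles layers the automation overlay onto defaults/config.yaml,
# the merged config carries this key (value taken from this PR's overlay config):
config = {
    "custom_rules": [
        "build-configs/nextstrain-automation/nextstrain_automation.smk",
    ],
}

if "custom_rules" in config:
    for rule_file in config["custom_rules"]:
        # The real Snakefile does `include: rule_file` here, which parses the
        # file and adds its rules (e.g. nextstrain_automation) to the workflow.
        print(f"would include: {rule_file}")
```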
ingest/config/optional.yaml → ingest/build-configs/nextstrain-automation/config.yaml
@@ -1,4 +1,10 @@
-# Optional configs used by Nextstrain team
+# Default configs used by Nextstrain automation
 
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - build-configs/nextstrain-automation/nextstrain_automation.smk
+
 # Params for uploads
 upload:
   # Upload params for AWS S3
ingest/build-configs/nextstrain-automation/nextstrain_automation.smk (new file)
@@ -0,0 +1,67 @@
"""
These custom rules handle the automation for Nextstrain builds that
include
- Slack notifications
- Uploads to AWS S3
- Triggering downstream workflows
"""

send_slack_notifications = config.get("send_slack_notifications", False)


def _get_all_targets(wildcards):
    # Default targets are the metadata TSV and sequences FASTA files
    all_targets = ["results/sequences.fasta", "results/metadata.tsv"]

    # Add additional targets based on upload config
    upload_config = config.get("upload", {})

    for target, params in upload_config.items():
        files_to_upload = params.get("files_to_upload", {})

        if not params.get("dst"):
            print(
                f"Skipping file upload for {target!r} because the destination was not defined."
            )
        else:
            all_targets.extend(
                expand(
                    [f"data/upload/{target}/{{remote_file_name}}.done"],
                    zip,
                    remote_file_name=files_to_upload.keys(),
                )
            )

    # Add additional targets for Nextstrain's internal Slack notifications
    if send_slack_notifications:
        all_targets.extend(
            [
                "data/notify/genbank-record-change.done",
                "data/notify/metadata-diff.done",
            ]
        )

    if config.get("trigger_rebuild", False):
        all_targets.append("data/trigger/rebuild.done")

    return all_targets


rule nextstrain_automation:
    input:
        _get_all_targets,


if config.get("upload", False):

    include: "upload.smk"


if send_slack_notifications:

    include: "slack_notifications.smk"


if config.get("trigger_rebuild", False):

    include: "trigger_rebuild.smk"
File renamed without changes.
File renamed without changes.
18 changes: 11 additions & 7 deletions ingest/config/config.yaml → ingest/defaults/config.yaml
@@ -3,14 +3,18 @@ sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '10244'
 # Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+ncbi_field_map: 'defaults/ncbi-dataset-field-map.tsv'
 
-# Params for the transform rule
-transform:
+# Params for the curate rule
+curate:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map:
+    collected: date
+    submitted: date_submitted
+    genbank_accession: accession
+    submitting_organization: institution
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
@@ -43,9 +47,9 @@ transform:
   geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
   # Local geolocation rules that are only applicable to mpox data
   # Local rules can overwrite the general geolocation rules provided above
-  local_geolocation_rules: 'source-data/geolocation-rules.tsv'
+  local_geolocation_rules: 'defaults/geolocation-rules.tsv'
   # User annotations file
-  annotations: 'source-data/annotations.tsv'
+  annotations: 'defaults/annotations.tsv'
   # ID field used to merge annotations
   annotations_id: 'accession'
   # Field to use as the sequence ID in the FASTA file
@@ -76,4 +80,4 @@ nextclade:
   # Field to use as the sequence ID in the Nextclade file
   id_field: 'seqName'
   # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file
-  field_map: 'source-data/nextclade-field-map.tsv'
+  field_map: 'defaults/nextclade-field-map.tsv'
File renamed without changes.
File renamed without changes.
File renamed without changes.
ingest/workflow/snakemake_rules/transform.smk → ingest/rules/curate.smk
@@ -1,5 +1,5 @@
 """
-This part of the workflow handles transforming the data into standardized
+This part of the workflow handles curating the data into standardized
 formats and expects input file
     sequences_ndjson = "data/sequences.ndjson"
@@ -9,15 +9,15 @@ This will produce output files as
     metadata = "data/metadata_raw.tsv"
     sequences = "results/sequences.fasta"

-Parameters are expected to be defined in `config.transform`.
+Parameters are expected to be defined in `config.curate`.
 """


 rule fetch_general_geolocation_rules:
     output:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
     params:
-        geolocation_rules_url=config["transform"]["geolocation_rules_url"],
+        geolocation_rules_url=config["curate"]["geolocation_rules_url"],
     shell:
         """
         curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
@@ -27,7 +27,7 @@ rule fetch_general_geolocation_rules:
 rule concat_geolocation_rules:
     input:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
-        local_geolocation_rules=config["transform"]["local_geolocation_rules"],
+        local_geolocation_rules=config["curate"]["local_geolocation_rules"],
     output:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
     shell:
@@ -36,32 +36,39 @@ rule concat_geolocation_rules:
         """


-rule transform:
+def format_field_map(field_map: dict[str, str]) -> str:
+    """
+    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.
+    """
+    return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])
+
+
+rule curate:
     input:
         sequences_ndjson="data/sequences.ndjson",
         all_geolocation_rules="data/all-geolocation-rules.tsv",
-        annotations=config["transform"]["annotations"],
+        annotations=config["curate"]["annotations"],
     output:
         metadata="data/metadata_raw.tsv",
         sequences="results/sequences.fasta",
     log:
-        "logs/transform.txt",
+        "logs/curate.txt",
     params:
-        field_map=config["transform"]["field_map"],
-        strain_regex=config["transform"]["strain_regex"],
-        strain_backup_fields=config["transform"]["strain_backup_fields"],
-        date_fields=config["transform"]["date_fields"],
-        expected_date_formats=config["transform"]["expected_date_formats"],
-        articles=config["transform"]["titlecase"]["articles"],
-        abbreviations=config["transform"]["titlecase"]["abbreviations"],
-        titlecase_fields=config["transform"]["titlecase"]["fields"],
-        authors_field=config["transform"]["authors_field"],
-        authors_default_value=config["transform"]["authors_default_value"],
-        abbr_authors_field=config["transform"]["abbr_authors_field"],
-        annotations_id=config["transform"]["annotations_id"],
-        metadata_columns=config["transform"]["metadata_columns"],
-        id_field=config["transform"]["id_field"],
-        sequence_field=config["transform"]["sequence_field"],
+        field_map=format_field_map(config["curate"]["field_map"]),
+        strain_regex=config["curate"]["strain_regex"],
+        strain_backup_fields=config["curate"]["strain_backup_fields"],
+        date_fields=config["curate"]["date_fields"],
+        expected_date_formats=config["curate"]["expected_date_formats"],
+        articles=config["curate"]["titlecase"]["articles"],
+        abbreviations=config["curate"]["titlecase"]["abbreviations"],
+        titlecase_fields=config["curate"]["titlecase"]["fields"],
+        authors_field=config["curate"]["authors_field"],
+        authors_default_value=config["curate"]["authors_default_value"],
+        abbr_authors_field=config["curate"]["abbr_authors_field"],
+        annotations_id=config["curate"]["annotations_id"],
+        metadata_columns=config["curate"]["metadata_columns"],
+        id_field=config["curate"]["id_field"],
+        sequence_field=config["curate"]["sequence_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
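The new `format_field_map` helper exists because `field_map` changed from a list of `old=new` strings to a YAML mapping (see the `defaults/config.yaml` diff above), while the shell command still interpolates it as space-separated `"key"="value"` pairs, per the helper's docstring. A quick check of what it produces for this PR's field map:

```python
def format_field_map(field_map: dict[str, str]) -> str:
    """Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands."""
    return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])


# The field_map from defaults/config.yaml in this PR:
field_map = {
    "collected": "date",
    "submitted": "date_submitted",
    "genbank_accession": "accession",
    "submitting_organization": "institution",
}

print(format_field_map(field_map))
# "collected"="date" "submitted"="date_submitted" "genbank_accession"="accession" "submitting_organization"="institution"
```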
File renamed without changes.
ingest/workflow/snakemake_rules/nextclade.smk → ingest/rules/nextclade.smk
@@ -60,7 +60,7 @@ rule join_metadata_clades:
     output:
         metadata="results/metadata.tsv",
     params:
-        id_field=config["transform"]["id_field"],
+        id_field=config["curate"]["id_field"],
         nextclade_id_field=config["nextclade"]["id_field"],
     shell:
         """
2 changes: 1 addition & 1 deletion phylogenetic/config/description.md
@@ -18,7 +18,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpo
 
 #### Underlying data
 We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/),
-using an NCBI Taxonomy ID defined in [ingest/config/config.yaml](https://github.com/nextstrain/mpox/blob/master/ingest/config/config.yaml), as starting point for these analyses.
+using an NCBI Taxonomy ID defined in [ingest/defaults/config.yaml](https://github.com/nextstrain/mpox/blob/master/ingest/defaults/config.yaml), as starting point for these analyses.
 Curated sequences and metadata are available as flat files at:
 - [data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz)
 - [data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz)
