Skip to content

Commit

Permalink
Merge pull request #8 from nextstrain/add-host-categories
Browse files Browse the repository at this point in the history
Add host taxonomic categories
  • Loading branch information
kimandrews authored Aug 20, 2024
2 parents 363b120 + 6e2efb9 commit 67b7941
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# CHANGELOG
* 20 August 2024: Assign host taxa to taxonomic groupings that are relevant to rabies, for coloring in Auspice [PR#8](https://github.com/nextstrain/rabies/pull/8)
* 12 August 2024: Create a full genome phylogeny for rabies [PR#3](https://github.com/nextstrain/rabies/pull/3)
* 25 July 2024: Add CI GH Action workflow to test the ingest workflow [PR#6](https://github.com/nextstrain/rabies/pull/6)
* 15 July 2024: Make rabies-specific modifications to the ingest directory (which originated from the pathogen-repo-guide) [PR#2](https://github.com/nextstrain/rabies/pull/2)
14 changes: 13 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ ncbi_datasets_fields:
- update-date
- length
- host-name
- host-tax-id
- isolate-lineage-source
- biosample-acc
- submitter-names
Expand Down Expand Up @@ -53,12 +54,18 @@ curate:
release-date: date_released
update-date: date_updated
length: length
host-name: host
host-name: host_latin_name
host-tax-id: host_tax_id
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
submitter-names: authors
submitter-affiliation: institution
submitter-country: submitter_country
Group name: host_group
Curator common name: host_common_name
Family name: host_family
Genus name: host_genus

# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
Expand Down Expand Up @@ -109,6 +116,11 @@ curate:
"location",
"length",
"host",
"host_latin_name",
"host_family",
"host_genus",
"host_group",
"host_common_name",
"date_released",
"date_updated",
"sra_accessions",
Expand Down
3 changes: 3 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| augur curate apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| scripts/add-host-categories.py \
--latin-field host_latin_name --family-field host_family \
--genus-field host_genus --group-field host_group \
| augur curate apply-record-annotations \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
63 changes: 62 additions & 1 deletion ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,67 @@ rule format_ncbi_dataset_report:
> {output.ncbi_dataset_tsv}
"""

# Write the unique, numeric values of the `host-tax-id` column of the NCBI
# dataset report to their own file.
rule extract_ncbi_dataset_hosttaxid:
    input:
        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
    output:
        ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv",
    log:
        "logs/extract_ncbi_dataset_hosttaxid.txt",
    benchmark:
        "benchmarks/extract_ncbi_dataset_hosttaxid.txt"
    # Raw string: `\-` is not a valid Python escape sequence, so a plain string
    # literal triggers a SyntaxWarning on Python >= 3.12. The backslashes must
    # reach tsv-select verbatim, where they escape the hyphens in the field name.
    shell:
        r"""
        tsv-select {input.ncbi_dataset_tsv} -H -f 'host\-tax\-id' | \
            tsv-filter --is-numeric 1 | \
            tsv-uniq \
            2> {log} > {output.ncbi_dataset_hosttaxid}
        """

# Download NCBI taxonomy data for the host taxon IDs listed in the input file.
rule get_ncbi_hosttax_info:
    input:
        ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv",
    output:
        ncbi_hosttax_info="data/hosttax_info.zip",
    log:
        "logs/get_ncbi_hosttax_info.txt",
    benchmark:
        "benchmarks/get_ncbi_hosttax_info.txt"
    # The download depends on the network, so allow the job to be retried
    # when it fails.
    retries: 5
    shell:
        """
        datasets download taxonomy taxon \
            --inputfile {input.ncbi_dataset_hosttaxid} \
            --filename {output.ncbi_hosttax_info} \
            2>&1 | tee {log}
        """

# Append the selected host taxonomy columns from the downloaded taxonomy
# summary to the NCBI dataset report, matching the summary's `Query` column
# against the report's `host-tax-id` column.
rule join_metadata_and_hostinfo:
    input:
        ncbi_hosttax_info="data/hosttax_info.zip",
        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
    output:
        metadata="data/metadata_with_taxinfo.tsv",
    log:
        "logs/join_metadata_and_hostinfo.txt",
    benchmark:
        "benchmarks/join_metadata_and_hostinfo.txt"
    params:
        # Raw string: `\ ` is not a valid Python escape sequence. The
        # backslashes must reach tsv-select verbatim, where they escape the
        # spaces inside the field names.
        ncbi_hosttax_columns=r"Query,'Group\ name','Curator\ common\ name','Family\ name','Genus\ name'",
    shell:
        # Raw string for the same reason (`\-` in the --data-fields value).
        # The --write-all fill value is quoted: a bare `?` is a shell glob and
        # would be replaced by any single-character filename present in the
        # working directory.
        r"""
        unzip -p {input.ncbi_hosttax_info} ncbi_dataset/data/taxonomy_summary.tsv \
            | tsv-select -H -f {params.ncbi_hosttax_columns} \
            | tsv-join -H \
                --filter-file - \
                --key-fields Query \
                --data-fields 'host\-tax\-id' \
                --append-fields '*' \
                --write-all '?' \
                {input.ncbi_dataset_tsv} \
            2> {log} > {output.metadata}
        """

# Technically you can bypass this step and directly provide FASTA and TSV files
# as input files for the curate pipeline.
Expand All @@ -103,7 +164,7 @@ rule format_ncbi_dataset_report:
rule format_ncbi_datasets_ndjson:
input:
ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
ncbi_dataset_tsv="data/metadata_with_taxinfo.tsv",
output:
ndjson="data/ncbi.ndjson",
log:
Expand Down
70 changes: 70 additions & 0 deletions ingest/scripts/add-host-categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#! /usr/bin/env python3
"""
Reads NDJSON records from stdin, derives a host category for each record from
its NCBI taxonomy fields, and stores it in the record's 'host' field.
Writes the modified records to stdout.
"""

import argparse
import json
from sys import stdin, stdout

def parse_args(argv=None):
    """Parse the options naming the record fields that hold host taxonomy.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so calling
            ``parse_args()`` behaves exactly as a plain command-line parse.

    Returns:
        An ``argparse.Namespace`` with the attributes ``latin_field``,
        ``family_field``, ``genus_field`` and ``group_field``.
    """
    parser = argparse.ArgumentParser(
        description="Generate host names and output to 'host'.")
    parser.add_argument("--latin-field", default='host_latin_name',
        help="Field from the records to use as the host latin name.")
    parser.add_argument("--family-field", default='host_family',
        help="Field from the records to use as the host Family name.")
    parser.add_argument("--genus-field", default='host_genus',
        help="Field from the records to use as the host genus name.")
    parser.add_argument("--group-field", default='host_group',
        help="Field from the records to use as the host group.")
    return parser.parse_args(argv)

# Lookup tables are module-level constants so they are built once rather than
# once per record.

# Host latin names that map directly to a category.
_LATIN_REPLACEMENTS = {
    "Canis lupus familiaris": "Domestic Dog",
    "Homo sapiens": "Human",
    "Bos taurus": "Cattle",
    "Didelphis albiventris": "Other Mammal",
    "Elephas maximus": "Other Mammal",
    "Dasypus novemcinctus": "Other Mammal",
}

# Host families that map directly to a category.
_FAMILY_REPLACEMENTS = {"Mephitidae": "Skunk"}

# Fallback mapping from the host group to a category.
_GROUP_REPLACEMENTS = {
    "odd-toed ungulates": "Other Ungulate",
    "even-toed ungulates & whales": "Other Ungulate",
    "carnivores": "Other Carnivore",
    "bats": "Bat",
    "birds": "Bird",
    "primates": "Other Mammal",
    "rodents": "Other Mammal",
    "mammals": "Other Mammal",
}


def _set_host_name_transformed(record, args):
    """Return the host category for a single record.

    Rules are tried from most to least specific; the first match wins:

    1. family ``Canidae`` with genus ``Vulpes`` -> ``"Fox (Vulpes sp.)"``
    2. family ``Procyonidae`` with genus ``Procyon`` -> ``"Raccoon"``
    3. latin name listed in ``_LATIN_REPLACEMENTS``
    4. family listed in ``_FAMILY_REPLACEMENTS``
    5. group listed in ``_GROUP_REPLACEMENTS``
    6. otherwise the group value itself, unchanged

    Args:
        record: Mapping holding the host taxonomy values of one record.
        args: Object whose ``latin_field``, ``family_field``, ``genus_field``
            and ``group_field`` attributes name the keys to read from
            ``record``.

    Returns:
        The category string, or the raw group value when no rule matches.

    Raises:
        KeyError: If the latin, family or group key is absent from
            ``record``, or if the genus key is absent while the family is
            ``Canidae`` or ``Procyonidae``.
    """
    latin_name = record[args.latin_field]
    family = record[args.family_field]
    group = record[args.group_field]

    # The genus is only looked up when the family makes it relevant, so
    # records of other families do not need to carry the genus key.
    if family == "Canidae" and record[args.genus_field] == "Vulpes":
        return "Fox (Vulpes sp.)"
    if family == "Procyonidae" and record[args.genus_field] == "Procyon":
        return "Raccoon"
    if latin_name in _LATIN_REPLACEMENTS:
        return _LATIN_REPLACEMENTS[latin_name]
    if family in _FAMILY_REPLACEMENTS:
        return _FAMILY_REPLACEMENTS[family]
    # Groups without a mapping are passed through as-is.
    return _GROUP_REPLACEMENTS.get(group, group)

def main():
    """Add a 'host' value to every NDJSON record read from stdin.

    Each non-blank input line is parsed as one JSON record, its 'host' key is
    set to the value returned by ``_set_host_name_transformed``, and the
    record is written to stdout as a single JSON line.
    """
    args = parse_args()

    for line in stdin:
        # A blank line is not a record; json.loads would raise on it.
        if not line.strip():
            continue
        record = json.loads(line)
        record['host'] = _set_host_name_transformed(record, args)
        stdout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    main()
12 changes: 11 additions & 1 deletion phylogenetic/defaults/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@
"key": "host",
"title": "Host",
"type": "categorical"
},
{
"key": "host_latin_name",
"title": "Host latin name",
"type": "categorical"
},
{
"key": "host_common_name",
"title": "Host common name",
"type": "categorical"
}
],
"geo_resolutions": [
Expand All @@ -39,7 +49,7 @@
],
"display_defaults": {
"map_triplicate": true,
"color_by": "region"
"color_by": "host"
},
"filters": [
"region",
Expand Down
13 changes: 13 additions & 0 deletions phylogenetic/defaults/colors.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,16 @@ region Africa #8ABB6A
region Europe #BEBB48
region South America #E29E39
region North America #E2562B
#
# Host taxa
host Bat #3F47C9
host Domestic Dog #4274CE
host Fox (Vulpes sp.) #4F97BB
host Raccoon #64AC99
host Skunk #7EB976
host Other Carnivore #9EBE5A
host Cattle #BEBB48
host Other Ungulate #D9AE3E
host Human #E69036
host Other Mammal #E35F2D
host Bird #DB2823

0 comments on commit 67b7941

Please sign in to comment.