#!/usr/bin/env python3
"""
Convert CSV on stdin to NDJSON on stdout.

usage: `cat dummy.csv | ./csv-to-ndjson > dummy.ndjson`
"""
import csv
import json
from sys import stdin, stdout

# 200 MiB; the csv module's default is 128 KiB, which is too small for
# records that carry an entire sequence in a single field.
FIELD_SIZE_LIMIT = 200 * 1024 * 1024


def csv_to_ndjson(source, sink):
    """
    Copy every CSV record from *source* to *sink* as one JSON object per line.

    The first CSV row supplies the object keys. Each object is serialized
    compactly (no spaces after ``,`` or ``:``) and terminated by a newline.
    An input with no data rows produces no output.
    """
    for row in csv.DictReader(source):
        json.dump(row, sink, allow_nan=False, indent=None, separators=(',', ':'))
        sink.write('\n')


def main():
    """Stream CSV from stdin to NDJSON on stdout."""
    csv.field_size_limit(FIELD_SIZE_LIMIT)

    # The csv module expects its input to be opened with newline='' so that
    # line endings embedded in quoted fields reach the parser untouched.
    # Guarded because stdin may have been replaced by an object without
    # reconfigure().
    reconfigure = getattr(stdin, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(newline='')

    csv_to_ndjson(stdin, stdout)


if __name__ == '__main__':
    main()
(great apes), taxid:9604| Pongidae", + "Homininae, taxid:207598| Homo/Pan/Gorilla group", + "Homo (humans), taxid:9605|", + "Homo sapiens (human), taxid:9606|" + ], + "HostLineageId_ss": [ + "131567", + "2759", + "33154", + "33208", + "6072", + "33213", + "33511", + "7711", + "89593", + "7742", + "7776", + "117570", + "117571", + "8287", + "1338369", + "32523", + "32524", + "40674", + "32525", + "9347", + "1437010", + "314146", + "9443", + "376913", + "314293", + "9526", + "314295", + "9604", + "207598", + "9605", + "9606" + ], + "Locus_s": "NC_045512", + "OrgId_i": 2697049, + "VirusFamily_s": "Coronaviridae", + "VirusGenus_s": "Betacoronavirus", + "VirusSpecies_s": "Severe acute respiratory syndrome-related coronavirus", + "VirusSpeciesId_i": 694009, + "VirusLineage_ss": [ + "Viruses, taxid:10239| Vira Viridae viruses", + "Riboviria (RNA viruses), taxid:2559587| RNA viruses and viroids", + "Orthornavirae, taxid:2732396|", + "Pisuviricota, taxid:2732408|", + "Pisoniviricetes, taxid:2732506|", + "Nidovirales, taxid:76804|", + "Cornidovirineae, taxid:2499399|", + "Coronaviridae, taxid:11118|", + "Orthocoronavirinae, taxid:2501931|", + "Betacoronavirus, taxid:694002| Coronavirus", + "Sarbecovirus, taxid:2509511|", + "Severe acute respiratory syndrome-related coronavirus, taxid:694009| HCoV-SARS SARS SARSr-CoV SARSrCoV", + "Severe acute respiratory syndrome coronavirus 2, taxid:2697049| SARS-CoV-2", + "RNA viruses" + ], + "VirusLineageId_ss": [ + "10239", + "2559587", + "2732396", + "2732408", + "2732506", + "76804", + "2499399", + "11118", + "2501931", + "694002", + "2509511", + "694009", + "2697049" + ], + "VirusL0_s": "RNA viruses", + "VirusL1_s": "Orthornavirae, taxid:2732396", + "VirusL2_s": "Pisuviricota, taxid:2732408", + "VirusL3_s": "Pisoniviricetes, taxid:2732506", + "VirusL4_s": "Nidovirales, taxid:76804", + "VirusL5_s": "Cornidovirineae, taxid:2499399", + "VirusL6_s": "Coronaviridae, taxid:11118", + "VirusL7_s": "Orthocoronavirinae, taxid:2501931", + 
"VirusL8_s": "Betacoronavirus, taxid:694002", + "VirusL9_s": "Sarbecovirus, taxid:2509511", + "VirusL10_s": "Severe acute respiratory syndrome-related coronavirus, taxid:694009", + "ViralHost_ss": [ + "human", + "vertebrates" + ], + "GenomicMoltype_s": "ssRNA(+)", + "SLen_i": 29903, + "Flags_ss": [ + "refseq", + "complete" + ], + "Flags_csv": "refseq, complete", + "FlagsCount_i": 2, + "SetAcc_s": "GCF_009858895.2", + "Authors_ss": [ + "Wu,F.", + "Zhao,S.", + "Yu,B.", + "Chen,Y.M.", + "Wang,W.", + "Song,Z.G.", + "Hu,Y.", + "Tao,Z.W.", + "Tian,J.H.", + "Pei,Y.Y.", + "Yuan,M.L.", + "Zhang,Y.L.", + "Dai,F.H.", + "Liu,Y.", + "Wang,Q.M.", + "Zheng,J.J.", + "Xu,L.", + "Holmes,E.C.", + "Zhang,Y.Z.", + "Baranov,P.V.", + "Henderson,C.M.", + "Anderson,C.B.", + "Gesteland,R.F.", + "Atkins,J.F.", + "Howard,M.T.", + "Robertson,M.P.", + "Igel,H.", + "Baertsch,R.", + "Haussler,D.", + "Ares,M. Jr.", + "Scott,W.G.", + "Williams,G.D.", + "Chang,R.Y.", + "Brian,D.A.", + "Chen,Y.-M.", + "Song,Z.-G.", + "Tao,Z.-W.", + "Tian,J.-H.", + "Pei,Y.-Y.", + "Zhang,Y.-L.", + "Dai,F.-H.", + "Wang,Q.-M.", + "Zheng,J.-J.", + "Zhang,Y.-Z." + ], + "Authors_csv": "Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C., Zhang,Y.Z., Baranov,P.V., Henderson,C.M., Anderson,C.B., Gesteland,R.F., Atkins,J.F., Howard,M.T., Robertson,M.P., Igel,H., Baertsch,R., Haussler,D., Ares,M. 
Jr., Scott,W.G., Williams,G.D., Chang,R.Y., Brian,D.A., Chen,Y.-M., Song,Z.-G., Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Zhang,Y.-L., Dai,F.-H., Wang,Q.-M., Zheng,J.-J., Zhang,Y.-Z.", + "AuthorsCount_i": 44, + "Country_s": "China", + "Isolate_s": "Wuhan-Hu-1", + "Lineage_s": "B", + "Division_s": "VRL", + "Keywords_ss": [ + "RefSeq" + ], + "KeywordsCount_i": 1, + "TaxName_s": "Severe acute respiratory syndrome coronavirus 2", + "Region_s": "Asia", + "ParentAcc_s": "set:NC_045512", + "SetPosition_i": 0, + "SourceDB_s": "RefSeq", + "Definition_s": "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome", + "HostId_i": 9606, + "CreateDate_dt": "2020-01-13T00:00:00Z", + "CreateYear_i": 2020, + "Genome_js": "[{\"id\": \"NC_045512.2\", \"segment\": null, \"proteins\": [{\"id\": \"YP_009724389.1\", \"name\": \"ORF1ab polyprotein\", \"location\": \"join(266..13468,13468..21555)\"}, {\"id\": \"YP_009725295.1\", \"name\": \"ORF1a polyprotein\", \"location\": \"266..13483\"}, {\"id\": \"YP_009724390.1\", \"name\": \"surface glycoprotein\", \"location\": \"21563..25384\"}, {\"id\": \"YP_009724391.1\", \"name\": \"ORF3a protein\", \"location\": \"25393..26220\"}, {\"id\": \"YP_009724392.1\", \"name\": \"envelope protein\", \"location\": \"26245..26472\"}, {\"id\": \"YP_009724393.1\", \"name\": \"membrane glycoprotein\", \"location\": \"26523..27191\"}, {\"id\": \"YP_009724394.1\", \"name\": \"ORF6 protein\", \"location\": \"27202..27387\"}, {\"id\": \"YP_009724395.1\", \"name\": \"ORF7a protein\", \"location\": \"27394..27759\"}, {\"id\": \"YP_009725318.1\", \"name\": \"ORF7b\", \"location\": \"27756..27887\"}, {\"id\": \"YP_009724396.1\", \"name\": \"ORF8 protein\", \"location\": \"27894..28259\"}, {\"id\": \"YP_009724397.2\", \"name\": \"nucleocapsid phosphoprotein\", \"location\": \"28274..29533\"}, {\"id\": \"YP_009725255.1\", \"name\": \"ORF10 protein\", \"location\": \"29558..29674\"}]}]", + "MolType_s": "RNA", + "ProtAcc_ss": [ + "YP_009724389", + 
"YP_009725295", + "YP_009724390", + "YP_009724391", + "YP_009724392", + "YP_009724393", + "YP_009724394", + "YP_009724395", + "YP_009725318", + "YP_009724396", + "YP_009724397", + "YP_009725255" + ], + "ProtAccCount_i": 12, + "UpdateDate_dt": "2020-07-18T00:00:00Z", + "UpdateYear_i": 2020, + "PubMed_ss": [ + "32015508", + "15680415", + "15630477", + "10482585" + ], + "PubMed_csv": "32015508, 15680415, 15630477, 10482585", + "PubMedCount_i": 4, + "Completeness_s": "complete", + "CountryFull_s": "China", + "ProtNames_ss": [ + "ORF1ab polyprotein", + "ORF1a polyprotein", + "surface glycoprotein", + "ORF3a protein", + "envelope protein", + "membrane glycoprotein", + "ORF6 protein", + "ORF7a protein", + "ORF7b protein", + "ORF8 protein", + "nucleocapsid phosphoprotein", + "ORF10 protein" + ], + "ProtNamesCount_i": 12, + "IsolateParsed_s": "Wuhan-Hu-1", + "NuclAcc_ss": [ + "NC_045512" + ], + "NuclAccCount_i": 1, + "CollectionDate_dr": "2019-12", + "CollectionYear_i": 2019, + "SubmitterAffil_s": "National Center for Biotechnology Information, NIH", + "BioProject_ss": [ + "PRJNA485481" + ], + "BioProject_csv": "PRJNA485481", + "BioProjectCount_i": 1, + "AccVer_s": "NC_045512.2", + "CollectionDate_s": "2019-12", + "SubmitterCountry_s": "USA", + "CollectionDate_dt": "2019-12-01T00:00:00Z", + "GenomeCompleteness_s": "complete", + "SubmitterAffilFull_s": "National Center for Biotechnology Information, NIH", + "BioProject_s": "PRJNA485481", + "AccNV_s": "NC_045512", + "id": "NC_045512", + "SeqType_s": "Nucleotide", + "FastaMD5_s": "4928f859a1822d291e0225206a0068c8", + "live_i": 1, + "ids_ss": [ + "GCF_009858895", + "GCF_009858895.2", + "NC_045512", + "NC_045512.2", + "PRJNA485481", + "YP_009724389", + "YP_009724390", + "YP_009724391", + "YP_009724392", + "YP_009724393", + "YP_009724394", + "YP_009724395", + "YP_009724396", + "YP_009724397", + "YP_009725255", + "YP_009725295", + "YP_009725318", + "set:NC_045512" + ], + "gi_i": 1798174254, + "_version_": 1773711315042304000 +} 
#!/bin/bash
# usage: fetch-from-ncbi-virus [options] NCBI_TAXON_ID GITHUB_REPO
#
# Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/)
# and output NDJSON records to stdout.
#
# arguments:
#
#   NCBI_TAXON_ID   NCBI taxon id, passed to ncbi-virus-url as --ncbi-taxon-id.
#                   Required.
#
#   GITHUB_REPO     GitHub repository with owner and repository name; used to
#                   build the User-Agent header of the download request.
#                   Required.
#
# options (must precede the positional arguments; parsing stops at the first
# argument that is not one of these):
#
#   --filter=FILTER   Filter criteria to add as `fq` param values for the NCBI Virus URL
#                     May be specified multiple times.
#
#   --field=OUTPUT_NAME:SOURCE_FIELD
#                     Metadata fields to add as `fl` param values for the NCBI Virus URL
#                     May be specified multiple times.
#
# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest:
#   https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank
#
set -euo pipefail

bin="$(dirname "$0")"


# Parse the leading --filter/--field options, build the NCBI Virus download
# URL with ncbi-virus-url, download it and convert the CSV to NDJSON.
main() {
    declare -a filters=()
    declare -a fields=()

    # Consume leading options; each `shift` drops the option just handled so
    # that $1/$2 are the positional arguments once the loop ends.
    for arg; do
        case "$arg" in
            --filter=*)
                filters+=("${arg#*=}")
                shift;;
            --field=*)
                fields+=("${arg#*=}")
                shift;;
            *)
                break;;
        esac
    done

    local ncbi_taxon_id="${1:?NCBI taxon id is required.}"
    local github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}"

    # ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty.
    # A plain "${arr[@]}" aborts with "unbound variable" under `set -u` on
    # Bash releases older than 4.4 when no --filter/--field was given.
    local ncbi_virus_url
    ncbi_virus_url="$(
        "$bin"/ncbi-virus-url \
            --ncbi-taxon-id "$ncbi_taxon_id" \
            --filters ${filters[@]+"${filters[@]}"} \
            --fields ${fields[@]+"${fields[@]}"}
    )"

    fetch "$ncbi_virus_url" "$github_repo" | "$bin"/csv-to-ndjson
}

# Download URL $1 to stdout, identifying the client as GitHub repository $2.
# --fail makes curl exit non-zero on HTTP errors, which `pipefail` propagates.
fetch() {
    curl "$1" \
        --fail --silent --show-error --http1.1 \
        --header "User-Agent: https://github.com/$2 (hello@nextstrain.org)"
}

main "$@"
"""
Generate URL to download all virus sequences and their curated metadata for a
specified NCBI Taxon ID from GenBank via NCBI Virus.

The URL this program builds is based on the URL for SARS-CoV-2 constructed with

    https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url

and observing the network activity at

    https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide
"""
from urllib.parse import urlencode
from typing import List, Optional
import argparse


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse the command line options.

    *argv* is the list of arguments to parse; when omitted, argparse reads
    them from ``sys.argv``. ``--filters`` and ``--fields`` are ``None`` when
    the option is absent and an empty list when given without values.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ncbi-taxon-id", required=True,
        help="NCBI Taxon ID. Visit NCBI virus at "
             "https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/find-data/virus "
             "to search for supported taxon IDs.")
    parser.add_argument("--filters", required=False, nargs="*",
        help="Filter criteria to add as `fq` param values. "
             "Apply filters via the NCBI Virus UI and observe the network "
             "activity to find the desired filter string.")
    parser.add_argument("--fields", required=False, nargs="*",
        help="Metadata fields to add as `fl` param values. "
             "Expected to be formatted as OUTPUT_COLUMN:SOURCE_FIELD. "
             "See docs/ncbi-virus-all-fields-example.json for the available NCBI Virus fields.")
    return parser.parse_args(argv)


def build_query_url(ncbi_taxon_id: str,
                    filters: Optional[List[str]]=None,
                    fields: Optional[List[str]]=None) -> str:
    """
    Generate URL to download all viral sequences and their curated metadata
    from GenBank via NCBI Virus.

    *ncbi_taxon_id* is inserted into the ``VirusLineageId_ss`` search
    criterion. *filters* are appended as additional ``fq`` values and
    *fields* are appended to the default ``fl`` column list; either may be
    ``None`` or empty.

    The URL is printed to stdout (this is the script's output) and also
    returned so that callers can use it directly.
    """
    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"

    # Pairs of (output column name, source data field).
    default_fields = [
        ('genbank_accession',       'id'),
        ('genbank_accession_rev',   'AccVer_s'),
        ('database',                'SourceDB_s'),
        ('strain',                  'Isolate_s'),
        ('region',                  'Region_s'),
        ('location',                'CountryFull_s'),
        ('collected',               'CollectionDate_s'),
        ('submitted',               'CreateDate_dt'),
        ('updated',                 'UpdateDate_dt'),
        ('length',                  'SLen_i'),
        ('host',                    'Host_s'),
        ('isolation_source',        'Isolation_csv'),
        ('bioproject_accession',    'BioProject_s'),
        ('biosample_accession',     'BioSample_s'),
        ('sra_accession',           'SRALink_csv'),
        ('title',                   'Definition_s'),
        ('authors',                 'Authors_csv'),
        ('submitting_organization', 'SubmitterAffilFull_s'),
        ('publications',            'PubMed_csv'),
        ('sequence',                'Nucleotide_seq'),
    ]

    params = {
        # Search criteria
        'fq': [
            '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
            f'VirusLineageId_ss:({ncbi_taxon_id})',
            *(filters or []),
        ],

        # Unclear, but seems necessary.
        'q': '*:*',

        # Response format
        'cmd': 'download',
        'dlfmt': 'csv',
        'fl': ','.join(
            [':'.join(names) for names in default_fields] + (fields or [])
        ),

        # Stable sort with GenBank accessions.
        # Columns are source data fields, not our output columns.
        'sort': 'id asc',

        # This isn't Entrez, but include the same email parameter it requires just
        # to be nice.
        'email': 'hello@nextstrain.org',
    }

    # doseq=True emits one `fq=` pair per list element instead of the
    # stringified list.
    query = urlencode(params, doseq = True, encoding = "utf-8")
    url = f"{endpoint}?{query}"

    print(url)
    return url


def main():
    """Parse the command line and print the NCBI Virus download URL."""
    args = parse_args()
    build_query_url(
        ncbi_taxon_id=args.ncbi_taxon_id,
        filters=args.filters,
        fields=args.fields
    )


if __name__ == '__main__':
    main()