diff --git a/bin/fetch-accession-links b/bin/fetch-accession-links new file mode 100755 index 00000000..42aa6e6b --- /dev/null +++ b/bin/fetch-accession-links @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +curl "https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest" \ + --fail --silent --show-error \ + --header 'User-Agent: https://github.com/nextstrain/ncov-ingest (hello@nextstrain.org)' \ + | csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date \ + | csvtk -t cut --fields genbank_accession,gisaid_epi_isl diff --git a/bin/transform-genbank b/bin/transform-genbank index 95c00f2d..5af323e4 100755 --- a/bin/transform-genbank +++ b/bin/transform-genbank @@ -207,8 +207,8 @@ if __name__ == '__main__': | ParseGeographicColumnsGenbank( base / 'source-data/us-state-codes.tsv' ) | AbbreviateAuthors() | ApplyUserGeoLocationSubstitutionRules(geoRules) - | MergeUserAnnotatedMetadata(annotations, idKey = 'genbank_accession' ) | MergeUserAnnotatedMetadata(accessions, idKey = 'genbank_accession_rev' ) + | MergeUserAnnotatedMetadata(annotations, idKey = 'genbank_accession' ) | FillDefaultLocationData() | patchUKData(args.cog_uk_accessions, args.cog_uk_metadata) | GenbankProblematicFilter( args.problem_data, @@ -301,4 +301,3 @@ if __name__ == '__main__': strain_name = updated_strain_names_by_line_no[entry[LINE_NUMBER_KEY]] print( '>' , strain_name , sep='' , file= fasta_OUT) print( entry['sequence'] , file= fasta_OUT) - diff --git a/bin/transform-gisaid b/bin/transform-gisaid index eb1bea0d..e8f95a9e 100755 --- a/bin/transform-gisaid +++ b/bin/transform-gisaid @@ -182,8 +182,8 @@ if __name__ == '__main__': pipeline = (pipeline | ApplyUserGeoLocationSubstitutionRules(geoRules) - | MergeUserAnnotatedMetadata(annotations) | MergeUserAnnotatedMetadata(accessions) + | MergeUserAnnotatedMetadata(annotations) | FillDefaultLocationData() ) diff --git a/workflow/snakemake_rules/curate.smk b/workflow/snakemake_rules/curate.smk index 75b65f48..f8c00765 100644 --- a/workflow/snakemake_rules/curate.smk +++ b/workflow/snakemake_rules/curate.smk @@ -23,6 +23,34 @@ Produces different output files for GISAID vs GenBank: """ +rule fetch_accession_links: + """ + Fetch the accession links between GISAID and GenBank + """ + output: + accessions=temp("data/accessions.tsv"), + retries: 5 + shell: + """ + ./bin/fetch-accession-links > {output.accessions:q} + """ + + +rule concat_accession_links: + input: + source_data="source-data/accessions.tsv.gz", + accessions="data/accessions.tsv", + output: + all_accessions="data/all_accessions.tsv.gz" + shell: + r""" + gunzip -kcfq {input.source_data:q} \ + | csvtk concat -t - {input.accessions:q} \ + | csvtk uniq -t -f genbank_accession,gisaid_epi_isl \ + | gzip -c > {output.all_accessions:q} + """ + + rule transform_rki_data: input: ndjson="data/rki.ndjson", @@ -60,7 +88,8 @@ rule transform_genbank_data: biosample = "data/genbank/biosample.tsv", ndjson = "data/genbank.ndjson", cog_uk_accessions = "data/cog_uk_accessions.tsv", - cog_uk_metadata = "data/cog_uk_metadata.csv.gz" + cog_uk_metadata = "data/cog_uk_metadata.csv.gz", + accessions = "data/all_accessions.tsv.gz", output: fasta = "data/genbank_sequences.fasta", metadata = "data/genbank_metadata_transformed.tsv", @@ -75,6 +104,7 @@ rule transform_genbank_data: --duplicate-biosample {output.duplicate_biosample} \ --cog-uk-accessions {input.cog_uk_accessions} \ --cog-uk-metadata {input.cog_uk_metadata} \ + --accessions {input.accessions} \ --output-metadata {output.metadata} \ --output-fasta {output.fasta} > {output.flagged_annotations} """ @@ -105,7 +135,8 @@ rule merge_open_data: rule transform_gisaid_data: input: - ndjson = "data/gisaid.ndjson" + ndjson = "data/gisaid.ndjson", + accessions = "data/all_accessions.tsv.gz", output: fasta = "data/gisaid/sequences.fasta", metadata = "data/gisaid/metadata_transformed.tsv", @@ -116,6 +147,7 @@ rule transform_gisaid_data: shell: """ ./bin/transform-gisaid {input.ndjson} \ + --accessions {input.accessions} \ --output-metadata {output.metadata} \ --output-fasta {output.fasta} \ --output-additional-info {output.additional_info} \