From a4089e71ea0e704c1c20700e20132b3592817f34 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Mon, 16 Dec 2024 16:33:44 -0800
Subject: [PATCH] Switch accession links to UCSC file

Adds a rule to fetch GISAID and GenBank accession links from UCSC and
uses the output in the `transform_genbank_data` and
`transform_gisaid_data` rules.

The UCSC file is modified to keep the headers and format exactly the
same as the current `source-data/accessions.tsv.gz` file so that it can
be directly replaced.
---
 bin/fetch-accession-links           |  9 +++++++++
 workflow/snakemake_rules/curate.smk | 21 +++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100755 bin/fetch-accession-links

diff --git a/bin/fetch-accession-links b/bin/fetch-accession-links
new file mode 100755
index 00000000..7378ed4d
--- /dev/null
+++ b/bin/fetch-accession-links
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -euo pipefail
+
+curl "https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest" \
+    --fail --silent --show-error \
+    --header 'User-Agent: https://github.com/nextstrain/ncov-ingest (hello@nextstrain.org)' \
+    | csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date \
+    | csvtk -t cut --fields genbank_accession,gisaid_epi_isl \
+    | gzip -c
diff --git a/workflow/snakemake_rules/curate.smk b/workflow/snakemake_rules/curate.smk
index 75b65f48..6fee03d2 100644
--- a/workflow/snakemake_rules/curate.smk
+++ b/workflow/snakemake_rules/curate.smk
@@ -23,6 +23,19 @@ Produces different output files for GISAID vs GenBank:
 """
 
 
+rule fetch_accession_links:
+    """
+    Fetch the accession links between GISAID and GenBank
+    """
+    output:
+        accessions="data/accessions.tsv.gz",
+    retries: 5
+    shell:
+        """
+        ./bin/fetch-accession-links > {output.accessions:q}
+        """
+
+
 rule transform_rki_data:
     input:
         ndjson="data/rki.ndjson",
@@ -60,7 +73,8 @@ rule transform_genbank_data:
         biosample = "data/genbank/biosample.tsv",
         ndjson = "data/genbank.ndjson",
         cog_uk_accessions = "data/cog_uk_accessions.tsv",
-        cog_uk_metadata = "data/cog_uk_metadata.csv.gz"
+        cog_uk_metadata = "data/cog_uk_metadata.csv.gz",
+        accessions = "data/accessions.tsv.gz",
     output:
         fasta = "data/genbank_sequences.fasta",
         metadata = "data/genbank_metadata_transformed.tsv",
@@ -75,6 +89,7 @@ rule transform_genbank_data:
             --duplicate-biosample {output.duplicate_biosample} \
             --cog-uk-accessions {input.cog_uk_accessions} \
             --cog-uk-metadata {input.cog_uk_metadata} \
+            --accessions {input.accessions} \
             --output-metadata {output.metadata} \
             --output-fasta {output.fasta} > {output.flagged_annotations}
         """
@@ -105,7 +120,8 @@ rule merge_open_data:
 
 rule transform_gisaid_data:
     input:
-        ndjson = "data/gisaid.ndjson"
+        ndjson = "data/gisaid.ndjson",
+        accessions = "data/accessions.tsv.gz",
     output:
         fasta = "data/gisaid/sequences.fasta",
         metadata = "data/gisaid/metadata_transformed.tsv",
@@ -116,6 +132,7 @@ rule transform_gisaid_data:
     shell:
         """
         ./bin/transform-gisaid {input.ndjson} \
+            --accessions {input.accessions} \
             --output-metadata {output.metadata} \
             --output-fasta {output.fasta} \
             --output-additional-info {output.additional_info} \