Skip to content

Commit

Permalink
Switch accession links to UCSC file
Browse files Browse the repository at this point in the history
Adds a rule to fetch GISAID and GenBank accession links from UCSC
and uses the output in the `transform_genbank_data` and
`transform_gisaid_data` rules.

The UCSC file is reformatted so that its headers and format are exactly
the same as the current `source-data/accessions.tsv.gz` file, allowing it
to be used as a drop-in replacement for that file.
  • Loading branch information
joverlee521 committed Dec 17, 2024
1 parent 4d06220 commit a4089e7
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
9 changes: 9 additions & 0 deletions bin/fetch-accession-links
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Download the GISAID <-> GenBank accession links published by UCSC and
# write them to stdout as a gzip-compressed, tab-separated table that keeps
# only the genbank_accession and gisaid_epi_isl columns.
#
# Usage: ./bin/fetch-accession-links > accessions.tsv.gz
set -euo pipefail

source_url="https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest"

# NOTE(review): the contact address inside the parentheses looks like a
# redaction placeholder rather than a real address; confirm before relying
# on it.
user_agent_header='User-Agent: https://github.com/nextstrain/ncov-ingest ([email protected])'

# Pipeline stages:
#   1. curl       fetch the file; --fail makes HTTP errors exit non-zero,
#                 --silent/--show-error hide progress but still print errors.
#   2. add-header name the four columns of the tab-delimited (-t) download.
#   3. cut        keep only the two accession columns.
#   4. gzip       compress the result onto stdout.
curl --fail --silent --show-error \
    --header "$user_agent_header" \
    "$source_url" |
    csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date |
    csvtk -t cut --fields genbank_accession,gisaid_epi_isl |
    gzip -c
21 changes: 19 additions & 2 deletions workflow/snakemake_rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ Produces different output files for GISAID vs GenBank:
"""


rule fetch_accession_links:
    """
    Download the GISAID <-> GenBank accession links by running
    ./bin/fetch-accession-links and saving its stdout as a gzipped TSV.
    """
    output:
        accessions="data/accessions.tsv.gz",
    # Retry the job up to 5 times if the fetch fails.
    retries: 5
    shell:
        "./bin/fetch-accession-links > {output.accessions:q}"


rule transform_rki_data:
input:
ndjson="data/rki.ndjson",
Expand Down Expand Up @@ -60,7 +73,8 @@ rule transform_genbank_data:
biosample = "data/genbank/biosample.tsv",
ndjson = "data/genbank.ndjson",
cog_uk_accessions = "data/cog_uk_accessions.tsv",
cog_uk_metadata = "data/cog_uk_metadata.csv.gz"
cog_uk_metadata = "data/cog_uk_metadata.csv.gz",
accessions = "data/accessions.tsv.gz",
output:
fasta = "data/genbank_sequences.fasta",
metadata = "data/genbank_metadata_transformed.tsv",
Expand All @@ -75,6 +89,7 @@ rule transform_genbank_data:
--duplicate-biosample {output.duplicate_biosample} \
--cog-uk-accessions {input.cog_uk_accessions} \
--cog-uk-metadata {input.cog_uk_metadata} \
--accessions {input.accessions} \
--output-metadata {output.metadata} \
--output-fasta {output.fasta} > {output.flagged_annotations}
"""
Expand Down Expand Up @@ -105,7 +120,8 @@ rule merge_open_data:

rule transform_gisaid_data:
input:
ndjson = "data/gisaid.ndjson"
ndjson = "data/gisaid.ndjson",
accessions = "data/accessions.tsv.gz",
output:
fasta = "data/gisaid/sequences.fasta",
metadata = "data/gisaid/metadata_transformed.tsv",
Expand All @@ -116,6 +132,7 @@ rule transform_gisaid_data:
shell:
"""
./bin/transform-gisaid {input.ndjson} \
--accessions {input.accessions} \
--output-metadata {output.metadata} \
--output-fasta {output.fasta} \
--output-additional-info {output.additional_info} \
Expand Down

0 comments on commit a4089e7

Please sign in to comment.