Skip to content

Commit

Permalink
Concatenate accession link files
Browse files Browse the repository at this point in the history
Concatenates the downloaded accession links file to the existing
`source-data/accessions.tsv.gz` since the downloaded file seems to be
missing some of the existing links.¹

The transform scripts apply the accession links in order, so the
last matching accession link is used in the final metadata. Because the
downloaded file is concatenated after the existing one, its links (the
latest data) take precedence whenever both files match a record.

¹ <#485 (comment)>
  • Loading branch information
joverlee521 committed Dec 17, 2024
1 parent a4089e7 commit fd01604
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
3 changes: 1 addition & 2 deletions bin/fetch-accession-links
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ curl "https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest" \
--fail --silent --show-error \
--header 'User-Agent: https://github.com/nextstrain/ncov-ingest ([email protected])' \
| csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date \
| csvtk -t cut --fields genbank_accession,gisaid_epi_isl \
| gzip -c
| csvtk -t cut --fields genbank_accession,gisaid_epi_isl
21 changes: 18 additions & 3 deletions workflow/snakemake_rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,29 @@ rule fetch_accession_links:
Fetch the accession links between GISAID and GenBank
"""
output:
accessions="data/accessions.tsv.gz",
accessions=temp("data/accessions.tsv"),
retries: 5
shell:
"""
./bin/fetch-accession-links > {output.accessions:q}
"""


rule concat_accession_links:
    """
    Concatenate the existing source-data accession links with the freshly
    downloaded ones, drop repeated (genbank_accession, gisaid_epi_isl) pairs,
    and gzip the result.

    The source-data file is read first and the downloaded file second, so
    links from the downloaded file appear later in the output; the transform
    scripts use the last matching accession link.
    """
    input:
        source_data="source-data/accessions.tsv.gz",
        accessions="data/accessions.tsv",
    output:
        all_accessions="data/all_accessions.tsv.gz",
    shell:
        # Paths are interpolated with `:q` so they are shell-quoted, matching
        # the fetch_accession_links rule.
        """
        gunzip -kcfq {input.source_data:q} \
            | csvtk concat -t - {input.accessions:q} \
            | csvtk uniq -t -f genbank_accession,gisaid_epi_isl \
            | gzip -c > {output.all_accessions:q}
        """


rule transform_rki_data:
input:
ndjson="data/rki.ndjson",
Expand Down Expand Up @@ -74,7 +89,7 @@ rule transform_genbank_data:
ndjson = "data/genbank.ndjson",
cog_uk_accessions = "data/cog_uk_accessions.tsv",
cog_uk_metadata = "data/cog_uk_metadata.csv.gz",
accessions = "data/accessions.tsv.gz",
accessions = "data/all_accessions.tsv.gz",
output:
fasta = "data/genbank_sequences.fasta",
metadata = "data/genbank_metadata_transformed.tsv",
Expand Down Expand Up @@ -121,7 +136,7 @@ rule merge_open_data:
rule transform_gisaid_data:
input:
ndjson = "data/gisaid.ndjson",
accessions = "data/accessions.tsv.gz",
accessions = "data/all_accessions.tsv.gz",
output:
fasta = "data/gisaid/sequences.fasta",
metadata = "data/gisaid/metadata_transformed.tsv",
Expand Down

0 comments on commit fd01604

Please sign in to comment.