From fd01604bf85a2eddf98b889c23742f23a2376185 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 17 Dec 2024 15:37:44 -0800 Subject: [PATCH] Concatenate accession link files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concatenates the downloaded accession links file onto the end of the existing `source-data/accessions.tsv.gz`, since the downloaded file seems to be missing some of the existing links.¹ The transform scripts apply the accession links in order, so the last matching accession link is used in the final metadata. Because the downloaded file comes last, its links, which are the latest data, take precedence over the existing ones whenever both files contain a match. ¹ --- bin/fetch-accession-links | 3 +-- workflow/snakemake_rules/curate.smk | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/bin/fetch-accession-links b/bin/fetch-accession-links index 7378ed4d..42aa6e6b 100755 --- a/bin/fetch-accession-links +++ b/bin/fetch-accession-links @@ -5,5 +5,4 @@ curl "https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest" \ --fail --silent --show-error \ --header 'User-Agent: https://github.com/nextstrain/ncov-ingest (hello@nextstrain.org)' \ | csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date \ - | csvtk -t cut --fields genbank_accession,gisaid_epi_isl \ - | gzip -c + | csvtk -t cut --fields genbank_accession,gisaid_epi_isl diff --git a/workflow/snakemake_rules/curate.smk b/workflow/snakemake_rules/curate.smk index 6fee03d2..b6052de2 100644 --- a/workflow/snakemake_rules/curate.smk +++ b/workflow/snakemake_rules/curate.smk @@ -28,7 +28,7 @@ rule fetch_accession_links: Fetch the accession links between GISAID and GenBank """ output: - accessions="data/accessions.tsv.gz", + accessions=temp("data/accessions.tsv"), retries: 5 shell: """ @@ -36,6 +36,21 @@ rule fetch_accession_links: """ +rule concat_accession_links: + input: + source_data="source-data/accessions.tsv.gz", + accessions="data/accessions.tsv", + output: +
all_accessions="data/all_accessions.tsv.gz" + shell: + """ + gunzip -kcfq {input.source_data} \ + | csvtk concat -t - {input.accessions} \ + | csvtk uniq -t -f genbank_accession,gisaid_epi_isl \ + | gzip -c > {output.all_accessions} + """ + + rule transform_rki_data: input: ndjson="data/rki.ndjson", @@ -74,7 +89,7 @@ rule transform_genbank_data: ndjson = "data/genbank.ndjson", cog_uk_accessions = "data/cog_uk_accessions.tsv", cog_uk_metadata = "data/cog_uk_metadata.csv.gz", - accessions = "data/accessions.tsv.gz", + accessions = "data/all_accessions.tsv.gz", output: fasta = "data/genbank_sequences.fasta", metadata = "data/genbank_metadata_transformed.tsv", @@ -121,7 +136,7 @@ rule merge_open_data: rule transform_gisaid_data: input: ndjson = "data/gisaid.ndjson", - accessions = "data/accessions.tsv.gz", + accessions = "data/all_accessions.tsv.gz", output: fasta = "data/gisaid/sequences.fasta", metadata = "data/gisaid/metadata_transformed.tsv",