From ccc9fa73102f0a70d05a4cdf52955ede753d0979 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jul 2024 16:27:10 -0700 Subject: [PATCH] Create version JSONs and upload to S3 Creates one version JSON for each Nextclade TSV and one version JSON for the metadata TSV. Since the metadata just uses the Nextclade TSV columns directly, just add the `metadata_tsv_sha256sum` to the SARS-CoV-2 dataset version JSON. If we ever want to track data provenance by column, we will update the schema to include the 21L dataset version. The two Nextclade version JSONs will be used to check whether the workflow should use the existing cache. The metadata version JSON will be used to surface the version info to downstream users of the data. --- bin/generate-nextclade-version-json | 30 ++++++++++++++++++ workflow/snakemake_rules/nextclade.smk | 44 ++++++++++++++++++++++++++ workflow/snakemake_rules/upload.smk | 3 ++ 3 files changed, 77 insertions(+) create mode 100755 bin/generate-nextclade-version-json diff --git a/bin/generate-nextclade-version-json b/bin/generate-nextclade-version-json new file mode 100755 index 00000000..899aee83 --- /dev/null +++ b/bin/generate-nextclade-version-json @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +vendored="$(dirname "$0")"/../vendored + + +nextclade="${1:?A path to the Nextclade executable is required as the first argument}" +nextclade_dataset="${2:?A path to the Nextclade dataset is required as the second argument}" +nextclade_tsv="${3:?A path to the Nextclade TSV is required as the third argument}" + + +nextclade_version="$("$nextclade" --version)" +dataset_pathogen_json="$(unzip -p "$nextclade_dataset" pathogen.json)" +dataset_name="$(echo "$dataset_pathogen_json" | jq -r '.attributes.name')" +dataset_version="$(echo "$dataset_pathogen_json" | jq -r '.version.tag')" +nextclade_tsv_sha256sum="$("$vendored/sha256sum" < "$nextclade_tsv")" + +jq -c --null-input \ + --arg NEXTCLADE_VERSION "$nextclade_version" \ + --arg DATASET_NAME 
"$dataset_name" \ + --arg DATASET_VERSION "$dataset_version" \ + --arg NEXTCLADE_TSV_SHA256SUM "$nextclade_tsv_sha256sum" \ + '{ + "schema_version": "v1", + "nextclade_version": $NEXTCLADE_VERSION, + "nextclade_dataset_name": $DATASET_NAME, + "nextclade_dataset_version": $DATASET_VERSION, + "nextclade_tsv_sha256sum": $NEXTCLADE_TSV_SHA256SUM + }' diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index ed3d5c46..1fe637d8 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -316,6 +316,26 @@ rule nextclade_info: """ +rule nextclade_version_json: + """ + Generates a version JSON for the Nextclade TSV. + """ + input: + nextclade_path="data/nextclade", + nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip", + nextclade_tsv=f"data/{database}/nextclade{{reference}}.tsv", + output: + nextclade_version_json=f"data/{database}/nextclade{{reference}}_version.json", + shell: + """ + ./bin/generate-nextclade-version-json \ + {input.nextclade_path} \ + {input.nextclade_dataset} \ + {input.nextclade_tsv} \ + > {output.nextclade_version_json} + """ + + rule combine_alignments: """ Generating full alignment by combining newly aligned sequences with previous (cached) alignment @@ -365,3 +385,27 @@ rule generate_metadata: --clade-legacy-mapping {input.clade_legacy_mapping} \ -o {output.metadata} """ + + +rule metadata_version_json: + """ + Generates the metadata version JSON by adding the metadata TSV sha256sum + to the Nextclade version JSON. 
+ + TODO: Merge the 21L Nextclade version JSON to track data provenance for + specific columns + """ + input: + metadata=f"data/{database}/metadata.tsv", + nextclade_version_json=f"data/{database}/nextclade_version.json", + output: + metadata_version_json=f"data/{database}/metadata_version.json", + shell: + """ + metadata_tsv_sha256sum="$(./vendored/sha256sum < {input.metadata})" + + cat {input.nextclade_version_json} \ + | jq -c --arg METADATA_TSV_SHA256SUM "$metadata_tsv_sha256sum" \ + '.metadata_tsv_sha256sum = $METADATA_TSV_SHA256SUM' \ + > {output.metadata_version_json} + """ diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 2b8a02bb..b6a08985 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -33,6 +33,9 @@ def compute_files_to_upload(): "aligned.fasta.zst": f"data/{database}/aligned.fasta", "nextclade_21L.tsv.zst": f"data/{database}/nextclade_21L.tsv", + "nextclade_version.json": f"data/{database}/nextclade_version.json", + "nextclade_21L_version.json": f"data/{database}/nextclade_21L_version.json", + "metadata_version.json": f"data/{database}/metadata_version.json", } files_to_upload = files_to_upload | { f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta"