From 37970e33ffbabf1e54e9d97092b1a9851023af14 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 31 Jan 2024 10:22:29 -0800 Subject: [PATCH 1/2] Remove RDB-level related rules/files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prompted by https://github.com/nextstrain/ncov/pull/1094, which led to internal conversation¹ that determined we should just drop the RBD-levels. ¹ https://bedfordlab.slack.com/archives/CSKMU6YUC/p1706643360558109 --- defaults/parameters.yaml | 1 - defaults/rbd_levels.yaml | 19 ---- scripts/assign_rbd_levels.py | 95 ------------------- .../snakemake_rules/export_for_nextstrain.smk | 6 -- workflow/snakemake_rules/main_workflow.smk | 27 ------ 5 files changed, 148 deletions(-) delete mode 100644 defaults/rbd_levels.yaml delete mode 100644 scripts/assign_rbd_levels.py diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 6a6df44a5..9359b43a6 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -70,7 +70,6 @@ files: emerging_lineages: "defaults/emerging_lineages.tsv" mutational_fitness_distance_map: "defaults/mutational_fitness_distance_map.json" sites_to_mask: "defaults/sites_ignored_for_tree_topology.txt" - rbd_level_definitions: "defaults/rbd_levels.yaml" # Define genes to translate during alignment by nextalign. genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] diff --git a/defaults/rbd_levels.yaml b/defaults/rbd_levels.yaml deleted file mode 100644 index 35840b4e6..000000000 --- a/defaults/rbd_levels.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -## Definitions of the RBD mutations (in Spike) which define a genomes "RBD Level" -## See https://github.com/neherlab/SARS-CoV-2_variant-reports/blob/main/reports/variant_report_latest_draft.md - -rbd_mutations: - # each entry in this list defines a RBD position. Each entry is a list of length 3 where - # [0] is the basal codon, [1] is the 1-based spike aa position, [2] is a list of codons which result in a level elevation - - ["R", 346, ["T", "S"]] - - ["K", 356, ["T"]] - - ["K", 444, ["T", "R", "N", "M"]] - - ["V", 445, ["A", "P"]] - - ["G", 446, ["S", "D"]] - - ["N", 450, ["D"]] - - ["L", 452, ["R", "Q", "M"]] - - ["N", 460, ["K"]] - - ["F", 486, ["S", "V", "L", "P"]] - - ["F", 490, ["S", "V", "L", "P"]] - - ["R", 493, ["Q"]] # note wuhan was Q, so a reversion here elevates the level - - ["S", 494, ["P"]] diff --git a/scripts/assign_rbd_levels.py b/scripts/assign_rbd_levels.py deleted file mode 100644 index ec14ec156..000000000 --- a/scripts/assign_rbd_levels.py +++ /dev/null @@ -1,95 +0,0 @@ -from Bio import SeqIO, Phylo -import json -import yaml -import sys -import argparse - -def find_matching_nodes(clades_fname, basal_clade_label, tree_fname): - basal_node_name = None - with open(clades_fname) as fh: - for name, node_data in json.load(fh)['branches'].items(): - if node_data.get('labels', {}).get('clade', '') == basal_clade_label: - basal_node_name = name - break - if not basal_node_name: - print(f"WARNING: no branch found with a clade of {basal_clade_label}. This script will proceed, but no levels will be exported.") - return set() - print(f"Node representing {basal_clade_label}: {basal_node_name}") - T = Phylo.read(tree_fname, 'newick') - basal_node = T.find_any({"name": basal_node_name}) - if not basal_node: - print(f"ERROR: {basal_node_name} not found in provided tree") # this should be fatal as it indicates a mismatch of provided inputs - sys.exit(2) - names = set([basal_node_name]) # include parent (the basal_clade_label defining branch) - for n in basal_node.find_clades(): - names.add(n.name) - - return names - -def classify_into_levels(spike_seq, rbd_mutations): - level_num = 0 - codons = [] - calls = [] - for idx in range(0, len(rbd_mutations)): - (basal_codon, position, alts) = rbd_mutations[idx] - aa = spike_seq[position-1] # humans use 1-based positions, python uses 0-based - codons.append(aa) - if aa==basal_codon: - calls.append('basal') - elif aa in alts: - calls.append('alts') - level_num +=1 - elif aa=='X' or aa=='-': - calls.append('X-') - else: - calls.append('other') - return (level_num, codons, calls) - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Assign (omicron) levels to strains", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--config', type=str, metavar="YAML", required=True, help="config defining the RBD mutations") - parser.add_argument('--spike-alignment', type=str, metavar="FASTA", required=True, help="input spike alignment (usually from nextclade)") - parser.add_argument('--output-node-data', type=str, metavar="JSON", required=True, help="output node-data JSON") - parser.add_argument('--clades-node-data', type=str, metavar="JSON", required=False, help="Clade definitions JSON. Only descendants of the basal-clade-label will be labelled.") - parser.add_argument('--basal-clade-label', type=str, metavar="STRING", required=False, help="Used in conjunction with --clades-node-data and --tree") - parser.add_argument('--tree', type=str, metavar="NEWICK", required=False, help="Used in conjunction with --clades-node-data and --basal-clade-label") - args = parser.parse_args() - - if args.clades_node_data and args.basal_clade_label and args.tree: - selected_nodes = find_matching_nodes(args.clades_node_data, args.basal_clade_label, args.tree) - use_node = lambda name: name in selected_nodes - elif args.clades_node_data or args.tree or args.basal_clade_label: - print("ERROR: if you provide --clades-node-data you must provide --tree, and vice-versa") - sys.exit(2) - else: - use_node = lambda name: True - - with open(args.config, "r") as stream: - config = yaml.safe_load(stream) - try: - rbd_mutations=config['rbd_mutations'] - except KeyError: - print("ERROR: the provided yaml config did not define a `rbd_mutations` list") - sys.exit(2) - - spike_aln = {} - with open(args.spike_alignment) as handle: - for record in SeqIO.parse(handle, "fasta"): - if use_node(record.id): - spike_aln[record.id] = record.seq - - node_data = { - "nodes": {}, # encode the levels for augur export - "rbd_level_details": {} # for more info, as needed - } - - for name, seq in spike_aln.items(): - (level_num, codons, calls) = classify_into_levels(seq, rbd_mutations) - node_data['nodes'][name] = {'rbd_level': level_num} - node_data['rbd_level_details'][name] = ", ".join([f"S:{x[0][1]}{x[1]} ({x[2]})" for x in zip(rbd_mutations, codons, calls)]) - - with open(args.output_node_data, 'w') as fh: - json.dump(node_data, fh, indent=2) diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index a90f0a31d..56046d7af 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -202,11 +202,6 @@ rule auspice_config: "title": "S1 Mutations", "type": "continuous" }, - { - "key": "rbd_level", - "title": "RBD Level", - "type": "ordinal" - }, { "key": "immune_escape", "title": "Immune Escape vs BA.2", @@ -311,7 +306,6 @@ rule auspice_config: originating_lab_filter, submitting_lab_filter, "recency", - "rbd_level", ], "panels": [ "tree", diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 6ce6282cc..82922157e 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -1297,32 +1297,6 @@ rule find_clusters: --output {output.clusters} """ -rule assign_rbd_levels: - input: - spike_alignment="results/{build_name}/translations/aligned.gene.S_withInternalNodes.fasta", - clades="results/{build_name}/clades.json", - tree = "results/{build_name}/tree.nwk", - params: - config=config["files"]["rbd_level_definitions"], - basal_clade_label="21L (BA.2)" - output: - node_data="results/{build_name}/rbd_levels.json", - log: - "logs/assign_rbd_levels_{build_name}.txt" - benchmark: - "benchmarks/assign_rbd_levels_{build_name}.txt", - conda: - config["conda_environment"], - shell: - """ - python3 scripts/assign_rbd_levels.py \ - --spike-alignment {input.spike_alignment} \ - --config {params.config} \ - --clades-node-data {input.clades} \ - --basal-clade-label {params.basal_clade_label:q} \ - --tree {input.tree} \ - --output-node-data {output.node_data} 2>&1 | tee {log} - """ def export_title(wildcards): # TODO: maybe we could replace this with a config entry for full/human-readable build name? @@ -1360,7 +1334,6 @@ def _get_node_data_by_wildcards(wildcards): rules.mutational_fitness.output.node_data, rules.distances.output.node_data, rules.calculate_epiweeks.output.node_data, - rules.assign_rbd_levels.output.node_data, ] if "run_pangolin" in config and config["run_pangolin"]: From d3597840d7646d198e9cd19f3d3b274507921b55 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 31 Jan 2024 10:34:42 -0800 Subject: [PATCH 2/2] Update changelog --- docs/src/reference/change_log.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 0fba7c670..08c030d3c 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +- 31 January 2024: Remove RBD-level related rules and files since this feature has been broken since May 2023 and is no longer relevant. [PR 1097](https://github.com/nextstrain/ncov/pull/1097) + - 30 January 2024: Fix RBD-level coloring by updating clade label and clade parsing. [PR 1094](https://github.com/nextstrain/ncov/pull/1094) - 14 Dec 2023: Use `nextclade2` binary that makes the version explicit [PR 1089](https://github.com/nextstrain/ncov/pull/1089)