Skip to content

Commit

Permalink
Keep reference strains in tree even if they are outliers
Browse files Browse the repository at this point in the history
Adds an argument to the script for flagging outliers that accepts a file
with a list of strains to keep in the output tree even if they are
outliers and updates the core workflow logic to use this argument
whenever builds define a list of strains to force-include. This change
fixes a bug when older titer reference strains (e.g.,
A/Wisconsin/588/2019 for H1N1pdm) get flagged as outliers in the tree
and dropping them causes downstream titer analyses to fail. Our choice
to force-include these strains in the build should override the
automated outlier detection.
  • Loading branch information
huddlej committed Aug 24, 2023
1 parent ce06291 commit 3386bf6
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
8 changes: 7 additions & 1 deletion scripts/flag_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def prepare_tree(T):
parser.add_argument('--reroot', action="store_true", help="reroot the tree")
parser.add_argument('--optimize', action="store_true", help="optimize sigma and mu")
parser.add_argument('--dates', type=str, help='csv/tsv file with dates for each sequence')
parser.add_argument('--keep-strains', type=str, help='a list of strains to keep in the output tree regardless of outlier status (i.e., reference strains that need to be retained in the build)')
parser.add_argument('--output-outliers', type=str, help='file for outliers')
parser.add_argument('--output-tree', type=str, help='file for pruned tree')

Expand Down Expand Up @@ -153,9 +154,14 @@ def prepare_tree(T):
df.to_csv(args.output_outliers, index=False, sep='\t')

if args.output_tree:
keep_strains = set()
if args.keep_strains:
with open(args.keep_strains, "r", encoding="utf-8") as fh:
keep_strains = {line.strip() for line in fh}

from Bio import Phylo
T = tt.tree
for r, row in df.iterrows():
if row['diagnosis']!='bad_date':
if row['diagnosis']!='bad_date' and row["sequence"] not in keep_strains:
T.prune(row['sequence'])
Phylo.write(T, args.output_tree, 'newick')
3 changes: 3 additions & 0 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,16 @@ rule prune_outliers:
output:
tree = build_dir + "/{build_name}/{segment}/tree_without_outgroup_clean.nwk",
outliers = build_dir + "/{build_name}/{segment}/outliers.tsv"
params:
keep_strains_argument=lambda wildcards: "--keep-strains " + config["builds"][wildcards.build_name]["include"] if "include" in config["builds"][wildcards.build_name] else "",
shell:
"""
python3 scripts/flag_outliers.py \
--tree {input.tree:q} \
--aln {input.aln} \
--dates {input.metadata} \
--cutoff 4.0 \
{params.keep_strains_argument} \
--output-tree {output.tree:q} --output-outliers {output.outliers} 2>&1 | tee {log}
"""

Expand Down

0 comments on commit 3386bf6

Please sign in to comment.