diff --git a/scripts/translations_aamuts.py b/scripts/translations_aamuts.py deleted file mode 100644 index 01991789..00000000 --- a/scripts/translations_aamuts.py +++ /dev/null @@ -1,94 +0,0 @@ -import argparse -import json -from Bio import Phylo, SeqIO, Seq -from Bio.Align import MultipleSeqAlignment -from treetime import TreeAnc - - -def read_gff(fname): - try: - from BCBio import GFF #Package name is confusing - tell user exactly what they need! - except ImportError: - print("ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running.") - return None - - features = {} - with open(fname, encoding='utf-8') as in_handle: - for rec in GFF.parse(in_handle): - for feat in rec.features: - if "gene_name" in feat.qualifiers: - fname = feat.qualifiers["gene_name"][0] - if fname: - features[fname] = feat - - return features - -def annotation_json(features, reference): - annotations = {} - for fname, feat in features.items(): - annotations[fname] = {'seqid':reference.id, - 'type':feat.type, - 'start':int(feat.location.start)+1, - 'end':int(feat.location.end), - 'strand': '+' if feat.location.strand else '-'} - annotations['nuc'] = {'seqid':reference.id, - 'type':'source', - 'start': 1, - 'end': len(reference), - 'strand': '+'} - return annotations - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Add translations", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--tree', type=str, required=True, help="input tree") - parser.add_argument('--annotation', type=str, required=True, help="gff annotation file") - parser.add_argument('--reference', type=str, required=True, help="reference fasta sequence") - parser.add_argument('--translations', type=str, nargs='+', required=True, help="amino acid alignment") - parser.add_argument('--genes', type=str, nargs='+', required=True, help="amino acid alignment") - parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") - args = parser.parse_args() - - genes = args.genes if type(args.genes)==list else [args.genes] - translations = args.translations if type(args.translations)==list else [args.translations] - features = read_gff(args.annotation) - ref = SeqIO.read(args.reference, format='fasta') - - if not set(features.keys())==set(args.genes): - print("ERROR: supplied genes don't match the annotation") - exit(1) - - T = Phylo.read(args.tree, 'newick') - leafs = {n.name for n in T.get_terminals()} - - node_data = {} - for gene, translation in zip(genes, translations): - seqs = [] - for s in SeqIO.parse(translation, 'fasta'): - if s.id in leafs: - if str(s.seq).strip('-'): - seqs.append(s) - else: - print("WARNING: sequence {} is all gaps".format(s.id)) - s.seq = Seq.Seq(len(s) * 'N') - seqs.append(s) - - - tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa') - - tt.infer_ancestral_sequences(reconstruct_tip_states=True) - - with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh: - for n in tt.tree.find_clades(): - if n.name not in node_data: - node_data[n.name] = {"aa_muts":{}} - if len(n.mutations): - node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations] - fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n") - - annotations = annotation_json(features, ref) - with open(args.output, 'w') as fh: - json.dump({"nodes":node_data, "annotations":annotations}, fh) \ No newline at end of file diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index e4d58dd9..a6ab3f33 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -266,12 +266,19 @@ rule refine: rule ancestral: message: "Reconstructing ancestral sequences and mutations" input: - tree = rules.refine.output.tree, - alignment = rules.align.output.alignment, + tree = build_dir + "/{build_name}/{segment}/tree.nwk", + alignment = build_dir + "/{build_name}/{segment}/aligned.fasta", + translations = aggregate_translations, + reference = lambda w: f"{config['builds'][w.build_name]['reference']}", + annotation = lambda w: f"{config['builds'][w.build_name]['annotation']}", output: - node_data = build_dir + "/{build_name}/{segment}/nt-muts.json" + node_data = build_dir + "/{build_name}/{segment}/muts.json", + translations_done = build_dir + "/{build_name}/{segment}/translations.done", params: - inference = "joint" + inference = "joint", + genes = lambda w: GENES[w.segment], + input_translations = lambda w: build_dir + f"/{w.build_name}/{w.segment}/nextalign/masked.gene.%GENE.fasta", + output_translations = lambda w: build_dir + f"/{w.build_name}/{w.segment}/nextalign/masked.gene.%GENE_withInternalNodes.fasta", conda: "../envs/nextstrain.yaml" benchmark: "benchmarks/ancestral_{build_name}_{segment}.txt" @@ -284,36 +291,13 @@ rule ancestral: augur ancestral \ --tree {input.tree} \ --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} 2>&1 | tee {log} - """ - -rule translate: - message: "Translating amino acid sequences" - input: - translations = aggregate_translations, - tree = rules.refine.output.tree, - reference = lambda w: f"{config['builds'][w.build_name]['reference']}", - annotation = lambda w: f"{config['builds'][w.build_name]['annotation']}", - output: - node_data = build_dir + "/{build_name}/{segment}/aa_muts.json", - translations_done = build_dir + "/{build_name}/{segment}/translations.done" - params: - genes = lambda w: GENES[w.segment] - conda: "../envs/nextstrain.yaml" - benchmark: - "benchmarks/translate_{build_name}_{segment}.txt" - log: - "logs/translate_{build_name}_{segment}.txt" - shell: - """ - python3 scripts/translations_aamuts.py \ - --tree {input.tree} \ + --root-sequence {input.reference} \ --annotation {input.annotation} \ - --reference {input.reference} \ - --translations {input.translations:q} \ --genes {params.genes} \ - --output {output.node_data} 2>&1 | tee {log} && touch {output.translations_done} + --translations "{params.input_translations}" \ + --output-node-data {output.node_data} \ + --output-translations "{params.output_translations}" \ + --inference {params.inference} 2>&1 | tee {log} && touch {output.translations_done} """ rule traits: @@ -347,8 +331,7 @@ rule traits: rule clades: input: tree = build_dir + "/{build_name}/ha/tree.nwk", - nt_muts = build_dir + "/{build_name}/ha/nt-muts.json", - aa_muts = build_dir + "/{build_name}/ha/aa_muts.json", + muts = build_dir + "/{build_name}/ha/muts.json", clades = lambda wildcards: config["builds"][wildcards.build_name]["clades"], output: node_data = build_dir + "/{build_name}/ha/clades.json", @@ -361,7 +344,7 @@ rule clades: """ augur clades \ --tree {input.tree} \ - --mutations {input.nt_muts} {input.aa_muts} \ + --mutations {input.muts} \ --clades {input.clades} \ --output {output.node_data} 2>&1 | tee {log} """ @@ -370,8 +353,7 @@ rule clades: rule subclades: input: tree = build_dir + "/{build_name}/{segment}/tree.nwk", - nt_muts = build_dir + "/{build_name}/{segment}/nt-muts.json", - aa_muts = build_dir + "/{build_name}/{segment}/aa_muts.json", + muts = build_dir + "/{build_name}/{segment}/muts.json", clades = lambda wildcards: config["builds"][wildcards.build_name]["subclades"], output: node_data = build_dir + "/{build_name}/{segment}/subclades.json", @@ -387,7 +369,7 @@ rule subclades: """ augur clades \ --tree {input.tree} \ - --mutations {input.nt_muts} {input.aa_muts} \ + --mutations {input.muts} \ --clades {input.clades} \ --membership-name {params.membership_name} \ --label-name {params.label_name} \ @@ -398,8 +380,7 @@ rule subclades: rule import_clades: input: tree = build_dir + "/{build_name}/ha/tree.nwk", - nt_muts = build_dir + "/{build_name}/{segment}/nt-muts.json", - aa_muts = build_dir + "/{build_name}/{segment}/aa_muts.json", + muts = build_dir + "/{build_name}/{segment}/muts.json", clades = build_dir + "/{build_name}/ha/clades.json", output: node_data = build_dir + "/{build_name}/{segment}/clades.json", diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 2115287a..3898f067 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -8,7 +8,6 @@ def _get_node_data_by_wildcards(wildcards): inputs = [ rules.refine.output.node_data, rules.ancestral.output.node_data, - rules.translate.output.node_data, rules.clades.output.node_data, rules.traits.output.node_data, rules.annotate_epiweeks.output.node_data,