Skip to content

Commit

Permalink
Merge pull request #350 from nickjcroucher/time_tree
Browse files Browse the repository at this point in the history
Merge in capacity to generate time-calibrated trees
  • Loading branch information
nickjcroucher authored Oct 6, 2022
2 parents 4581de1 + 079d890 commit b83a607
Show file tree
Hide file tree
Showing 17 changed files with 732 additions and 184 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Change Log

## [v3.3]

- Enable time calibration of final tree
- Use separate models for tree construction and SNP reconstruction
- Enable use of iqtree in fast mode
- Allow model testing to identify the best-fitting model for tree construction
- Fix processing of custom phylogenetic models

## [v3.2.1](https://github.com/sanger-pathogens/gubbins/tree/v3.2.1) (2022-5-24)
[Full Changelog](https://github.com/sanger-pathogens/gubbins/compare/v3.1.6...v3.2.1)

Expand Down
105 changes: 70 additions & 35 deletions docs/gubbins_manual.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dependencies:
- numba
# phylogenetics
- raxml=8.2.12
- iqtree=2.0.3
- iqtree>=2.2
- rapidnj
- raxml-ng=1.0.1
- fasttree=2.1.10
Expand Down
9 changes: 9 additions & 0 deletions python/gubbins/PreProcessFasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,12 @@ def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_
AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")

return taxa_to_remove

def get_sequence_names(self):
sequence_names = []
with open(self.input_filename) as input_handle:
alignments = AlignIO.parse(input_handle, "fasta")
for alignment in alignments:
for record in alignment:
sequence_names.append(record.id)
return sequence_names
393 changes: 280 additions & 113 deletions python/gubbins/common.py

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions python/gubbins/pyjar.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@
# Python-native functions #
###########################

class Pyjar:
def __init__(self, model: str):
"""Initialises the object"""
self.citation = "https://doi.org/10.1093/oxfordjournals.molbev.a026369"
self.process = "Sequence reconstructor"
self.version = "1.0"
self.algorithm = "pyjar"
self.executable = "pyjar"
self.model = model

# Split a list into chunks for multiprocessing
# from https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length/37414115#37414115
def chunks(l, k):
Expand Down
51 changes: 30 additions & 21 deletions python/gubbins/run_gubbins.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def parse_input_args():
ioGroup.add_argument('alignment_filename', help='Multifasta alignment file')
ioGroup.add_argument('--prefix', '-p', help='Add a prefix to the final output filenames')
ioGroup.add_argument('--starting-tree', '-s', help='Starting tree')
ioGroup.add_argument('--date', '-D', help='Two-column text file in which the second column is the'
' date of isolation in YYYY,YYYY-MM or YYYY-MM-DD format',
default = None)
ioGroup.add_argument('--use-time-stamp', '-u', help='Use a time stamp in file names', action='store_true')
ioGroup.add_argument('--version', action='version',
version = version())
Expand All @@ -55,17 +58,18 @@ def parse_input_args():
treeGroup = parser.add_argument_group('Tree building options')
treeGroup.add_argument('--tree-builder', '-t', help='Application to use for tree building',
default='raxml',
choices=['raxml', 'raxmlng', 'iqtree', 'fasttree', 'hybrid', 'rapidnj'])
choices=['raxml', 'raxmlng', 'iqtree', 'iqtree-fast', 'fasttree', 'hybrid', 'rapidnj'])
treeGroup.add_argument('--tree-args', help='Quoted string of further arguments passed to tree building algorithm'
' (start string with a space if there is a risk of being interpreted as a flag)',
default = None)
treeGroup.add_argument('--first-tree-builder', help='Application to use for building the first tree',
default=None,
choices=['raxml', 'raxmlng', 'iqtree', 'fasttree', 'rapidnj', 'star'])
choices=['raxml', 'raxmlng', 'iqtree', 'iqtree-fast', 'fasttree', 'rapidnj', 'star'])
treeGroup.add_argument('--first-tree-args', help='Further arguments passed to first tree building algorithm',
default = None)
treeGroup.add_argument('--outgroup', '-o', help='Outgroup name for rerooting. A list of comma separated '
'names can be used if they form a clade')
'names can be used if they form a clade',
default = None)
treeGroup.add_argument('--bootstrap', '-#', help='Number of bootstrap replicates to perform with final alignment',
type = int, default = 0)
treeGroup.add_argument('--transfer-bootstrap', help='Calculate bootstrap supporting transfer bootstrap expectation',
Expand All @@ -74,38 +78,43 @@ def parse_input_args():
action = 'store_true')

modelGroup = parser.add_argument_group('Nucleotide substitution model options')
modelGroup.add_argument('--model-fitter', '-F', help='Application to use for model fitting [if unspecified: same as'
' tree builder if possible, else raxml]',
default = None,
choices=['raxml', 'raxmlng', 'iqtree', 'fasttree', None])
modelGroup.add_argument('--model', '-M', help='Nucleotide substitution model (not all available for all '
'tree building algorithms)',
default='GTRGAMMA',
default=None,
choices=['JC','K2P','HKY','GTR','GTRGAMMA','GTRCAT'])
modelGroup.add_argument('--model-args', help='Quoted string of further arguments passed to model fitting algorithm'
' (start string with a space if there is a risk of being interpreted as a flag)',
default=None)
modelGroup.add_argument('--custom-model', help='String corresponding to a substitution model for the selected tree'
' building algorithm', default = None)
modelGroup.add_argument('--first-model-fitter', help='Application to use for model fitting in first iteration'
' [if unspecified: same as tree builder if possible, else raxml]',
default = None,
choices=['raxml', 'raxmlng', 'iqtree', 'fasttree', None])
modelGroup.add_argument('--first-model', help='Nucleotide substitution model used for first tree',
default=None,
choices=['JC','K2P','HKY','GTR','GTRGAMMA','GTRCAT'])
modelGroup.add_argument('--first-model-args', help='Further arguments passed to model fitting algorithm used in first'
'iteration (if unspecified: same as --first-tree-builder-args)',
default=None)
modelGroup.add_argument('--best-model', help='Automatically select best substitution model using iqtree in later iterations',
default = False, action = 'store_true')
modelGroup.add_argument('--custom-model', help='String corresponding to a substitution model for the selected tree'
' building algorithm', default = None)
modelGroup.add_argument('--custom-first-model', help='String corresponding to a substitution model for the selected tree'
' building algorithm for the first iteration',
default = None)

reconGroup = parser.add_argument_group('Ancestral sequence reconstruction options')
reconGroup.add_argument('--model-fitter', '-F', help='Application to use for model fitting for joint ancestral state'
' reconstruction [if unspecified: same as tree builder if possible'
', else iqtree]',
default = None,
choices=['raxml', 'raxmlng', 'iqtree', 'fasttree', None])
reconGroup.add_argument('--recon-model', '-R', help='Nucleotide substitution model used for ancestral state reconstruction'
' (not all available for all tree building algorithms)',
default='GTRGAMMA',
choices=['JC','K2P','HKY','GTR','GTRGAMMA','GTRCAT'])
reconGroup.add_argument('--custom-recon-model', help='String corresponding to a substitution model for the selected '
' model fitting algorithm',
default=None)
reconGroup.add_argument('--recon-with-dates', help='Use isolate date information in ancestral joint sequence'
' reconstruction',
default=False, action='store_true')
reconGroup.add_argument('--model-fitter-args', help='Further arguments passed to model fitting algorithm',
default=None)
reconGroup.add_argument('--mar', help='Use marginal, rather than joint, ancestral reconstruction',
action='store_true')
reconGroup.add_argument('--seq-recon', help='Algorithm to use for marginal reconstruction [if unspecified: '
'same as tree builder if possible, else raxml; requires --mar flag]',
'same as tree builder if possible, else iqtree; requires --mar flag]',
default=None,
choices=['raxml', 'raxmlng', 'iqtree', None])
reconGroup.add_argument('--seq-recon-args', help='Further arguments passed to sequence reconstruction algorithm'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
>sequence_1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA----------
------------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
>sequence_5
AAAAAAAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACACCCCCCCC
CCCCCCCACCCACCCCCCCCCCACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
>sequence_6
AAAAAAAAAAAAAAAGAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAA----------
------------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
>sequence_7
AAAAAAAAAAAACAAGAAAAAAAGAAAC---------------------A----------
------------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
>sequence_8
AAAAAAAAAAAAAAAGAAAAAAAGAAAC---------------------A----------
------------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
>sequence_9
---------------GAAAAAAAGAAAA---------------------A----------
------------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AA
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##gff-version 3
##sequence-region SEQUENCE 1 242
SEQUENCE GUBBINS CDS 29 49 0.000 . 0 node="Node_4->Node_3";neg_log_likelihood="4.955311";taxa=" sequence_7 sequence_9 sequence_8";snp_count="21";
SEQUENCE GUBBINS CDS 51 84 0.000 . 0 node="Node_5->Node_4";neg_log_likelihood="10.195830";taxa=" sequence_6 sequence_7 sequence_9 sequence_8";snp_count="30";
SEQUENCE GUBBINS CDS 51 84 0.000 . 0 node="Node_6->sequence_1";neg_log_likelihood="8.046578";taxa="sequence_1";snp_count="30";
SEQUENCE GUBBINS CDS 51 84 0.000 . 0 node="Node_9->Node_8";neg_log_likelihood="10.195830";taxa=" sequence_3 sequence_4 sequence_2 sequence_5 sequence_6 sequence_7 sequence_9 sequence_8 sequence_1";snp_count="30";
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(('sequence_5':0.000213,('sequence_6':0.000212,(('sequence_7':1.622202,'sequence_9':1.647725)'Node_2':0.000302,'sequence_8':0.000264)'Node_3':1.622508)'Node_4':1.22782)'Node_5':0.911834,'sequence_1':0.000214)'Node_6':28097902592.912083;
10 changes: 10 additions & 0 deletions python/gubbins/tests/data/taxon.times
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
sequence_1 2020-01-15
sequence_2 2020-02-15
sequence_3 2020-03-15
sequence_4 2020-04-15
sequence_5 2020-05-15
sequence_6 2020-06-15
sequence_7 2020-07-15
sequence_8 2020-08-15
sequence_9 2020-09-15
sequence_10 2020
Loading

0 comments on commit b83a607

Please sign in to comment.