-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patherror_measures.py
162 lines (130 loc) · 5.24 KB
/
error_measures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
__author__ = "Lars Berling"
from Bio import Phylo
import re
import json
from rpy2.robjects.packages import importr, isinstalled
from rpy2.robjects.vectors import StrVector
"""
Defining Error Measures for the comparison of two trees
See the 'Looking for trees in the forest paper'
"""
def r_install_decorator(func):
packnames = ("ape", "phangorn", "phytools", "beautier")
names_to_install = [x for x in packnames if not isinstalled(name=x)]
if any(names_to_install):
utils = importr("utils")
utils.install_packages(StrVector(names_to_install))
return func
def rank_error(tree_reference, tree):
# Calculates the sum of absolute differences between all clade ranks from tree to the tree_reference
# less is better
sum = 0
for i, clade in enumerate(tree_reference):
# i=height-1
if not clade == tree[i]:
# if clades at same age are different
if clade in tree:
# if the clade of tree_reference is also in tree
sum += abs((i+1) - tree.index(clade))
else:
# clade is not in tree_reference, need to find MRCA
for x in tree:
if clade.issubset(x):
mrca = x
break
mrca_age = tree.index(mrca) + 1
sum += abs((i+1) - mrca_age)
return sum
def ca_error(T_reference, T):
# CAE
# Calculates the heights of all clades and then compares heights in tree1 to heights in tree2 as reference
# Not really heights but rather the distance to the root!
tree_ref = Phylo.read(T_reference, 'nexus')
tree = Phylo.read(T, 'nexus')
# Compare the clade heights
rec_nameing(tree_ref.root, tree_ref)
rec_nameing(tree.root, tree)
t_ref_list = tree_ref.root.name.split(",{")
t_ref_dict = {}
for i in t_ref_list:
k, v = i.split(":")
# Delete all non digits characters from k and sort the integers in the cluster
k = sorted(re.sub("[{}]", "", k).split(','))
t_ref_dict[json.dumps(k)] = float(v)
t_list = tree.root.name.split(",{")
t_dict = {}
for i in t_list:
k, v = i.split(":")
k = sorted(re.sub("[{}]", "", k).split(','))
t_dict[json.dumps(k)] = float(v)
error = 0
for k, v in t_ref_dict.items():
if k in t_dict.keys():
error += abs(v - t_dict[k])
else:
t_clades = [set(json.loads(k)) for k in t_dict.keys()]
cur_t_ref_clade = set(json.loads(k))
mrca_in_t = t_dict[
json.dumps(sorted(list(min([i for i in t_clades if cur_t_ref_clade.issubset(i)], key=len))))]
error += abs(v - mrca_in_t)
return error
def clades_missed_error(tree, tree_reference):
# number of clades that are present in tree2 but not in tree1
return len([x for x in tree_reference if x not in tree])
def clades_called_error(tree, tree_reference):
# paper definition does not make sense
# i will implement the text i.e.
# -1 for clades that are called incorrectly, meaning present in tree1 that are not in tree2
# +1 for clades that are called correctly, meaning present in both tree1 and tree2
incorrect_clades = clades_missed_error(tree_reference, tree)
correct_clades = len([x for x in tree if x in tree_reference])
return correct_clades - incorrect_clades
def clade_credibility_score(tree, treeset):
# product of clade frequencies, should be max for the MCC tree
score = 1
for clade in tree:
clade_sum = 0
for t in treeset:
if clade in t:
clade_sum += 1
score *= (clade_sum / len(treeset))
return score
@r_install_decorator
def log_likelihood(tree_file, alignment_file):
"""
Computes the log likelihood of a given tree and alignment pair via the R package phangorn
:param tree_file: Path to tree file in nexus format
:param alignment_file: Path to alignment in fasta format
:return: Log likelihood of the tree given the alignment, float
"""
ape = importr('ape')
phangorn = importr('phangorn')
alignment = phangorn.read_phyDat(alignment_file, format="fasta")
tree = ape.read_nexus(tree_file)
likelihood = phangorn.pml(tree, alignment, rate=0.005)
return likelihood.rx2("logLik")[0]
@r_install_decorator
def tree_distances(tree1, tree2):
"""
Computes different distances between tree1 and tree2, returns a dict with keys name of distances and value distances
:param tree1: file for tree1 NEXUS format
:param tree2: file for tree2 NEXUS format
:return: dictionary with distances
"""
out = {}
ape = importr('ape')
phangorn = importr('phangorn')
t1 = ape.read_nexus(tree1)
t2 = ape.read_nexus(tree2)
distances = phangorn.treedist(t1, t2)
out['RF'] = distances[0] # symmetric difference -- RF distance
out['KF'] = distances[1] # branch score difference -- Kuhner Felsenstein
out['PD'] = distances[2] # path difference
out['wPD'] = distances[3] # quadratic path difference
wRF = phangorn.wRF_dist(t1, t2)
out['wRF'] = wRF[0]
nRF = phangorn.RF_dist(t1, t2, normalize=True)
out['nRF'] = nRF[0]
nwRF = phangorn.wRF_dist(t1, t2, normalize=True)
out['nwRF'] = nwRF[0]
return out