-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnwk_parser.py
292 lines (248 loc) · 11.4 KB
/
nwk_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
__author__ = 'Lars Berling'
from Bio import Phylo
import io
import re as re
from apply_new_mapping import get_mapping_dict
# Function to transform a given nwk-tree string to a ranked-tree in cluster notation
dellen = re.compile('\:[0-9]+\.[0-9]+(e-[0-9]+)?')
def sort_tree(tree):
# Returns unique string representation of tree by sorting taxa and converting sets to trees.
output = []
for i in tree:
j = sorted(list(i))
output.append(set(j))
return output
def add_leaves(tree):
# Add sets containing just one leaf to make ultrametric trees become non-ultrametric RNNI trees
# needed to use RNNI moves as they are implemented for RNNI and not for uRNNI
tree_with_leaves = []
n = len(tree) + 1
for j in range(1,n+1):
# tree_with_leaves.append(set([str(j)]))
tree_with_leaves.append(set([j]))
for k in tree:
tree_with_leaves.append(set(k))
return tree_with_leaves
def expand(tree):
# Takes a tree in list of lists format, adds leave clusters and returns it in list of sets format
if (len(tree[0]) == 1):
return (sort_tree(tree))
tree = add_leaves(tree)
tree = sort_tree(tree)
return tree
def name_clade(clade, tree):
# Function that names the clade in a given tree, part of a recursion
"""
clade is a given node in the tree
clade.caldes[0] and clade.clades[1] are the two children of the clade
tree.distance() returns the distance of two given nodes, if only on is given it returns distance to the root
Description of the if statements
l.27 & l.58 : which child of the clade is further away from the root --> will get the smaller rank
l.30 & l.63 : case that both children are leaves
l.35 & l.67 : the child node clade[0] is a leaf
l.42 & l.75 : the child node clade[1] is a leaf
l.50 & l.83 : both child nodes are subtrees with an already set clade notation
"""
# if (clade.clades[0].branch_length > clade.clades[1].branch_length):
if tree.distance(clade.clades[0]) > tree.distance(clade.clades[1]):
# if len(clade.clades[0].name) == 1 and len(clade.clades[1].name) == 1:
if not '{' in clade.clades[0].name and not '{' in clade.clades[1].name:
# clade.branch_length += clade.clades[0].branch_length
clade.name = '{' + clade.clades[0].name + ',' + clade.clades[1].name + '}:' + str(tree.distance(clade))
elif not '{' in clade.clades[0].name:
big1 = clade.clades[1].name.split(sep=',{')
big1 = big1[len(big1) - 1].replace('}', '').replace('{', '')
big1 = re.sub(dellen, '', big1)
# clade.branch_length += clade.clades[0].branch_length
clade.name = clade.clades[1].name + ',{' + clade.clades[0].name + ',' + big1 + '}:' + str(
tree.distance(clade))
elif not '{' in clade.clades[1].name:
big0 = clade.clades[0].name.split(sep=',{')
big0 = big0[len(big0) - 1].replace('}', '').replace('{', '')
big0 = re.sub(dellen, '', big0)
# clade.branch_length += clade.clades[0].branch_length
clade.name = clade.clades[0].name + ',{' + big0 + ',' + clade.clades[1].name + '}:' + str(
tree.distance(clade))
else:
big0 = clade.clades[0].name.split(sep=',{')
big0 = big0[len(big0) - 1].replace('}', '').replace('{', '')
big0 = re.sub(dellen, '', big0)
big1 = clade.clades[1].name.split(sep=',{')
big1 = big1[len(big1) - 1].replace('}', '').replace('{', '')
big1 = re.sub(dellen, '', big1)
# clade.branch_length += clade.clades[0].branch_length
clade.name = clade.clades[0].name + ',' + clade.clades[1].name + ',{' + big0 + ',' + big1 + '}:' + str(
tree.distance(clade))
# print(big0, '--2--', big1)
else:
if not '{' in clade.clades[0].name and not '{' in clade.clades[1].name:
# clade.branch_length += clade.clades[1].branch_length
clade.name = '{' + clade.clades[0].name + ',' + clade.clades[1].name + '}:' + str(tree.distance(clade))
elif not '{' in clade.clades[0].name:
# redundant ?
big1 = clade.clades[1].name.split(sep=',{')
big1 = big1[len(big1) - 1].replace('}', '').replace('{', '')
big1 = re.sub(dellen, '', big1)
# clade.branch_length += clade.clades[1].branch_length
clade.name = clade.clades[1].name + ',{' + clade.clades[0].name + ',' + big1 + '}:' + str(
tree.distance(clade))
elif not '{' in clade.clades[1].name:
# redundant ?
big0 = clade.clades[0].name.split(sep=',{')
big0 = big0[len(big0) - 1].replace('}', '').replace('{', '')
big0 = re.sub(dellen, '', big0)
# clade.branch_length += clade.clades[1].branch_length
clade.name = clade.clades[0].name + ',{' + big0 + ',' + clade.clades[1].name + '}:' + str(
tree.distance(clade))
else:
big0 = clade.clades[0].name.split(sep=',{')
big0 = big0[len(big0) - 1].replace('}', '').replace('{', '').replace(':.*$', '')
big0 = re.sub(dellen, '', big0)
big1 = clade.clades[1].name.split(sep=',{')
big1 = big1[len(big1) - 1].replace('}', '').replace('{', '')
big1 = re.sub(dellen, '', big1)
# clade.branch_length += clade.clades[1].branch_length
clade.name = clade.clades[1].name + ',' + clade.clades[0].name + ',{' + big0 + ',' + big1 + '}:' + str(
tree.distance(clade))
# print(big0,'--1--', big1)
def rec_nameing(clade, tree):
# Function that recursively names the inner nodes of a tree with the cluster notation, starting top down from the given clade
if not clade.clades[0].name:
if not clade.clades[1].name:
rec_nameing(clade.clades[0], tree)
rec_nameing(clade.clades[1], tree)
name_clade(clade, tree)
else:
rec_nameing(clade.clades[0], tree)
name_clade(clade, tree)
else:
if not clade.clades[1].name:
rec_nameing(clade.clades[1], tree)
name_clade(clade, tree)
else:
# both clades have a name
name_clade(clade, tree)
def to_cluster(raw_cluster):
# Function that converts the output from the rec_naming function and sorts the clusters accordingly
out = {}
next = raw_cluster.split(sep=',{')
for i in next:
key, value = i.split(sep=':')
if not key[0] == '{':
key = '{' + key
out[key] = float(value)
sort = sorted(out.items(), key=lambda x: x[1], reverse=True)
clust = ''
for key in sort:
clust += key[0] + ','
# Need to remove the last ,
clust = clust[:-1]
clust = '[' + clust + ']'
return (clust)
def nwk_to_cluster(nwk):
# Function that converts a tree nwk string to its cluster representation
tree_str = io.StringIO(nwk)
tree = Phylo.read(tree_str, 'newick')
tree.root.branch_length = float(0)
rec_nameing(tree.root, tree)
clus = to_cluster(tree.root.name)
return (clus)
# Function only for external input, like BEAST output
def nwk_parser(in_file, out_file, re_tree=re.compile('.')):
# Function that converts in_file of nwk trees to out_file cluster trees
# re_tree is an optional tree identification string, i.e. tree lines from BEAST output start with 'tree ...'
# re_tree Default takes each line as a tree!
# ntaxa = re.compile('\tDimensions ntax=[0-9]+\;$')
trees = 0
out_f = open(out_file, 'w+')
# out_f.write('taxa\ntrees\n') # First two lines for later usage of number of trees and taxa
with open(in_file) as in_f:
for cnt, line in enumerate(in_f):
if re_tree.match(line):
out_f.write(nwk_to_cluster(line) + '\n')
trees += 1
# elif ntaxa.match(line):
# n = re.compile('[0-9]+').search(line).group()
# else:
# print(cnt)
# lines = out_f.readline()
# lines[0] = n
# lines[1] = str(trees)
out_f.close()
class Node:
def __init__(self, clade, rank):
self.left = None
self.right = None
self.rank = rank
self.clade = clade
class Leaf:
def __init__(self, taxa):
self.taxa = taxa
def cluster_to_tree(tree):
working_treelist = []
rank = 1
for cluster in tree:
if len(cluster) == 2:
# Internal Node that will be formed from two leafs
new_cherry = Node(cluster.copy(), rank)
new_cherry.left = Leaf(cluster.pop())
new_cherry.right = Leaf(cluster.pop())
working_treelist.append(new_cherry)
else:
# Internal Node that will be formed from two subtrees, one may be a leaf
to_merge = []
for index, node in enumerate(working_treelist):
if not cluster.isdisjoint(node.clade):
to_merge.append(index)
if len(cluster) - len(node.clade) == 1:
break
if len(to_merge) == 1:
# Add leaf
new_tree = Node(cluster.copy(), rank)
new_tree.left = working_treelist.pop(to_merge[0])
new_tree.right = Leaf(cluster.difference(new_tree.left.clade).pop())
working_treelist.append(new_tree)
elif len(to_merge) == 2:
# Merge two trees
new_tree = Node(cluster.copy(), rank)
new_tree.left = working_treelist.pop(to_merge[0])
new_tree.right = working_treelist.pop(to_merge[1] - 1)
working_treelist.append(new_tree)
else:
# ERROR
print('ERROR!')
rank += 1
return working_treelist[0]
# Ways to check if either of the children are leafs
# isinstance(node.left, set)
# isinstance(node.right, set)
def print_tree_from_root(root):
"""Recursive Function to get the nwk representation of a node based tree"""
if isinstance(root.left, Leaf) and isinstance(root.right, Leaf):
# Merge two leafs
return f'({root.left.taxa}:{root.rank},{root.right.taxa}:{root.rank})'
elif isinstance(root.left, Leaf) and not isinstance(root.right, Leaf):
# Merge left leaf and right subtree
return f'({root.left.taxa}:{root.rank},{print_tree_from_root(root.right)}:{root.rank - root.right.rank})'
elif not isinstance(root.left, Leaf) and isinstance(root.right, Leaf):
# Merge left subtree and right leaf
return f'({print_tree_from_root(root.left)}:{root.rank - root.left.rank},{root.right.taxa}:{root.rank})'
else:
# Merge two subtrees
return f'({print_tree_from_root(root.left)}:{root.rank - root.left.rank},{print_tree_from_root(root.right)}:{root.rank - root.right.rank})'
def write_tree_to_nwk(tree, out_file, map_file):
mapping = get_mapping_dict(map_file)
cur_t = cluster_to_tree(tree)
with open(out_file, 'w+') as f:
# Nexus Header
f.write("#NEXUS\n\nBEGIN TREES;\n\tTRANSLATE\n")
for i in sorted(mapping.keys()):
f.write(f"\t\t{i} {mapping[i]}")
if i is not len(mapping.keys()):
f.write(',')
f.write('\n')
f.write(";\n")
f.write('TREE tree1 = ')
f.write(print_tree_from_root(cur_t))
f.write(';\nEND;\n')
return 0