-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathvsearchcluster.py
299 lines (267 loc) · 13.7 KB
/
vsearchcluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# -*- coding: utf-8 -*-
import itertools
import logging
from Bio.SeqUtils.lcc import lcc_simp
from Bio.SeqUtils import GC
def makelog(stri, do_print=True):
if do_print:
print(stri)
logging.debug(stri)
def loadcluster(cluster_file):
cluster_file, cluster_dic = open(cluster_file), {}
# parse through the cluster file and store the cluster name + sequences in the dictionary
cluster_groups = (x[1] for x in itertools.groupby(cluster_file, key=lambda line: line[0] == '>'))
for cluster in cluster_groups:
name = cluster.next().strip()
seqs = [seq.split('>')[1].split('...')[0] for seq in cluster_groups.next()]
cluster_dic[name[1:]] = seqs
return cluster_dic
def filtercluster(cluster_dic, minimum, candidates):
filtered_dic = {}
for cluster in set(cluster_dic.keys()):
#let's get first only valid groups to avoid unnecesary processing
if len(cluster_dic[cluster]) >= minimum:
#group into overlapped (overlapped groups count as one individual)
#ie should have more than minimum elements non overlapped
cluster_positions = {}
for k in cluster_dic[cluster]:
if isinstance(candidates[k]['start'], int) and isinstance(candidates[k]['end'], int):
if not candidates[k]['record'] in cluster_positions:
cluster_positions[candidates[k]['record']] = []
cluster_positions[candidates[k]['record']].append( (candidates[k]['start'], candidates[k]['end']) )
#cluster_candidates = [v for k,v in candidates.items() if k in cluster_dic[cluster]]
#for candidate in cluster_candidates:
# cluster_positions.append( (candidate['start'], candidate['end']) )
total_len = 0
for k,v in cluster_positions.items():
overlapped_cluster = merge_overlap(v)
total_len += len(overlapped_cluster)
if total_len >= minimum:
filtered_dic[cluster] = cluster_dic[cluster]
return filtered_dic
def merge_overlap(intervals):
sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
merged = []
for higher in sorted_by_lower_bound:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
# test for intersection between lower and higher:
# we know via sorting that lower[0] <= higher[0]
if higher[0] <= lower[1]:
upper_bound = max(lower[1], higher[1])
merged[-1] = (lower[0], upper_bound) # replace by merged interval
else:
merged.append(higher)
return merged
def cluster2seq(cluster_dic, candidates, outfile):
# open the filter_output file
filter_file = open(outfile, 'w')
# loop through the sequence list
last_cluster = None
#family_number = 1
for cluster_id, cluster_seqs in cluster_dic.items():
for seq_id in cluster_seqs:
cand_id, description, sequence = candidates[seq_id]['id'], candidates[seq_id]['description'],candidates[seq_id]['seq']
#header = ">%s %s FAMILY:%s" % (cand_id, description, str(family_number))
header = ">%s %s" % (cand_id, description)
if not cluster_id == last_cluster:
filter_file.write('{0}\n'.format('-'*10))
last_cluster = cluster_id
filter_file.write('{0}\n{1}\n'.format(header, '\n'.join([sequence[i:i+60] for i in range(0, len(sequence), 60)])))
#family_number += 1
# close the filtered results file
filter_file.close()
def complex_enough(seq):
complexity = lcc_simp(seq.upper())
if complexity < 1.25:
return False
gc = GC(seq.upper())
if gc < 15 or gc > 95:
return False
return True
def cluster(file_names, candidates, min_copy_number, FSL, workers):
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import pairwise2
from subprocess import Popen, PIPE
from collections import OrderedDict
import os, shutil
import math
makelog("Clustering")
cmd_list = [
'./vsearch-2.7.1/bin/vsearch',
'--cluster_fast',file_names['file_candidates_fasta'],
#'--consout',file_names['file_representative'],
'--threads',str(workers),
'--strand','both',
'--clusters',file_names['file_temp_cluster'],
'--iddef','1',
'--id', '0.8']
makelog(' '.join(cmd_list))
p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
out,err = p.communicate()
#for stdout_line in iter(popen.stdout.readline, ""):
# yield stdout_line
#popen.stdout.close()
#return_code = popen.wait()
##if return_code:
# raise subprocess.CalledProcessError(return_code, cmd)
#for c in iter(lambda: p.stdout.read(), ''):
#makelog(c)
makelog("Clustering done")
makelog("Filtering clusters")
#count for minimum file length
clusters_dic = {}
list_dir = os.listdir(file_names['file_temp_cluster_dir'])
makelog("Initial clusters: %i" % (len(list_dir),))
for fn in list_dir:
if os.path.isfile(file_names['file_temp_cluster_dir'] + fn):
fh = open(file_names['file_temp_cluster_dir'] + fn)
n = 0
for line in fh:
if line.startswith(">"):
n += 1
id_seq = line[1:line.find(" ")]
if fn in clusters_dic:
clusters_dic[fn].append(id_seq)
else:
clusters_dic[fn] = [id_seq]
fh.close()
#shutil.rmtree(file_names['file_temp_cluster_dir'])
# os.unlink(file_names['file_temp_cluster_dir'] + fn)
# if n < args.min_copy_number:
# df.loc[df['candidate_id'] == 'id_seq', 'status'] = 'low_cn'
# os.remove(fn)
# continue
#clusters_dic = loadcluster(cluster_candidates_file + ".clstr")
filtered_clusters = filtercluster(clusters_dic, min_copy_number, candidates)
unique_clusters = set(filtered_clusters.keys())
num_clusters = len(unique_clusters)
#loop through clusters
for current_cluster in unique_clusters:
#search candidates for that cluster
#all possible 2-combinations of candidates
candidates_in_cluster = filtered_clusters[current_cluster]
#porc_of_clusters = int(math.ceil(len(candidates_in_cluster) * 0.4))
#new_min_copy_number = max(min_copy_number,porc_of_clusters)
new_min_copy_number = min_copy_number
sum_diff_fs_cluster = 0
for x in candidates_in_cluster:
totally_different_fs = True
cand_x = candidates[x]
fs_right_1 = cand_x['fs_right']
fs_left_1 = cand_x['fs_left']
if fs_left_1 == '' or fs_right_1 == '' or not isinstance(fs_left_1,str) or not isinstance(fs_right_1,str):
totally_different_fs = False
continue
if not complex_enough(fs_right_1) or not complex_enough(fs_left_1):
totally_different_fs = False
continue
fs_right_1 = fs_right_1.upper()
fs_left_1 = fs_left_1.upper()
fs_right_1_plus_mite = cand_x['seq'][-FSL:].upper() + fs_right_1
fs_left_1_plus_mite = fs_left_1 + cand_x['seq'][0:FSL].upper()
at_least_one = False
for y in candidates_in_cluster:
cand_y = candidates[y]
if cand_x['candidate_id'] == cand_y['candidate_id']:
continue
# R1 x R2
# L1 x L2
# L1RC x R2
# R1RC x L2
#some MITE could be at the end or begining of the sequence and this not having flanking seqs
fs_right_2 = cand_y['fs_right']
fs_left_2 = cand_y['fs_left']
if fs_right_2 == '' or fs_left_2 == '':
continue
#empty strings in some versions of pandas are returned as nan, so we make sure the flanking seqs are strings
if fs_left_2 == '' or fs_right_2 == '' or not isinstance(fs_right_2,str) or not isinstance(fs_left_2,str):
continue
if not complex_enough(fs_right_2) or not complex_enough(fs_left_2):
continue
fs_right_2 = fs_right_2.upper()
fs_left_2 = fs_left_2.upper()
fs_right_2_plus_mite = cand_y['seq'][-FSL:].upper() + fs_right_2
fs_left_2_plus_mite = fs_left_2 + cand_y['seq'][0:FSL].upper()
fs_left_1_rc = Seq(fs_left_1).reverse_complement()
fs_right_1_rc = Seq(fs_right_1).reverse_complement()
fs_left_1_plus_mite_rc = Seq(fs_left_1_plus_mite).reverse_complement()
fs_right_1_plus_mite_rc = Seq(fs_right_1_plus_mite).reverse_complement()
#calculate scores
score_r1_r2 = pairwise2.align.localms(fs_right_1, fs_right_2, 1, -1, -1, -1,score_only=True)
score_l1_l2 = pairwise2.align.localms(fs_left_1, fs_left_2, 1, -1, -1, -1,score_only=True)
score_l1rc_r2 = pairwise2.align.localms(fs_left_1_rc, fs_right_2, 1, -1, -1, -1,score_only=True)
score_r1rc_l2 = pairwise2.align.localms(fs_right_1_rc, fs_left_2, 1, -1, -1, -1,score_only=True)
#since a MITE might be longer, we also look a few nt inside
score_r1_r2_plus_mite = pairwise2.align.localms(fs_right_1, fs_right_2_plus_mite, 1, -1, -1, -1,score_only=True)
score_l1_l2_plus_mite = pairwise2.align.localms(fs_left_1, fs_left_2_plus_mite, 1, -1, -1, -1,score_only=True)
score_l1rc_r2_plus_mite = pairwise2.align.localms(fs_left_1_rc, fs_right_2_plus_mite, 1, -1, -1, -1,score_only=True)
score_r1rc_l2_plus_mite = pairwise2.align.localms(fs_right_1_rc, fs_left_2_plus_mite, 1, -1, -1, -1,score_only=True)
#TODO remove
#since a MITEs might be longer, we also look for the FS inside
#score_r1_m2 = pairwise2.align.localms(fs_right_1, seq_2, 1, -1, -1, -1,score_only=True)
#score_l1_m2 = pairwise2.align.localms(fs_left_1, seq_2, 1, -1, -1, -1,score_only=True)
#score_r1rc_m2 = pairwise2.align.localms(fs_right_1_rc, seq_2, 1, -1, -1, -1,score_only=True)
#score_l1rc_m2 = pairwise2.align.localms(fs_left_1_rc, seq_2, 1, -1, -1, -1,score_only=True)
#max_score = max(score_r1_r2,score_l1_l2,score_l1rc_r2,score_r1rc_l2,score_r1_m2,score_r1rc_m2,score_l1rc_m2)
#get max score
max_score = max(score_r1_r2,score_l1_l2,score_l1rc_r2,score_r1rc_l2,score_r1_r2_plus_mite,score_l1_l2_plus_mite,score_l1rc_r2_plus_mite,score_r1rc_l2_plus_mite)
if max_score == []:
max_score = 0
max_score /= FSL
at_least_one = True
if max_score > 0.5:
totally_different_fs = False
break
if totally_different_fs and at_least_one:
sum_diff_fs_cluster += 1
if sum_diff_fs_cluster >= new_min_copy_number:
break
if sum_diff_fs_cluster < new_min_copy_number:
#makelog(' '.join(filtered_clusters[current_cluster]) + " filtered by flanking sequence")
del filtered_clusters[current_cluster]
#else:
# makelog(' '.join(filtered_clusters[current_cluster]) + " not filtered by flanking sequence")
#again to remove < MIN_COPY_NUMBER elements
#filtered_clusters = filtercluster(filtered_clusters, args.min_copy_number, positions, df, 'low_copy_number_2')
ordered_cluster = OrderedDict(sorted(filtered_clusters.items(), key=lambda t: t[0]))
makelog("Clusters: " + str(len(filtered_clusters)))
buffer_rec = []
#import ipdb; ipdb.set_trace()
#for candidate in candidates.values():
count = 1
family_number = 1#ordered_cluster.keys().index(clus)
buffer_nr = []
output_gff = open(file_names['file_gff'],"w")
output_gff.write("##gff-version 3\n")
for clus, seqs in ordered_cluster.items():
one_per_family = False
tsd_family = []
for seq in seqs:
candidate = candidates[seq]
candidate['id'] = "MITE_T_%s|%s|%s|%s|%s|%s|F%s" % (str(count),candidate['record'],candidate['start'],candidate['end'],candidate['tsd'],candidate['tir_len'],family_number)
candidate['description'] = "%s CANDIDATE_ID:%s" % (candidate['description'], candidate['candidate_id'].split('|')[0])
record = SeqRecord(Seq(candidate['seq']), id=candidate['id'], description=candidate['description'])
buffer_rec.append(record)
write_row = '\t'.join([candidate['record'], 'MITE_Tracker','MITE',str(candidate['start']), str(candidate['end']),'.','+','.','ID='+candidate['id'] ])
output_gff.write(write_row + '\n')
tsd_family.append(candidate['tsd'])
if not one_per_family:
one_per_family = True
record_family = record
buffer_nr.append(record_family)
count += 1
from statistics import mode,StatisticsError
try:
tsd_consensus = mode(tsd_family)
record_family.description = '%s COMMON_TSD:%s' % (record_family.description, tsd_consensus)
except StatisticsError:
pass
family_number += 1
SeqIO.write(buffer_nr, file_names['file_representative'] , "fasta")
SeqIO.write(buffer_rec, file_names['all_file'] , "fasta")
cluster2seq(ordered_cluster, candidates, file_names['families_file'])