forked from fjruizruano/ngs-protocols
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cd_hit_count_clusters.py
executable file
·74 lines (58 loc) · 1.6 KB
/
cd_hit_count_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python
"""Select CD-HIT clusters whose size falls within a given range.

Usage:
    cd_hit_count_clusters.py CdhitClstrFile MinClusterSize MaxClusterSize

Any argument missing from the command line (or, for the two sizes, not
parseable as an integer) is asked for interactively.

Two files are written next to the input file:

* ``<input>.sel.<min>.<max>`` -- one representative sequence name per line,
  for every cluster whose size is within ``[min, max]``.
* ``<input>.clu.<min>.<max>`` -- for those same clusters, the cluster header
  followed by its de-duplicated member descriptions, written only when the
  number of de-duplicated descriptions is itself within ``[min, max]``.
"""
import sys

# The script predates Python 3; pick whichever line-reading builtin exists so
# it runs unchanged on both major versions.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

USAGE = "cd_hit_count_clusters.py CdhitClstrFile MinClusterSize MaxClusterSize"


def _int_argument(argv, position, prompt):
    """Return ``argv[position]`` as an int, prompting when absent or invalid."""
    try:
        return int(argv[position])
    except (IndexError, ValueError):
        return int(_read_line(prompt))


def _parse_clusters(lines):
    """Group ``.clstr`` lines into ``(header_line, member_lines)`` pairs.

    A line starting with ``>Cluster`` opens a new cluster; every following
    non-blank line belongs to it. Lines before the first header are ignored.
    """
    clusters = []
    for line in lines:
        if line.startswith(">Cluster"):
            clusters.append((line, []))
        elif clusters and line.strip():
            clusters[-1][1].append(line)
    return clusters


def _cluster_size(members):
    """Return the cluster size: the index on the last member line plus one."""
    if not members:
        return 0
    return int(members[-1].split()[0]) + 1


def _representative_name(members):
    """Return the name on the member line flagged with ``*``, or None.

    The third whitespace-separated field is expected to look like
    ``>name...``; the leading ``>`` and trailing ``...`` are stripped.
    """
    for line in members:
        # rstrip() so a final line without a trailing newline still matches.
        if line.rstrip().endswith("*"):
            return line.split()[2][1:-3]
    return None


def _unique_members(members):
    """Return de-duplicated member descriptions, in file order.

    The description is the second tab-separated field of a member line.
    """
    seen = set()
    unique = []
    # NOTE(review): the first member line is skipped before de-duplication,
    # exactly as the original script did -- confirm this is intended.
    for line in members[1:]:
        description = line.split("\t")[1]
        if description not in seen:
            seen.add(description)
            unique.append(description)
    return unique


def main(argv=None):
    """Run the cluster selection.

    argv -- argument list laid out like ``sys.argv`` (program name first);
            defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    print(USAGE)

    try:
        clstr_path = argv[1]
    except IndexError:
        clstr_path = _read_line("Introduce CD-HIT's *.Clstr File: ")
    min_size = _int_argument(argv, 2, "Introduce Minimum Cluster Size:")
    max_size = _int_argument(argv, 3, "Introduce Maximum Cluster Size:")

    with open(clstr_path) as handle:
        clusters = _parse_clusters(handle)

    sel_path = clstr_path + ".sel.%s.%s" % (min_size, max_size)
    clu_path = clstr_path + ".clu.%s.%s" % (min_size, max_size)

    with open(sel_path, "w") as sel_out, open(clu_path, "w") as clu_out:
        for header, members in clusters:
            if not min_size <= _cluster_size(members) <= max_size:
                continue

            name = _representative_name(members)
            if name is None:
                sys.stderr.write(
                    "Warning: no representative sequence in %s\n"
                    % header.strip()
                )
            else:
                sel_out.write("%s\n" % name)

            unique = _unique_members(members)
            if min_size <= len(unique) <= max_size:
                clu_out.write(header + "".join(unique))


if __name__ == "__main__":
    main()