-
Notifications
You must be signed in to change notification settings - Fork 0
/
ExtractCoreGenes.py
83 lines (76 loc) · 2.6 KB
/
ExtractCoreGenes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
This script extracts core gene sets.
Created by Mingzhi Lin ([email protected]).
"""
import sys
from tqdm import tqdm
def read_accession_list(accession_file):
"""read a list of accessions from a file"""
accession_list = []
with open(accession_file) as reader:
for line in reader:
term = line.rstrip()
accession_list.append(term)
return accession_list
def read_alignments(alignment_file):
"""read clusters of alignment"""
clusters = {}
with open(alignment_file) as infile:
for line in infile:
terms = line.rstrip().split(',')
cluster, seqid, seqtype, seq = terms[0], terms[1], terms[2], terms[3]
if seqtype == 'nucl':
if cluster not in clusters:
clusters[cluster] = []
clusters[cluster].append((seqid, seq))
return clusters
def split_cluster(cluster, sequences):
"""Split cluster"""
profile_dict = {}
for (seqid, seq) in sequences:
profile = ""
for nuclacid in seq:
if nuclacid == '-':
profile = profile + "1"
else:
profile = profile + "0"
if profile not in profile_dict:
profile_dict[profile] = []
profile_dict[profile].append((seqid, seq))
clusters = []
index = 0
for seqs in profile_dict.values():
for (i, (seqid, seq)) in enumerate(seqs):
seq = seq.replace('-', '')
clst = "%s_%d" % (cluster, index)
seqid = clst + "|" + seqid
seqs[i] = (seqid, seq)
index = index + 1
clusters.append(seqs)
return clusters
def write(writer, clusters):
"""write clusters"""
for sequences in clusters:
for (seqid, seq) in sequences:
writer.write('>' + seqid + '\n')
writer.write(seq + '\n')
writer.write('=\n')
def main():
"""extract core and flex gene sets"""
accession_list_file = sys.argv[1]
alignment_file = sys.argv[2]
output_file = sys.argv[3]
num_genomes = len(read_accession_list(accession_list_file))
alignment_clusters = read_alignments(alignment_file)
print("Extract core genes...")
writer = open(output_file, 'w')
count = 0
for cluster, sequences in tqdm(alignment_clusters.items()):
if len(sequences) == num_genomes:
clusters = split_cluster(cluster, sequences)
write(writer, clusters)
count = count + 1
writer.close()
print("Total %d core gene alignments extracted, which were saved to %s" % (count, output_file))
if __name__ == "__main__":
main()