-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_info.py
233 lines (168 loc) · 7.52 KB
/
get_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 9 11:46:12 2016
@author: ewj
"""
from Bio import SeqIO
import myUniprotIO
from UniprotUtils import get_feature_frame
from bioservices.uniprot import UniProt
u = UniProt(verbose = True)
import pandas as pd
from tqdm import tqdm
import os, itertools
def evidence(feature,letter,out_file):
#read query file
x = myUniprotIO.UniprotIterator(open('query.xml','r'), return_raw_comments=True)
#parse for wanted data (i.e. gene name, sequence id, position, etc.)
L = []
for rec,seqrec in tqdm(enumerate(x)):
gene_name = seqrec.annotations.get('gene_name_primary','_none_')
t= (
get_feature_frame(seqrec,stype=feature,filter_val=letter)
.assign(rec=rec, id=seqrec.id, gene_name=gene_name)
.rename(columns={'start':'position'})
)
L.append(t)
msk = pd.concat(L)
#filter for selected types of experimental evidence
df = msk.loc[msk['evidence_type'].str.contains('269|305|244|213', na=False)]
df.to_csv(out_file)
return df
def build_gene_query(gene_names):
#compile string for uniprot search
L = ['gene_exact:%s' %x for x in gene_names]
s = ' OR '.join(L)
q = '(%s) taxonomy:mammalia fragment:no' %s
return q
def chunks(l,n):
for i in range(0, len(l), n):
yield l[i:i+n]
def get_file_by_gene_names(gene_names,output_prefix,frmt=None,limit=20,verbose=True):
#split list of gene names into chuncks, then search by chuncks
for i,names in enumerate(chunks(gene_names,limit)):
#specify output file format
if frmt is not None:
assert frmt in ['tab','xml','fasta']
ofile = '{0}_{1}_.{2}'.format(output_prefix,i,frmt)
else:
ofile = None
query = build_gene_query(names)
if verbose:
print(ofile)
print(query)
#output search results in selected format
if ofile is not None:
with open(ofile,'w') as f:
f.write(u.search(query,frmt=frmt))
def indiv_files(gene_names,df,gene_path,fasta_path,fasta_file):
#create output folders for gene tables and fasta files
if not os.path.exists(gene_path):
os.makedirs(gene_path)
if not os.path.exists(fasta_path):
os.makedirs(fasta_path)
#iterate through list of gene name results in dataframe
for name in gene_names:
options = df['Gene names'].str.upper()
good = []
#keep entry if first word in gene name result is same as searched gene name
for n in options:
l = n.split()
if l[0] == name:
good.append(n)
#g = df.loc[df['Gene names'].str.upper() == name] #exact name match
g = df.loc[df['Gene names'].isin(good)]
#output gene table of wanted results
g.to_csv(gene_path + '\%s.csv' %name)
#parse large all_fasta file and pick out ones selected in gene table
input_iterator = SeqIO.parse(open(fasta_file,'r'), 'fasta')
Entries = g.Entry.values
filter_iterator = (x for x in input_iterator if x.id.split('|')[1] in Entries)
with open(fasta_path + '\%s.fasta' %name,'w') as f:
SeqIO.write(filter_iterator, f, 'fasta')
def rm_fasta_dup(in_file,out_file):
ids = []
desc = []
sequences = []
#compile single large fasta file of unique fasta sequences
for seq in SeqIO.parse(in_file,'fasta'):
#only add sequence to large file if it isn't already there
if seq.id not in ids:
ids.append(seq.id)
desc.append(seq.description)
sequences.append(str(seq.seq))
#output single fasta file with all sequences
with open(out_file,'w') as f:
for x in range(len(ids)):
f.write('>' + desc[x] + '\n' + sequences[x] + '\n')
def switch_letters(letters,pos,num):
for x in range(num):
letters[pos+x] = letters[pos+x].lower()
def lower(unique_ids,pos_list,seq_list,width=60):
old_seq = []
new_seq = []
for acc,pos in itertools.zip_longest(unique_ids,pos_list):
for seq in seq_list:
if acc in seq:
#isolate wanted sequence from fasta file
full_seq = '>' + seq
ind = full_seq.find('\n')
header, body = full_seq[:ind], full_seq[ind+1:]
old_seq.append(full_seq)
letters = list(body)
#switch exp sites to lowercase (depending on location in file)
for p in pos:
if p <= width - 3:
switch_letters(letters,p,3)
elif (width-3) < p < width:
switch_letters(letters,p,4)
elif p >= width:
loc = p + p//width
if (p % width) > (width-3):
switch_letters(letters,loc,4)
else:
switch_letters(letters,loc,3)
#construct new string with exp data lowercase
string = header + '\n' + ''.join(letters)
new_seq.append(string)
return old_seq, new_seq
def mark_exp(unique_genes,evidence_file,out_path,fasta_path):
if not os.path.exists(out_path):
os.makedirs(out_path)
df = pd.read_csv(evidence_file)
for gene in unique_genes:
#isolate entries that have experimental data associated with it
wanted = df[df['gene_name'].str.upper() == gene]
unique_ids = wanted.id.unique()
l = []
#compile list of positions that are experimentally proven
for x in unique_ids:
pos_list = wanted[wanted['id'] == x]
positions = pos_list['position'].tolist()
index = [int(pos) for pos in positions]
l.append(index)
#retrieve necessary fasta file from fasta path
ref_file = fasta_path + '\%s.fasta' %gene
with open(ref_file, 'r') as f:
s = f.read()
z = s.split('>')
#assign original sequence and new, marked lowercase sequence
old_seq, new_seq = lower(unique_ids,l,z,width=60)
#replace old sequene with new, then write out new file
for old,new in itertools.zip_longest(old_seq,new_seq):
s = s.replace(old,'temp').replace(new,old).replace('temp',new)
with open(out_path + '\lower_%s.fasta' %gene,'w') as n:
n.write(s)
def sort_bad(path):
bad_files = os.path.join(path,'Bad')
if not os.path.exists(bad_files):
os.makedirs(bad_files)
#remove files with less than 5 sequences in them, move to Bad folder
files = [file for file in os.listdir(path) if 'Bad' not in file]
for file in files:
p = os.path.join(path,file)
with open(p,'r') as f:
s = f.read()
count = s.count('>')
if count < 5:
os.rename(os.path.join(path,file),os.path.join(bad_files,file))