-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgsites_catalog_modern.py
287 lines (224 loc) · 12.9 KB
/
gsites_catalog_modern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import SeqRecord
import pandas as pd
import numpy as np
import ms_module as ms
import re
#
# # for these pair: there are peptides present in spec, but missing from pep isn't it odd?
# # suggesting, that different cutoffs were used for Scaffold program to output pep and spec ...
# pep_fname = "../raw_data/New_files_to_analyze/original_input_before_dec25/peptides.xls"
# spec_fname = "../raw_data/New_files_to_analyze/original_input_before_dec25/specs.xls"
#
# pept from spec are all in pep and all Peptide sequences are in
pep_fname = "../raw_data/New_files_to_analyze/011616 glycocapture 90-80/peptides.xls"
spec_fname = "../raw_data/New_files_to_analyze/011616 glycocapture 90-80/specs.xls"
#
pep_info = pd.read_csv(pep_fname,sep='\t')
spec_info = pd.read_csv(spec_fname,sep='\t')
#
# fasta = SeqIO.to_dict(SeqIO.parse(fasta_fname,"fasta"),key_function=lambda _: _.id.split('|')[1])
# 1-BASED NOTATION FOR PROTEINS INDEXING ENFORCED ...
# pep_df = pd.read_csv(uniq_pept_fname)
# connection between peptide info and spectrum info to be established ...
################################
#
# looking for Deamidated sites only ...
deamid = re.compile('[D,d]eamidat')
spec_info['pept'] = spec_info['Peptide sequence'].str.upper()
#
parse_pname = lambda pn: pd.Series(ms.parse_prot_name(pn))
spec_info = spec_info.merge(spec_info['Protein name'].apply(parse_pname), left_index=True, right_index=True)
pep_info = pep_info.merge(pep_info['Protein name'].apply(parse_pname), left_index=True, right_index=True)
#
def extract_deamids(mod_str):
mod_list = [ mod.strip() for mod in mod_str.split(',') ]
return pd.Series( ms.parse_spectrum_modifications(mod) for mod in mod_list if bool(deamid.search(mod)) )
# unroll/exapnd spec table to account for multiple deamid-sites/gsites per peptide ...
def unroll_by_mfunc(df,col_name,mfunc,unrolled_colname='unrolled'):
# get the column with arguments of the 'mfunc' ...
thecolumn = df[col_name]
# it appears as multi-column thing right after mfunc application ...
multicol_mfunc_result = thecolumn.apply(mfunc)
# stacked - multiindex is in use, level=1 index is the column name from 'multicol_mfunc_result' ...
unrolled_mindex = multicol_mfunc_result.stack()
# index is no longer uniq after the following operation ... we dropped inner indexing part.
# IMPORTANT: indexes correspond to those from the original df (use it for merging later) ...
unrolled_origindex = pd.DataFrame(unrolled_mindex.reset_index(level=1,drop=True),columns=[unrolled_colname,])
# merge unrolled_origindex (a single column with ambiguous index) with the original df ...
# 'unrolled_origindex' must be DataFrame to merge: Series are not mergeable for some reason ...
# the whole df is to be unrolled after the following operation.
unrolled_df = df.merge(unrolled_origindex,left_index=True,right_index=True)
#
return unrolled_df.reset_index(drop=True)
##########################################################################
# unroll that spec table to have 1 deamid per row ...
spec_info_unrolled = unroll_by_mfunc(spec_info,'Variable modifications identified by spectrum',extract_deamids,'deamid_info')
# SOME VALIDATIONS ...
spec_peps_in_peps = spec_info_unrolled['pept'].isin(pep_info['Peptide sequence'])
if spec_peps_in_peps.all():
print "All peptides from spectrum file are present in the peptide summary file!"
print "Sound very logicall."
else:
print "There are some peptide from spectrum file that are not present in the peptide summary file:"
print spec_info_unrolled[~spec_peps_in_peps]
print """That is a strange situation,
suggesting that different parameters were used in Scaffold
to generate peptide summary and spectrum file."""
print """We proceed dismissing thise fact,
and using all the gsite/peptides pairs present in spectrum,
thus assuming self-sufficiency of the spectrum file
and its prevalence over peptide summary.
In other words, there is nothing in the peptide summary file,
that cannot be deduced from the spectrum file.(? seem to be true, but is it general?)"""
##################################################################################################
pept_sum_prots = spec_info_unrolled['Protein name'].unique()
spec_prots = pep_info['Protein name'].unique()
if pept_sum_prots.size == spec_prots.size:
print "Pept.summary and spectrum files are refferring to the same number of different protein names."
print "It is a good sign!"
else:
print "Pept.summary file is refferring to %d unique protein names."%pept_sum_prots.size
print "Spectrum file is refferring to %d unique protein names."%spec_prots.size
print "This is unexpected discrepancy: proceed using data stored in the spectrum file."
############################################
# columns that needs to be delivered ... #
############################################
# A gsites, 1 per line
# B pept, 1 per line
# B1 enzyme, G or T, derive from 'Biological sample category', like this: {'TrypsinSample1':'T','GluC_Sample2':'G'}
# C peptide_start, 1 per line accordingly
# D all_uids, REPLACE WITH col:H
# E prot_seq, try to get those from NCBI, not from UniProt ...
# F protein, ??? sequence, name or what???
# G uid_max, UID for major form instead or something like that ...
# H prot_name, parsed out human-readable name from 'Protein name'
# H1 gene_name, parsed out GN=xxx from 'Protein name'
# I uniq_peptide_count, discrad that column ...
# J pept_probability, output number not the string - this would be the criteria
# K gsites_predicted, OK
# L gsites_predicted_number, OK
# M gsite_start, beware of 0 or 1 type of indexing ...
# N,O,P - gsites AAs in separate columns
# M1, NOP combined, gsite sequence basically!
# Q signal, from GeneBank record on the protein, simply Y,N on whether there is a 'Signal' in gb.
# R signal_location, location of the signal from Q
# S tm_span, Y,N just for the fact of having TM span as a protein feature.
# enumerating all tractable peptides from pep_df ...
glyco_mod = []
for uniq_pept, pept_pos, prot_sr in pep_df[['pept','peptide_start','prot_seqrec']].itertuples(index=False):
prot_seq = str(prot_sr)
pept_spectrum = spec_info[ spec_info['pept']==uniq_pept ]
# let's check all present modifications in the flatten-out list of lists ...
modifs = [ mod for cmod in pept_spectrum['Variable modifications identified by spectrum'] for mod in cmod.strip().split(',') ]
# and get those that are unique ...
modifs = np.unique(modifs)
# now extract type,position and value for each of them ...
modifs = [ ms.parse_spectrum_modifications(mod) for mod in modifs if bool(deamid.search(mod)) ]
# now extrating meaningfull glycosilation sites ...
glyco_sites = []
glyco_start = []
# looks like the inner loop here is the only place where we do need 0-based indexing switching ...
for type_aa,gpos_pept,value in modifs:
if (type_aa in ['n','N']) and (np.abs(value-3)<0.1):
# 'pept_pos' - is 1-based absolute poisition of the peptide in the protein ...
# 'gpos_pept' - is 1-based relative position of gsite_start_N in the peptide ...
gsite_start = pept_pos + gpos_pept-1 # 1-based coordinate ...
gsite_stop = pept_pos + gpos_pept-1 + 3-1 # 1-based coordinate ...
glyco_start.append(gsite_start)
# Due to slicing rules, we need [start-1:stop], no have position 'stop' included ...
glyco_sites.append(prot_seq[gsite_start-1:gsite_stop])
############################################################
# gstart must be 1-based for output ...
glyco_mod_str = ';'.join([ gsite+("(%d)"%gstart) for gsite,gstart in zip(glyco_sites,glyco_start)])
glyco_mod.append(glyco_mod_str)
############################
#
#
############################
pep_df['gsites'] = glyco_mod
############################
print
print " We need to unroll glyco-sites and expand the DataFrame ..."
# we need to unroll those rows whith more than 1 items of gsites ...
pep_df_cols = ['all_uids', 'pept', 'peptide_start', 'prot_seqrec', 'protlen', 'uid_max', 'gsites','aa_before','aa_after',"prot_name", "uniq_pept_count", "pept_probab"]
pep_df_gsite_extracted = []
for all_uids,pept,pept_start,prot_sr,protlen,uid_max,gsites,aa_bef,aa_aft,prot_name,upept_count,pept_probab in pep_df[pep_df_cols].itertuples(index=False):
# unrolling ...
for gsite in gsites.split(';'):
pep_df_gsite_extracted.append((all_uids,pept,pept_start,prot_sr,protlen,uid_max,gsite,aa_bef,aa_aft,prot_name,upept_count,pept_probab))
########################
pep_df_ext = pd.DataFrame(pep_df_gsite_extracted,columns = pep_df_cols)
print "mission accomplished ..."
# so far we didn't use gsite index for adressing, so it was 1-based safely ...
print
print "Now we'd need to collapse those identical glyco sites that orginiate from overlaing peptides ..."
gsite_uid_combined = pep_df_ext['gsites'] +':'+ pep_df_ext['all_uids']
gsites_uniq = gsite_uid_combined.unique()
print " There are %d unique (glyco-site, protein id) pairs that includes empty sites, however."%gsites_uniq.size
# now collapsing on those uniq sites ...
final_dataframe = []
for site_uniq in gsites_uniq:
# indexes of a given unque gsite ...
sites_index = (gsite_uid_combined == site_uniq)
pep_site = pep_df_ext[sites_index]
#
pepts = ';'.join( '('+pep_site['aa_before']+')'+pep_site['pept']+'('+pep_site['aa_after']+')' )
all_uids, = pep_site['all_uids'].unique()
# peptide_start must be 1-based, we won't use it as index anymore ...
pepts_start = ';'.join( str(pos_value) for pos_value in pep_site['peptide_start'] )
prot_seq = str(pep_site['prot_seqrec'].unique()[0])
prot_len, = pep_site['protlen'].unique()
uid_max, = pep_site['uid_max'].unique()
#
prot_name, = pep_site["prot_name"].unique()
upept_count = pep_site["uniq_pept_count"].unique()[0] # temporary solution, due to inconsistencies ...
pept_probab = ';'.join(str(_) for _ in pep_site["pept_probab"])
#
# make sure there is just a single gsite ...
pep_site_gsite_uniq, = pep_site['gsites'].unique()
# appending to final dataframe ...
final_dataframe.append((pep_site_gsite_uniq,pepts,pepts_start,all_uids,prot_seq,prot_len,uid_max, prot_name, upept_count, pept_probab))
# turn to dataframe ...
final_dataframe = pd.DataFrame(final_dataframe,columns=['gsites', 'pept', 'peptide_start','all_uids', 'prot_seq', 'protlen', 'uid_max','prot_name', 'uniq_pept_count', 'pept_probab'])
#
print
print "Now we'd need to add theoretical glycosilation sites as a separate column ..."
g_site = re.compile(r'(?=(N[ACDEFGHIKLMNQRSTVWY][TS]))')
def get_theor_sites_number(prot_seq):
# find all sites ...
all_sites = [(site.start(),site.groups()[0]) for site in g_site.finditer(prot_seq)]
N_sites = len(all_sites)
return N_sites
def get_theor_sites(prot_seq):
# BEWARE ZERO-BASED INDEXING TO 1-BASED INDEXING TRANSLATION ...
# find all sites ...
all_sites = [(site.start(),site.groups()[0]) for site in g_site.finditer(prot_seq)]
# N_sites = len(all_sites)
return ';'.join( (gsite_seq+'(%d)'%(pos+1)) for pos,gsite_seq in all_sites) # pos+1 - indexing transition ...
##################################################################################################################
print " 'gsites_predicted' column uses 1-based numbering. Enforced and checked."
final_dataframe['gsites_predicted'] = final_dataframe['prot_seq'].apply(get_theor_sites)
final_dataframe['gsites_predicted_number'] = final_dataframe['prot_seq'].apply(get_theor_sites_number)
# gsite_start must be 1-based ...
final_dataframe['gsite_start'] = final_dataframe['gsites'].apply( lambda x: int(x[3:].strip('()')) if x else None )
final_dataframe['gsites_AA1_N'] = final_dataframe['gsites'].apply( lambda x: x[0] if x else None )
final_dataframe['gsites_AA2_XbutP'] = final_dataframe['gsites'].apply( lambda x: x[1] if x else None )
final_dataframe['gsites_AA3_TS'] = final_dataframe['gsites'].apply( lambda x: x[2] if x else None )
# final_dataframe['gsites']NFT(1098)
# #
# # let's change within a cell separators to ';' instead of ',', which is used for column separation ...
# for col,dtype in final_dataframe.dtypes.iteritems():
# if dtype=='object':
# final_dataframe[col].str.replace(';',' ').str.replace(',',';')
final_dataframe.to_csv(out_fname,index=False)
# 1 NIS(126) (R)ALSNISLR(F)
# 2 NTT(794) (R)CVYEALCNTTSECPPPVITR(Q),(E)ALCNTTSECPPPVITR...
# 3 NDT(1048) (R)EAESLQPMTVVGTDYVFHNDTK(V),(E)SLQPMTVVGTDYVF...
# 4 NET(732) (K)LSHDANETLPLHLYVK(Y)
# 5 NMS(527) (K)SCVAVTSAQPQNMSR(A)
# 6 NVT(1001) (R)SINVTGQGFSLIQR(A)
# 7 NFT(1098) (R)TEAGAFEYVPDPTFENFTGGVK(Q),(R)TEAGAFEYVPDPTF...
# 8 NLT(1067) (K)VVFLSPAVPEEPEAYNLTVLIEMDGHR(L)