-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
289 lines (255 loc) · 13.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import re, sys, os, logging, numpy as np, pandas as pd
def getParams(paramFile):
parameters = dict()
with open(paramFile, 'r') as file:
for line in file:
if re.search(r'^#', line) or re.search(r'^\s', line):
continue
line = re.sub(r'#.*', '', line) # Remove comments (start from '#')
line = re.sub(r'\s*', '', line) # Remove all whitespaces
# Exception for "feature_files" parameter
if "feature_files" in parameters and line.endswith(".feature") and "=" not in line:
parameters["feature_files"].append(line)
elif "library" in parameters and line.endswith(".db") and "=" not in line:
parameters["library"].append(line)
else:
key = line.split('=')[0]
val = line.split('=')[1]
if key == "feature_files" or key == "library":
parameters[key] = [val]
else:
parameters[key] = val
return parameters
def readFeatures(featureFile):
with open(featureFile, 'r') as file:
features = file.readlines()
return features
class progressBar:
def __init__(self, total):
self.total = total
self.barLength = 20
self.count = 0
self.progress = 0
self.block = 0
self.status = ""
def increment(self, nIncrement=None):
if nIncrement == None:
self.count += 1
else:
self.count = nIncrement
self.progress = self.count / self.total
self.block = int(round(self.barLength * self.progress))
if self.progress == 1:
self.status = "Done...\r\n"
else:
self.status = ""
# self.status = str(self.count) + "/" + str(self.total)
text = "\r Progress: [{0}] {1}% {2}".format("#" * self.block + "-" * (self.barLength - self.block),
int(self.progress * 100), self.status)
sys.stdout.write(text)
sys.stdout.flush()
def summarizeFeatures(full, params):
# Input arguments
# 1. full: numpy recarray of fully-aligned features
# 2. params: dictionary of parameters
####################################
# Summarize fully-aligned features #
####################################
mzCols = [col for col in full.dtype.names if col.lower().endswith("_mz")]
rtCols = [col for col in full.dtype.names if col.lower().endswith("_rt")]
intensityCols = [col for col in full.dtype.names if col.lower().endswith("_intensity")]
chargeCols = [col for col in full.dtype.names if col.lower().endswith("_z")]
minRtCols = [col for col in full.dtype.names if col.lower().endswith("_minrt")]
maxRtCols = [col for col in full.dtype.names if col.lower().endswith("_maxrt")]
snCols = [col for col in full.dtype.names if col.lower().endswith("_snratio")]
df = pd.DataFrame.from_records(full)
res = pd.DataFrame()
res["feature_m/z"] = df[mzCols].mean(axis=1)
res["feature_RT"] = df[rtCols].mean(axis=1) / 60
res["feature_intensity"] = df[intensityCols].mean(axis=1)
res["feature_SNratio"] = df[snCols].mean(axis=1)
res["feature_width"] = pd.DataFrame((df[maxRtCols].values - df[minRtCols].values) / 60).mean(axis=1)
# Handling charges
res["feature_z"] = df[chargeCols].mode(axis=1)[0]
res["feature_z"][res["feature_z"] == 0] = 1
res["feature_z"] = res["feature_z"].astype(int)
if params["mode"] == "-1":
res["feature_ion"] = "[M-" + res["feature_z"].astype(str) + "H]" + res["feature_z"].astype(str) + "-"
res["feature_ion"] = res["feature_ion"].replace("[M-1H]1-", "[M-H]-")
elif params["mode"] == "1":
res["feature_ion"] = "[M+" + res["feature_z"].astype(str) + "H]" + res["feature_z"].astype(str) + "+"
res["feature_ion"] = res["feature_ion"].replace("[M+1H]1+", "[M+H]+")
for intensityCol in intensityCols:
res[intensityCol] = df[intensityCol]
colNames = ["feature_ion", "feature_z", "feature_m/z", "feature_RT", "feature_width",
"feature_SNratio"] + intensityCols
res = res[colNames]
return res
'''
def generateSummarizedFeatureFile(nFeatures, full, ms2, params):
filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
#############################################################
# Summarize fully-aligned features and write them to a file #
#############################################################
# This file contains "summarized" information of fully-aligned features
# e.g. mean m/z, mean intensity, mean RT of fully-aligned features and so on
# width and SNratio are from the reference run
mzCols = [col for col in full.dtype.names if col.lower().endswith("_mz")]
rtCols = [col for col in full.dtype.names if col.lower().endswith("_rt")]
intensityCols = [col for col in full.dtype.names if col.lower().endswith("_intensity")]
chargeCols = [col for col in full.dtype.names if col.lower().endswith("_z")]
minRtCols = [col for col in full.dtype.names if col.lower().endswith("_minrt")]
maxRtCols = [col for col in full.dtype.names if col.lower().endswith("_maxrt")]
snCols = [col for col in full.dtype.names if col.lower().endswith("_snratio")]
df = pd.DataFrame.from_records(full)
res = pd.DataFrame()
res["feature_m/z"] = df[mzCols].mean(axis=1)
res["feature_RT"] = df[rtCols].mean(axis=1)
res["feature_intensity"] = df[intensityCols].mean(axis=1)
res["feature_SNratio"] = df[snCols].mean(axis=1)
res["feature_width"] = pd.DataFrame((df[maxRtCols].values - df[minRtCols].values) / 60).mean(axis=1)
# Handling charges
res["feature_z"] = df[chargeCols].mode(axis=1)[0]
res["feature_z"][res["feature_z"] == 0] = 1
res["feature_z"] = res["feature_z"].astype(int)
if params["mode"] == "-1":
res["feature_ion"] = "[M-" + res["feature_z"].astype(str) + "H]" + res["feature_z"].astype(str) + "-"
res["feature_ion"] = res["feature_ion"].replace("[M-1H]1-", "[M-H]-")
elif params["mode"] == "1":
res["feature_ion"] = "[M+" + res["feature_z"].astype(str) + "H]" + res["feature_z"].astype(str) + "+"
res["feature_ion"] = res["feature_ion"].replace("[M+1H]1+", "[M+H]+")
for intensityCol in intensityCols:
res[intensityCol] = df[intensityCol]
# Add the mean m/z of feature and its charge state to the beginning of MS2 spectrum (similar to .dta file)
for i in range(nFeatures):
if ms2[i] is not None:
ms2[i]["mz"] = np.insert(ms2[i]["mz"], 0, res["feature_m/z"].iloc[i])
ms2[i]["intensity"] = np.insert(ms2[i]["intensity"], 0, res["feature_z"].iloc[i])
res["MS2"] = ms2
# Write the summarized fully-aligned features to a file
res = res.sort_values(by="feature_m/z", ignore_index=True)
res[
"feature_num"] = res.index + 1 # Update "feature_num" according to the ascending order of "feature_m/z" (as sorted)
outColumns = ["feature_num", "feature_ion", "feature_z", "feature_m/z", "feature_RT",
"feature_width", "feature_SNratio"] + intensityCols
resOut = res[outColumns].copy()
resOut["feature_RT"] = resOut["feature_RT"] / 60 # Change the unit to minute
fullName = os.path.join(filePath, params["output_name"] + "_summarized_fully_aligned.feature")
resOut.to_csv(fullName, index=False, sep="\t")
# Write MS2 spectra to files
ms2Path = os.path.join(filePath, "MS2")
if not os.path.exists(ms2Path):
os.mkdir(ms2Path)
for i in range(res.shape[0]):
if res["MS2"].iloc[i] is not None:
fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
dfMS2 = pd.DataFrame.from_dict(res["MS2"].iloc[i])
dfMS2.to_csv(fileName, index=False, header=False, sep="\t")
# Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purpose
# When the pipeline gets mature, this part needs to be removed
pickle.dump(res, open(os.path.join(filePath, ".fully_aligned_feature.pickle"), "wb")) # Make the file be hidden
return res
'''
def processQuantityData(df, params):
print(" Loading-bias summary")
print(" ====================")
logging.info(" Loading-bias summary")
logging.info(" ====================")
intensityCols = [col for col in df.columns if col.lower().endswith("_intensity")]
expr = df[intensityCols]
# Calculation and print-out loading bias information
sampleNames = expr.columns
rowMeans = expr.mean(axis=1)
lexpr = np.log2(expr.div(rowMeans, axis=0))
nFeatures, nSamples = lexpr.shape
idx = pd.Series([True] * nFeatures)
for i in intensityCols:
idx = idx & (lexpr[i] > lexpr[i].quantile(q=0.1)) & (lexpr[i] < lexpr[i].quantile(q=0.9))
meanIntensity = 2 ** (lexpr.loc[idx, :].mean(axis=0)) * 100
sdVal = lexpr.loc[idx, :].std(axis=0)
sdIntensity = ((2 ** sdVal - 1) + (1 - 2 ** (-sdVal))) / 2 * 100
semIntensity = sdIntensity / np.sqrt(len(idx))
print(" Sample_name\tMean[%]\tSD[%]\tSEM[%]\t#features")
logging.info(" Sample_name\tMean[%]\tSD[%]\tSEM[%]\t#features")
for i in range(expr.shape[1]):
print(" {}\t{:.2f}\t{:.2f}\t{:.2f}\t{}".format(intensityCols[i].replace("_intensity", ""), meanIntensity[i], sdIntensity[i], semIntensity[i], len(idx)))
logging.info(" {}\t{:.2f}\t{:.2f}\t{:.2f}\t{}".format(intensityCols[i].replace("_intensity", ""), meanIntensity[i], sdIntensity[i], semIntensity[i], len(idx)))
# Normalization using trimmed-mean values (loading-bias correction)
lexpr = np.log2(expr)
if params["skip_loading_bias_correction"] == 0:
# Parameters for normalization
cutoff = np.nanquantile(lexpr.to_numpy(), q=0.1) # 10% quantile of overall intensities
# This is the original implementation (as the same as Perl-pipeline), but it may be too stringent
idx = pd.Series([True] * nFeatures)
for i in intensityCols:
idx = idx & (lexpr[lexpr > cutoff][i] > lexpr[lexpr > cutoff][i].quantile(q=0.1)) & (
lexpr[lexpr > cutoff][i] > lexpr[lexpr > cutoff][i].quantile(q=0.9))
meanIntensity = lexpr.loc[idx, :].mean(axis=0)
meanIntensity = lexpr[lexpr > cutoff].mean(axis=0)
normFactor = meanIntensity - np.mean(meanIntensity)
lexpr = lexpr - normFactor
# Replace missing values (i.e. nan) with the 1/2 of the global minimum intensity
minIntensity = np.nanmin(lexpr)
lexpr = lexpr.fillna(minIntensity - 1) # Half of the global minimum intensity (at log2-scale)
lexpr += lexpr.isna() * pd.DataFrame(1e-2 * np.random.uniform(0, 1, size=(lexpr.shape)),
columns=lexpr.columns) # Add small random number for numerical stability
df[intensityCols] = 2 ** lexpr
return df
def generateFeatureFile(full, partial, unaligned, params):
# Input arguments
# full: fully-aligned features (numpy recarray)
# partial: partially-aligned features (numpy recarray, or None)
# unaligned: un-aligned features (list of numpy recarray, or None)
# params: dictionary of parameters
filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
if not os.path.exists(filePath):
os.mkdir(filePath)
# Organize fully-aligned features
fullName = os.path.join(filePath, params["output_name"] + "_fully_aligned.feature")
dfFull = pd.DataFrame(full)
dfFull["meanMz"] = dfFull.filter(regex=(".*mz$")).mean(axis=1)
dfFull = dfFull.sort_values(by="meanMz", ignore_index=True) # Features are sorted by mean m/z
colNames = dfFull.columns.tolist()
colNames = colNames[-1:] + colNames[:-1]
dfFull = dfFull[colNames]
# Organize "summarized" fully-aligned features
fullName2 = os.path.join(filePath, params["output_name"] + "_summarized_fully_aligned.feature")
dfFull2 = summarizeFeatures(full, params)
dfFull2 = dfFull2.sort_values(by="feature_m/z", ignore_index=True) # Features are sorted by "feature_m/z"
dfFull2.insert(loc=0, column="feature_num", value=dfFull2.index + 1)
# Processing of quantity data (missing value imputation, normalization, etc.)
dfFull2 = processQuantityData(dfFull2, params)
intensityCols = [col for col in dfFull2.columns if col.lower().endswith("_intensity")]
for col in intensityCols:
dfFull[col] = dfFull2[col] # Replace intensity values of dfFull with those of dfFull2 (preprocessed)
# This file contains fully-aligned features with run-specific information
# Since the run-specific information is required for MS2 processing, it should be kept
dfFull.to_csv(fullName, index=False, sep="\t")
# This file contains "summarized" fully-aligned features
# Except intensity, all feature information is summarized over runs
dfFull2.to_csv(fullName2, index=False, sep="\t")
##############################################
# Write partially-aligned features, if exist #
##############################################
if partial is not None:
dfPartial = pd.DataFrame(partial)
if dfPartial.shape[0] > 0:
partialName = os.path.join(filePath, params["output_name"] + "_partially_aligned.feature")
dfPartial.to_csv(partialName, index=False, sep="\t")
else:
dfPartial = None
#######################################
# Write un-aligned features, if exist #
#######################################
dfArrayUnaligned = None
if unaligned is not None:
dfArrayUnaligned = []
for un in unaligned:
unName = [col for col in un.dtype.names if col.endswith('_mz')][0]
unName = os.path.join(filePath, re.sub("_mz", "", unName) + "_unaligned.feature")
dfUn = pd.DataFrame(un)
dfUn.to_csv(unName, index=False, sep="\t")
dfArrayUnaligned.append(dfUn)
else:
dfArrayUnaligned = None
return dfFull, dfPartial, dfArrayUnaligned