-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathms_clustering.py
52 lines (39 loc) · 1.29 KB
/
ms_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
'''
Hierarchical clustering of MS spectra for CnHm systems (spectra taken from the
NIST Chemistry WebBook); saves one dendrogram per compound type
'''
#%% Imports
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist#, squareform
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
#%% Functions
def signals_to_spectrum(ms: str, max_mz: int) -> np.ndarray:
    '''
    Converts a peak-list string into a dense, normalized intensity vector.

    Parameters
    ----------
    ms : str
        Whitespace-separated peaks, each written as "mz,intensity" with
        integer fields, e.g. "12,50 13,120". If the same m/z occurs more
        than once, the last occurrence wins.
    max_mz : int
        Largest m/z bin; the result has max_mz + 1 elements and the array
        index equals the m/z value.

    Returns
    -------
    np.ndarray
        Float array scaled so that its elements sum to 10000. When the total
        intensity is zero (empty string or only zero-intensity peaks) the
        all-zero array is returned instead of an array of NaNs.

    Raises
    ------
    ValueError
        If a peak is not of the form "mz,intensity" with integer fields.
    IndexError
        If an m/z value does not fit into the array of max_mz + 1 bins.
    '''
    spec = np.zeros(max_mz + 1)
    for peak in ms.split():
        mass, intens = peak.split(',')
        # NOTE: a negative m/z would index from the end of the array
        spec[int(mass)] = int(intens)
    total = spec.sum()
    # nothing to normalize: dividing by zero would fill the spectrum with NaNs
    if total == 0:
        return spec
    # normalize to one integral value
    spec *= 10000 / total
    return spec
#%% Plot dendrograms
# read the table of spectra; the columns used below are type, MW, MS and name
df = pd.read_csv('csvs/CnHm_filtered.csv')
# clustering: one dendrogram per compound type
# (groupby yields the types in sorted order and skips rows without a type)
for subtype, sub in df.groupby('type'):
    # hierarchical clustering needs at least two spectra to compare
    if len(sub) < 2:
        print(f'Skipping {subtype}: fewer than two spectra')
        continue
    # round the largest molecular weight up to the next multiple of 100 so
    # that all spectra of the group share one common m/z axis
    max_mz = int(100 * (sub['MW'].max() // 100 + 1))
    MSs = np.array([signals_to_spectrum(ms, max_mz) for ms in sub['MS']])
    # condensed matrix of pairwise distances between the spectra
    X = pdist(MSs)
    # make dendrogram; the figure height grows with the number of leaves
    fig = plt.figure(figsize=(6, 24 * len(sub) / 271), dpi=300)
    linked = linkage(X, 'single')
    dendrogram(linked, orientation='right', labels=list(sub['name']),
               distance_sort='descending', show_leaf_counts=True)
    # leave room on the left for the compound names
    plt.subplots_adjust(left=0.3)
    plt.savefig(f'dendrograms/{subtype}.png')
    # release the figure, otherwise one 300-dpi figure per type stays in memory
    plt.close(fig)