Skip to content

Commit

Permalink
Merge pull request #35 from RECETOX/wverastegui/issue34
Browse files Browse the repository at this point in the history
Scripts for paper's plots
  • Loading branch information
hechth authored Jul 15, 2024
2 parents 46ce75a + e564efe commit af8c47a
Show file tree
Hide file tree
Showing 17 changed files with 1,417 additions and 248 deletions.
33 changes: 33 additions & 0 deletions analysis/Python_scripts/Fig2_scatterplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
from matchms.logging_functions import set_matchms_logger_level

from utils import append_classes, load_spectra_metadata, normalize_df
from plotting import scatterplot_matplotlib

# Script: build the Fig. 2 scatter plot from the matchms matching results.

# Set the matchms logger level to 'ERROR'.
set_matchms_logger_level('ERROR')

# Matching results table; the long matchms column names are shortened to
# 'scores' and 'matches' in a single rename.
matchms_scores = pd.read_csv(
    "../data/output_matching/matchms/matchms_tol_0.0035_1%I_all_peaks_with_0s_only_matching.tsv",
    sep="\t",
).rename(columns={
    'CosineHungarian_0.0035_0.0_1.0_scores': 'scores',
    'CosineHungarian_0.0035_0.0_1.0_matches': 'matches',
})

# load_spectra_metadata returns three values; only the middle one is used here.
_, spectra_metadata, _ = load_spectra_metadata(
    "../data/filtered/simulated_matchms_filter_1%I_all_peaks.msp", 'query')
_, reference_spectra_metadata, _ = load_spectra_metadata(
    "../data/experimental/RECETOX_GC-EI_MS_20201028.msp", 'reference')

# Attach query metadata first, then reference metadata. After each merge the
# freshly added 'num_peaks' column is renamed so the two do not collide.
merged = (
    matchms_scores
    .merge(spectra_metadata, on="query", how="inner")
    .rename(columns={'num_peaks': 'n_peaks_query'})
    .merge(reference_spectra_metadata, on="reference", how="inner")
    .rename(columns={'num_peaks': 'n_peaks_reference'})
)

# Coerce the count columns to numbers; unparsable entries become NaN.
numeric_columns = ['matches', 'n_peaks_query', 'n_peaks_reference']
merged[numeric_columns] = merged[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Matched peaks as a fraction of the peaks in the query / reference spectrum.
merged['FractionQuery'] = merged['matches'].div(merged['n_peaks_query'])
merged['FractionReference'] = merged['matches'].div(merged['n_peaks_reference'])

merged = append_classes(merged, "query")

# Draw the scatter plot and write it to disk.
figure = scatterplot_matplotlib(normalize_df(merged, matches_norm_col=None))
figure.savefig("paper_plots/Fig2_scatterplot.png", bbox_inches='tight')
# plot name in the manuscript:
# "20240517_scatterplot.png"
48 changes: 48 additions & 0 deletions analysis/Python_scripts/Fig3_correlations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
from matplotlib import pyplot as plt

from utils import *
from plotting import *

# Script: correlation heatmap (Fig. 3) between matching results and
# molecular properties.

matchms_scores = load_matchms_scores()

df = normalize_df(matchms_scores, matches_norm_col=None)
del df['peak_comments']

matches_col = 'matches'
scores_col = 'scores'

# Number of matches relative to the peak count of the query / reference spectrum.
for side in ('query', 'reference'):
    df[f'matches_norm_{side}'] = df[matches_col] / df[f'n_peaks_{side}']

# Columns entering the correlation matrix, in the order shown on the heatmap axes.
properties = [
    # matching results
    'scores',
    'matches',
    'matches_norm_query',
    'matches_norm_reference',
    # molecular properties
    'molecular_flexibility',
    'rotatable_bonds',
    'stereo_centers',
    'molecular_complexity',
    'n_atoms',
    'precursor_mz',
    'electronegative_atoms',
    'aromatic_nitrogens',
    'amines',
    'amides',
]

# Pairwise correlations (pandas default method), rounded for annotation.
corr = df[properties].corr().round(2)

plt.figure(figsize=(24, 20))
heatmap_ax = sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    annot_kws={"size": 20},
)
plt.tick_params(axis='both', which='major', labelsize=20)
# The colorbar belongs to the first collection of the heatmap axes; enlarge
# its tick labels to match the rest of the figure.
heatmap_ax.collections[0].colorbar.ax.tick_params(labelsize=20)

plt.savefig("paper_plots/Fig3_correlations.png", bbox_inches='tight')
# plot name in the manuscript:
# "correlations/20240517_heatmap_properties_correlations.png"
54 changes: 54 additions & 0 deletions analysis/Python_scripts/Fig4_superclass_histograms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pandas as pd
import os
import numpy as np
import math
from matplotlib import pyplot as plt
from rdkit import Chem
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from utils import *
from plotting import *

# Script: superclass boxplots and score/match 2D histograms (Fig. 4a and 4b).


def _save_hist2d(x, y, bins, xlabel, path, **hist_kwargs):
    """Draw a 2D histogram of *y* versus *x* on the cleared current figure and save it.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis (labelled 'scores').
        bins: bin specification forwarded to ``plt.hist2d``.
        xlabel: label of the horizontal axis.
        path: output file of the figure.
        **hist_kwargs: extra keyword arguments forwarded to ``plt.hist2d``.
    """
    plt.clf()
    plt.set_cmap('viridis')
    plt.hist2d(x, y, bins=bins, **hist_kwargs)
    plt.colorbar()
    # Fixed colour limits so both histograms share the same colour scale.
    plt.clim(0, 70)
    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel('scores', fontsize=20)
    plt.tick_params(labelsize=13)
    plt.gcf().set_size_inches(8, 6)
    plt.savefig(path, bbox_inches='tight')


# --- Fig. 4a: all peaks -------------------------------------------------------
matchms_scores = load_matchms_scores()

matchms_scores_superclass = preprocess_data(normalize_df(matchms_scores.copy()), ["superclass"])
# Keep only superclasses with more than two rows.
larger_superclasses = matchms_scores_superclass.groupby("superclass").filter(lambda rows: len(rows) > 2)
boxplot_all = create_plot(larger_superclasses, "superclass", normalized_matches=True)
boxplot_all.savefig("paper_plots/Fig4a_superclasses_boxplot.png", bbox_inches='tight')
# plot name in the manuscript: "superclasses/20240207_boxplot_superclasses.png"

matches_normalized = matchms_scores['matches'] / matchms_scores['n_peaks_reference']
_save_hist2d(
    matches_normalized * 100,
    matchms_scores['scores'] * 1000,
    bins=(5, 5),
    xlabel='ions matching reference (%)',
    path="paper_plots/Fig4a_superclasses_histogram.png",
    range=[[0, 100], [0, 1000]],
)


# --- Fig. 4b: top 5 peaks -----------------------------------------------------
matchms_scores_top5 = pd.read_csv(
    "../data/output_matching/matchms/matchms_tol_0.0035_1%I_top5_with_0s_only_matching.tsv",
    sep="\t",
).rename(columns={
    'CosineHungarian_0.0035_0.0_1.0_scores': 'scores',
    'CosineHungarian_0.0035_0.0_1.0_matches': 'matches',
})
matchms_scores_top5 = append_spectrum_metadata(append_classes(matchms_scores_top5, 'query'))

matchms_scores_superclass_top5 = preprocess_data(
    normalize_df(matchms_scores_top5.copy(), matches_norm_col=None),
    ["superclass"],
)
larger_superclasses_top5 = matchms_scores_superclass_top5.groupby("superclass").filter(lambda rows: len(rows) > 2)
boxplot_top5 = create_plot(larger_superclasses_top5, "superclass", normalized_matches=False)
boxplot_top5.savefig("paper_plots/Fig4b_superclasses_boxplot.png", bbox_inches='tight')
# plot name in the manuscript: "superclasses/20240223_boxplot_superclasses_top5.png"

_save_hist2d(
    matchms_scores_top5['matches'],
    matchms_scores_top5['scores'] * 1000,
    bins=([0,1,2,3,4,5], 5),
    xlabel='ion matches',
    path="paper_plots/Fig4b_superclasses_histogram.png",
)
34 changes: 34 additions & 0 deletions analysis/Python_scripts/Fig5_classes_boxplots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import pandas as pd
import os
import numpy as np
import math
from matplotlib import pyplot as plt
from rdkit import Chem
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from utils import *
from plotting import *


# Script: one boxplot per superclass, broken down by chemical class (Fig. 5).

matchms_scores = load_matchms_scores()
merged = normalize_df(matchms_scores.copy())

scores_preprocessed_hierarchy = preprocess_data(merged, ["superclass", "class", "subclass"])
grouped_superclass = scores_preprocessed_hierarchy.groupby("superclass")
grouping = "class"

for group in grouped_superclass.groups:
    superclass_rows = grouped_superclass.get_group(group)
    # Drop classes with two or fewer rows.
    grp = superclass_rows.groupby(grouping).filter(lambda rows: len(rows) > 2)
    if len(grp) == 0:
        # Nothing left to plot for this superclass.
        continue
    fig = create_plot(grp, grouping, showlegend=False, hide_labels=True)
    fig.savefig(f"paper_plots/Fig5_{group}.png", bbox_inches='tight')
# plot name in the manuscript in that order:
# "classes/20240207_boxplot_benzenoids.png"
# "classes/20240207_boxplot_lipids.png"
# "classes/20240207_boxplot_organic_acids.png"
# "classes/20240207_boxplot_organooxygen.png"
# "classes/20240207_boxplot_organohalogen.png"
# "classes/20240207_boxplot_organoheterocyclic.png"
# "classes/20240207_boxplot_phenylpropanoids.png"

23 changes: 23 additions & 0 deletions analysis/Python_scripts/Fig6_benzene_subclasses_boxplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import numpy as np
import math
from matplotlib import pyplot as plt
from rdkit import Chem
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from utils import *
from plotting import *


# Script: boxplot of the subclasses of "Benzene and substituted derivatives"
# (Fig. 6).

matchms_scores = load_matchms_scores()
merged = normalize_df(matchms_scores.copy())

scores_preprocessed_hierarchy = preprocess_data(merged, ["superclass", "class", "subclass"])

# Only this one class is plotted, so select it directly instead of grouping by
# class and filtering every group just to discard all but one.
target_class = "Benzene and substituted derivatives"
grouping = "subclass"

class_rows = scores_preprocessed_hierarchy.loc[
    scores_preprocessed_hierarchy["class"] == target_class
]

# If the class is absent, no figure is written.
if len(class_rows) > 0:
    # Keep only subclasses with more than six rows.
    grp = class_rows.groupby(grouping).filter(lambda rows: len(rows) > 6)
    if len(grp) > 0:
        fig = create_plot(grp, grouping, showlegend=False, hide_labels=True)
        fig.savefig("paper_plots/Fig6_benzene_subclasses.png", bbox_inches='tight')
32 changes: 32 additions & 0 deletions analysis/Python_scripts/Fig7_nitrogen_comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from utils import *
from plotting import boxplot_comparison

# Script: compare scores and matches between element compositions with and
# without nitrogen (Fig. 7).

matchms_scores = load_matchms_scores()
merged_all_peaks_same = normalize_df(matchms_scores)
mdf_comp = preprocess_data(merged_all_peaks_same, ["composition"])

# Baseline compositions (no nitrogen). `nitrogen_cols` below lists the same
# compositions with N added, position by position.
baseline_cols = ['C,H', 'C,H,O', 'C,H,O,S', 'C,Cl,H,O', 'Br,C,H,O', 'C,Cl,H', 'C,Cl,H,O,S', 'C,Cl,F,H,O', 'C,H,O,P', 'C,H,O,P,S']
# Non-inplace sort_index: an inplace sort on the result of a boolean `.loc`
# selection operates on a derived frame and can raise SettingWithCopyWarning.
mdf_comp_baseline = mdf_comp.loc[mdf_comp['composition'].isin(baseline_cols)].sort_index(axis=1)

nitrogen_cols = ['C,H,N', 'C,H,N,O','C,H,N,O,S', 'C,Cl,H,N,O', 'Br,C,H,N,O', 'C,Cl,H,N', 'C,Cl,H,N,O,S', 'C,Cl,F,H,N,O','C,H,N,O,P', 'C,H,N,O,P,S']
mdf_comp_nitrogen = mdf_comp.loc[mdf_comp['composition'].isin(nitrogen_cols)].sort_index(axis=1)

# Scores comparison.
boxplot_comparison(
    mdf_comp_baseline,
    baseline_cols,
    mdf_comp_nitrogen,
    nitrogen_cols,
    'scores',
    colors=['crimson', 'deepskyblue']
).savefig("paper_plots/Fig7_scores.png", bbox_inches='tight')

# Matches comparison.
boxplot_comparison(
    mdf_comp_baseline,
    baseline_cols,
    mdf_comp_nitrogen,
    nitrogen_cols,
    'matches',
    colors=["darkgoldenrod", "yellow"],
).savefig("paper_plots/Fig7_matches.png", bbox_inches='tight')
11 changes: 11 additions & 0 deletions analysis/Python_scripts/Fig8_p_and_s.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from utils import *
from plotting import create_plot

# Script: boxplot for element compositions containing sulfur or phosphorus
# (Fig. 8).

matchms_scores = load_matchms_scores()
merged_all_peaks_same = normalize_df(matchms_scores)
mdf_comp = preprocess_data(merged_all_peaks_same, ["composition"])

# Match S or P only as a whole element token of the comma-separated
# composition string. A plain 'S|P' substring search also hits other symbols
# starting with those letters (e.g. 'Si'), which previously required a
# hard-coded exclusion of 'C,F,H,N,Si'; with token matching that composition is
# rejected automatically, as is any similar one.
mdf_comp_ps = mdf_comp[mdf_comp['composition'].str.contains(r'\b[SP]\b')]
# Keep only compositions with more than two rows.
mdf_comp_ps = mdf_comp_ps.groupby('composition').filter(lambda rows: len(rows) > 2)
create_plot(mdf_comp_ps, "composition").savefig("paper_plots/Fig8_P_and_S.png", bbox_inches='tight')
Loading

0 comments on commit af8c47a

Please sign in to comment.