Merge pull request #35 from RECETOX/wverastegui/issue34

Scripts for paper's plots
RECETOX · Jul 15, 2024 · af8c47a · af8c47a
2 parents 46ce75a + e564efe
commit af8c47a
Show file tree

Hide file tree

Showing 17 changed files with 1,417 additions and 248 deletions.
diff --git a/analysis/Python_scripts/Fig2_scatterplot.py b/analysis/Python_scripts/Fig2_scatterplot.py
@@ -0,0 +1,52 @@
+"""Create the Fig. 2 scatter plot from the matchms all-peaks matching results.
+
+Joins the score table with query and reference spectra metadata, derives the
+fraction of peaks that matched on each side and writes
+``paper_plots/Fig2_scatterplot.png``.
+"""
+import pandas as pd
+from matchms.logging_functions import set_matchms_logger_level
+
+from utils import append_classes, load_spectra_metadata, normalize_df
+from plotting import scatterplot_matplotlib
+
+set_matchms_logger_level('ERROR')
+
+SCORES_TSV = "../data/output_matching/matchms/matchms_tol_0.0035_1%I_all_peaks_with_0s_only_matching.tsv"
+QUERY_MSP = "../data/filtered/simulated_matchms_filter_1%I_all_peaks.msp"
+REFERENCE_MSP = "../data/experimental/RECETOX_GC-EI_MS_20201028.msp"
+
+# Give the CosineHungarian result columns the short names used below.
+matchms_scores = pd.read_csv(SCORES_TSV, sep="\t").rename(columns={
+    'CosineHungarian_0.0035_0.0_1.0_scores': 'scores',
+    'CosineHungarian_0.0035_0.0_1.0_matches': 'matches',
+})
+
+# Only the middle element of each returned triple is needed here.
+_, spectra_metadata, _ = load_spectra_metadata(QUERY_MSP, 'query')
+_, reference_spectra_metadata, _ = load_spectra_metadata(REFERENCE_MSP, 'reference')
+
+# 'num_peaks' is renamed after each join so the query and reference counts
+# end up in separate columns.
+merged = (
+    matchms_scores
+    .merge(spectra_metadata, on="query", how="inner")
+    .rename(columns={'num_peaks': 'n_peaks_query'})
+    .merge(reference_spectra_metadata, on="reference", how="inner")
+    .rename(columns={'num_peaks': 'n_peaks_reference'})
+)
+
+# Values that cannot be parsed as numbers become NaN (errors='coerce').
+numeric_columns = ['matches', 'n_peaks_query', 'n_peaks_reference']
+merged[numeric_columns] = merged[numeric_columns].apply(
+    lambda column: pd.to_numeric(column, errors='coerce')
+)
+
+merged['FractionQuery'] = merged['matches'].div(merged['n_peaks_query'])
+merged['FractionReference'] = merged['matches'].div(merged['n_peaks_reference'])
+
+merged = append_classes(merged, "query")
+
+# The resulting plot appears in the manuscript as "20240517_scatterplot.png".
+scatter_figure = scatterplot_matplotlib(normalize_df(merged, matches_norm_col=None))
+scatter_figure.savefig("paper_plots/Fig2_scatterplot.png", bbox_inches='tight')
diff --git a/analysis/Python_scripts/Fig3_correlations.py b/analysis/Python_scripts/Fig3_correlations.py
@@ -0,0 +1,60 @@
+"""Draw the Fig. 3 heatmap of pairwise correlations between scores and properties.
+
+Writes ``paper_plots/Fig3_correlations.png``.
+"""
+import pandas as pd
+from matplotlib import pyplot as plt
+
+from utils import *
+from plotting import *
+
+matchms_scores = load_matchms_scores()
+
+df = normalize_df(matchms_scores, matches_norm_col=None)
+del df['peak_comments']
+
+matches_col = 'matches'
+scores_col = 'scores'
+
+# Matched-peak counts divided by the peak count of the query / reference spectrum.
+for side in ('query', 'reference'):
+    df[f'matches_norm_{side}'] = df[matches_col] / df[f'n_peaks_{side}']
+
+# Columns that enter the correlation matrix, in the order they are plotted.
+properties = [
+    'scores',
+    'matches',
+    'matches_norm_query',
+    'matches_norm_reference',
+    'molecular_flexibility',
+    'rotatable_bonds',
+    'stereo_centers',
+    'molecular_complexity',
+    'n_atoms',
+    'precursor_mz',
+    'electronegative_atoms',
+    'aromatic_nitrogens',
+    'amines',
+    'amides',
+]
+
+# DataFrame.corr() uses Pearson correlation by default; round for the cell labels.
+corr = df[properties].corr().round(2)
+
+# NOTE(review): `sns` is not imported here; presumably it comes from a star import above.
+plt.figure(figsize=(24, 20))
+heatmap_axes = sns.heatmap(
+    corr,
+    annot=True,
+    cmap='coolwarm',
+    center=0,
+    vmin=-1,
+    vmax=1,
+    annot_kws={"size": 20},
+)
+plt.tick_params(axis='both', which='major', labelsize=20)
+# Reach the heatmap's colorbar through its first collection and enlarge its tick labels.
+heatmap_axes.collections[0].colorbar.ax.tick_params(labelsize=20)
+
+# Plot name in the manuscript: "correlations/20240517_heatmap_properties_correlations.png"
+plt.savefig("paper_plots/Fig3_correlations.png", bbox_inches='tight')
diff --git a/analysis/Python_scripts/Fig4_superclass_histograms.py b/analysis/Python_scripts/Fig4_superclass_histograms.py
@@ -0,0 +1,85 @@
+"""Produce the Fig. 4 superclass box plots and the score-vs-match 2D histograms.
+
+Panel (a) is built from the table returned by ``load_matchms_scores()``,
+panel (b) from the top-5 matching results TSV.
+"""
+import pandas as pd
+import os
+import numpy as np
+import math
+from matplotlib import pyplot as plt
+from rdkit import Chem
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+
+from utils import *
+from plotting import *
+
+
+def _save_hist2d(x_values, y_values, bins, x_label, out_path, value_range=None):
+    """Draw a 2D histogram on the cleared current figure and save it to ``out_path``.
+
+    The colour limits are fixed to 0-70 and the figure is resized to 8x6 inches.
+    """
+    plt.clf()
+    plt.set_cmap('viridis')
+    plt.hist2d(x_values, y_values, bins=bins, range=value_range)
+    plt.colorbar()
+    plt.clim(0, 70)
+    plt.xlabel(x_label, fontsize=20)
+    plt.ylabel('scores', fontsize=20)
+    plt.tick_params(labelsize=13)
+    plt.gcf().set_size_inches(8, 6)
+    plt.savefig(out_path, bbox_inches='tight')
+
+
+def _has_more_than_two_rows(group):
+    """Tell whether a group holds more than two rows."""
+    return len(group) > 2
+
+
+matchms_scores = load_matchms_scores()
+
+# Panel (a): box plot over superclasses that have more than two rows.
+matchms_scores_superclass = preprocess_data(normalize_df(matchms_scores.copy()), ["superclass"])
+larger_superclasses = matchms_scores_superclass.groupby("superclass").filter(_has_more_than_two_rows)
+boxplot_a = create_plot(larger_superclasses, "superclass", normalized_matches=True)
+boxplot_a.savefig("paper_plots/Fig4a_superclasses_boxplot.png", bbox_inches='tight')
+# plot name in the manuscript: "superclasses/20240207_boxplot_superclasses.png"
+
+# Histogram axes: matches as a percentage of reference peaks vs. scores times 1000.
+matches_normalized = matchms_scores['matches'] / matchms_scores['n_peaks_reference']
+_save_hist2d(
+    matches_normalized * 100,
+    matchms_scores['scores'] * 1000,
+    bins=(5, 5),
+    x_label='ions matching reference (%)',
+    out_path="paper_plots/Fig4a_superclasses_histogram.png",
+    value_range=[[0, 100], [0, 1000]],
+)
+
+# Panel (b): same plots for the top-5 matching results, using raw match counts.
+TOP5_SCORES_TSV = "../data/output_matching/matchms/matchms_tol_0.0035_1%I_top5_with_0s_only_matching.tsv"
+matchms_scores_top5 = pd.read_csv(TOP5_SCORES_TSV, sep="\t").rename(columns={
+    'CosineHungarian_0.0035_0.0_1.0_scores': 'scores',
+    'CosineHungarian_0.0035_0.0_1.0_matches': 'matches',
+})
+matchms_scores_top5 = append_classes(matchms_scores_top5, 'query')
+matchms_scores_top5 = append_spectrum_metadata(matchms_scores_top5)
+
+matchms_scores_superclass_top5 = preprocess_data(
+    normalize_df(matchms_scores_top5.copy(), matches_norm_col=None),
+    ["superclass"],
+)
+larger_superclasses_top5 = matchms_scores_superclass_top5.groupby("superclass").filter(_has_more_than_two_rows)
+boxplot_b = create_plot(larger_superclasses_top5, "superclass", normalized_matches=False)
+boxplot_b.savefig("paper_plots/Fig4b_superclasses_boxplot.png", bbox_inches='tight')
+# plot name in the manuscript: "superclasses/20240223_boxplot_superclasses_top5.png"
+
+_save_hist2d(
+    matchms_scores_top5['matches'],
+    matchms_scores_top5['scores'] * 1000,
+    bins=([0, 1, 2, 3, 4, 5], 5),
+    x_label='ion matches',
+    out_path="paper_plots/Fig4b_superclasses_histogram.png",
+)
diff --git a/analysis/Python_scripts/Fig5_classes_boxplots.py b/analysis/Python_scripts/Fig5_classes_boxplots.py
@@ -0,0 +1,40 @@
+"""Save one Fig. 5 box plot of compound classes for every superclass.
+
+Classes with two or fewer rows are left out; superclasses with nothing left are skipped.
+"""
+import pandas as pd
+import os
+import numpy as np
+import math
+from matplotlib import pyplot as plt
+from rdkit import Chem
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+
+from utils import *
+from plotting import *
+
+
+matchms_scores = load_matchms_scores()
+merged = normalize_df(matchms_scores.copy())
+
+scores_preprocessed_hierarchy = preprocess_data(merged, ["superclass", "class", "subclass"])
+grouped_superclass = scores_preprocessed_hierarchy.groupby("superclass")
+grouping = "class"
+
+for group, members in grouped_superclass:
+    # Within this superclass keep only the classes that have more than two rows.
+    grp = members.groupby(grouping).filter(lambda rows: len(rows) > 2)
+    if len(grp) == 0:
+        continue
+    fig = create_plot(grp, grouping, showlegend=False, hide_labels=True)
+    fig.savefig(f"paper_plots/Fig5_{group}.png", bbox_inches='tight')
+# plot name in the manuscript in that order:
+# "classes/20240207_boxplot_benzenoids.png"
+# "classes/20240207_boxplot_lipids.png"
+# "classes/20240207_boxplot_organic_acids.png"
+# "classes/20240207_boxplot_organooxygen.png"
+# "classes/20240207_boxplot_organohalogen.png"
+# "classes/20240207_boxplot_organoheterocyclic.png"
+# "classes/20240207_boxplot_phenylpropanoids.png"
+
diff --git a/analysis/Python_scripts/Fig6_benzene_subclasses_boxplot.py b/analysis/Python_scripts/Fig6_benzene_subclasses_boxplot.py
@@ -0,0 +1,32 @@
+"""Save the Fig. 6 box plot of subclasses within benzene and substituted derivatives.
+
+Writes ``paper_plots/Fig6_benzene_subclasses.png`` when any subclass has more than six rows.
+"""
+import numpy as np
+import math
+from matplotlib import pyplot as plt
+from rdkit import Chem
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+
+from utils import *
+from plotting import *
+
+
+matchms_scores = load_matchms_scores()
+merged = normalize_df(matchms_scores.copy())
+
+scores_preprocessed_hierarchy = preprocess_data(merged, ["superclass", "class", "subclass"])
+
+target_class = "Benzene and substituted derivatives"
+grouping = "subclass"
+
+# Select the single class that is plotted instead of grouping and filtering
+# every class only to discard all of them but this one.
+class_rows = scores_preprocessed_hierarchy[scores_preprocessed_hierarchy["class"] == target_class]
+if len(class_rows) > 0:
+    grp = class_rows.groupby(grouping).filter(lambda x: len(x) > 6)
+    if len(grp) > 0:
+        fig = create_plot(grp, grouping, showlegend=False, hide_labels=True)
+        # Plain string: the file name has no placeholders to format.
+        fig.savefig("paper_plots/Fig6_benzene_subclasses.png", bbox_inches='tight')
diff --git a/analysis/Python_scripts/Fig7_nitrogen_comparison.py b/analysis/Python_scripts/Fig7_nitrogen_comparison.py
@@ -0,0 +1,35 @@
+"""Save the Fig. 7 box plots comparing compositions without and with nitrogen.
+
+One figure is written for the 'scores' column and one for the 'matches' column.
+"""
+from utils import *
+from plotting import boxplot_comparison
+
+matchms_scores = load_matchms_scores()
+merged_all_peaks_same = normalize_df(matchms_scores)
+mdf_comp = preprocess_data(merged_all_peaks_same, ["composition"])
+
+# Each entry of nitrogen_cols is the baseline composition at the same position plus N.
+baseline_cols= ['C,H', 'C,H,O', 'C,H,O,S', 'C,Cl,H,O', 'Br,C,H,O', 'C,Cl,H', 'C,Cl,H,O,S', 'C,Cl,F,H,O', 'C,H,O,P', 'C,H,O,P,S']
+nitrogen_cols = ['C,H,N', 'C,H,N,O','C,H,N,O,S', 'C,Cl,H,N,O', 'Br,C,H,N,O', 'C,Cl,H,N', 'C,Cl,H,N,O,S', 'C,Cl,F,H,N,O','C,H,N,O,P', 'C,H,N,O,P,S']
+
+# sort_index() without inplace returns a new frame, so the row selection taken
+# from mdf_comp is never mutated in place (avoids SettingWithCopyWarning).
+mdf_comp_baseline = mdf_comp.loc[mdf_comp['composition'].isin(baseline_cols)].sort_index(axis=1)
+mdf_comp_nitrogen = mdf_comp.loc[mdf_comp['composition'].isin(nitrogen_cols)].sort_index(axis=1)
+
+# (value column, colors argument, output file) for the two figures.
+figure_specs = [
+    ('scores', ['crimson', 'deepskyblue'], "paper_plots/Fig7_scores.png"),
+    ('matches', ["darkgoldenrod", "yellow"], "paper_plots/Fig7_matches.png"),
+]
+for value_column, colors, out_path in figure_specs:
+    comparison_figure = boxplot_comparison(
+        mdf_comp_baseline,
+        baseline_cols,
+        mdf_comp_nitrogen,
+        nitrogen_cols,
+        value_column,
+        colors=colors,
+    )
+    comparison_figure.savefig(out_path, bbox_inches='tight')
diff --git a/analysis/Python_scripts/Fig8_p_and_s.py b/analysis/Python_scripts/Fig8_p_and_s.py
@@ -0,0 +1,15 @@
+"""Save the Fig. 8 plot for compositions that contain the element symbol S or P."""
+from utils import *
+from plotting import create_plot
+
+matchms_scores = load_matchms_scores()
+merged_all_peaks_same = normalize_df(matchms_scores)
+mdf_comp = preprocess_data(merged_all_peaks_same, ["composition"])
+
+# Match S or P only as a whole comma-separated token. A bare 'S|P' substring
+# search also hits symbols such as Si (e.g. 'C,F,H,N,Si'), which previously
+# had to be excluded by hand.
+has_s_or_p = mdf_comp['composition'].str.contains(r'(?:^|,)[SP](?:,|$)')
+mdf_comp_ps = mdf_comp[has_s_or_p]
+mdf_comp_ps = mdf_comp_ps.groupby('composition').filter(lambda x: len(x) > 2)
+create_plot(mdf_comp_ps, "composition").savefig("paper_plots/Fig8_P_and_S.png", bbox_inches='tight')