Merge pull request #10 from RECETOX/hechth/issue8

Add matchms with 0 scores into the output structure
RECETOX · Oct 31, 2023 · 616ca13 · 616ca13
2 parents 02d0c18 + 2a1fbf5
commit 616ca13
Show file tree

Hide file tree

Showing 8 changed files with 247,935 additions and 303,850 deletions.
diff --git a/analysis/Python_scripts/add_zeros.ipynb b/analysis/Python_scripts/add_zeros.ipynb
@@ -0,0 +1,111 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from matchms.importing import load_from_msp\n",
+    "from matchms.logging_functions import set_matchms_logger_level\n",
+    "import itertools\n",
+    "\n",
+    "set_matchms_logger_level(\"ERROR\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predicted_spectra = list(load_from_msp(\"../data/filtered/simulated_matchms_filter_1%I_all_peaks.msp\"))\n",
+    "predicted_spectra_metadata= pd.DataFrame.from_dict([x.metadata for x in predicted_spectra])\n",
+    "predicted_spectra_metadata.rename(columns={'compound_name': 'query'}, inplace=True)\n",
+    "predicted_spectra_names = predicted_spectra_metadata['query'].to_list()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "experimental_spectra = list(load_from_msp(\"../data/experimental/RECETOX_GC-EI_MS_20201028.msp\"))\n",
+    "experimental_spectra_metadata= pd.DataFrame.from_dict([x.metadata for x in experimental_spectra])\n",
+    "experimental_spectra_metadata.rename(columns={'compound_name': 'reference'}, inplace=True)\n",
+    "experimental_spectra_names = experimental_spectra_metadata['reference'].to_list()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_combinations = set(itertools.product(predicted_spectra_names, experimental_spectra_names))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matches = pd.read_csv(\"../data/output_matching/matchms/matchms_tol_0.01_1%I_top5.tsv\", sep=\"\\t\")\n",
+    "matches.set_index(['query', 'reference'], inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "existing_rows = set(matches.index.to_list())\n",
+    "rows_to_add = list(all_combinations - existing_rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_zeros = pd.DataFrame(0, index=rows_to_add, columns=matches.columns)\n",
+    "merged = pd.concat([matches, all_zeros]).reset_index(names=['query', 'reference'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged.to_csv(\"../data/output_matching/matchms/matchms_tol_0.01_1%I_top5_with_0s.tsv\", sep=\"\\t\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/analysis/Python_scripts/chemical_composition_boxplot.png b/analysis/Python_scripts/chemical_composition_boxplot.png
diff --git a/analysis/Python_scripts/scatterplot.ipynb b/analysis/Python_scripts/scatterplot.ipynb
diff --git a/analysis/Python_scripts/utils.py b/analysis/Python_scripts/utils.py
@@ -45,11 +45,11 @@ def append_classes(df, left_on):
         "S": [has_atom(m, 'S') for m in molecules],
         "P": [has_atom(m, 'P') for m in molecules],
         "Si": [has_atom(m, 'Si') for m in molecules],
-        "C,O,N,H": [has_organic_atoms(m) for m in molecules],
+#        "C,O,N,H": [has_organic_atoms(m) for m in molecules],
     })
     merged_df = pd.merge(df, class_names, left_on=left_on, right_on='molname')
     return merged_df
 
 # Define a function to map the true columns to a list of names
 def get_true_names(row, df):
-    return [col for col in df.columns[11:18] if row[col]]
+    return [col for col in df.columns[11:17] if row[col]]