Merge pull request #490 from Proteobench/472-make-the-dda-module-comp…

…atible-with-msangel-outputs 472 make the dda module compatible with msangel outputs
Proteobench · Dec 11, 2024 · 8102064 · 8102064
2 parents e31ff49 + a969cd8
commit 8102064
Show file tree

Hide file tree

Showing 11 changed files with 486 additions and 11 deletions.
diff --git a/docs/available-modules/2-quant-lfq-ion-dda.md b/docs/available-modules/2-quant-lfq-ion-dda.md
@@ -87,7 +87,6 @@ Table 2 provides an overview of the required input files for public submission.
 
 In FragPipe output files, the protein identifiers matching a given ion are in two separate columns: "Proteins" and "Mapped Proteins". So we concatenate these two fields to have the protein groups.
 
-
 ### i2MassChroQ
 A ProteoBench-compatible format is available in i2MassChroQ through the button `ProteoBench export`. It generates a tab-delimited file containing one row per quantified ion for metric calculation ("proteobench_export.tsv"; column headers are: "rawfile", "sequence", "ProForma", "charge", "proteins" and "area"); and a parameter file for public submission ("Project parameters.tsv"). Like with the other tools, the protein identifiers should be in the format "sp|P49327|FAS_HUMAN". 
 Link to the i2MassChroQ documentation [here](http://pappso.inrae.fr/bioinfo/i2masschroq/documentation/html/).
@@ -106,7 +105,7 @@ The field "Proteins" in **the "evidence.txt" table should report proteins in the
 In the recent versions of MaxQuant, the default settings work perfectly (`Identifier rule = >([^\s]*)`; `Description rule = >(.*)`).
 Some older versions of MaxQuant do not provide the option to change fasta header parsing. These are not compatible with ProteoBench.
 
-### Proline Studio (work in progress..)
+### Proline Studio 
 Make sure that the peaklists are named with the same prefix as raw files. To do so in ProlineStudio, use peaklist names as sample names (manually or with automatic renaming option).
 
 ![ProlineStudio Naming](../../img/module_docs/quant_lfq_ion_DDA/ProlineStudio_naming.png)
@@ -117,6 +116,10 @@ The `Quantified peptide ions` tab reports the precursor ion quantities (retrieve
 
 For public submission, you can upload the same excel export, just make sure to have the tabs `Search settings and infos`, `Import and filters`, `Quant config`. For local usage and public submission, we strongly recommend to use the following [template.json](../../files_provided_to_users/quant_lfq_ion_DDA/ProlineStudio/template.json) to make sure that all the tabs and columns needed are exported to be correctly parsed. Make sure that no personal information is stored in the excel file before making it public. The version of ProlineStudio is only exported in the parameters from version 2.3. 
 
+### MSAngel (work in progress..)
+MSAngel allows users to build pipelines for bottom-up MS analysis with a choice of search engines, a validation strategy, and Proline quantification.
+More information can be found [here](https://www.profiproteomics.fr/ms-angel/).
+
 ### Sage
 
 1. Convert .raw files into .mzML using MSConvert or ThermoRawFileParser **(do not change the file names)**

diff --git a/proteobench/io/params/MSAngel.py b/proteobench/io/params/MSAngel.py
@@ -0,0 +1,122 @@
+"""MSAngel creates modular pipelines that allow several search engines to identify
+peptides, which are then quantified with Proline.
+The parameters are provided in a .json file.
+MSAngel allows for multiple search engines to be used in the same pipeline. So it 
+requires a list of search engines and their respective parameters, which are then 
+concatenated.
+
+Relevant information in file:
+
+"""
+
+import json
+import pathlib
+from typing import Union
+
+import pandas as pd
+
+from proteobench.io.params import ProteoBenchParameters
+
+
+def extract_search_engine(search_params: list) -> dict:
+    """
+    Extract search engine parameters from the JSON data.
+    The parameter format depends on the search engine used, so this functino needs to be
+    updated for each search engine. Currently, it is set up for:
+    . Mascot
+    """
+
+    all_search_engines = []
+    for each_search_params in search_params["operations"]:
+        print("1")
+        if "searchEnginesWithForms" in each_search_params:
+            all_search_engines.append(each_search_params["searchEnginesWithForms"][0][0])
+
+    return all_search_engines
+
+
+def extract_params(fname: Union[str, pathlib.Path]) -> ProteoBenchParameters:
+    """
+    Parse MSAangel quantification tool JSON parameter file and extract relevant parameters.
+
+    Args:
+        fname (str or pathlib.Path): The path to the Sage JSON parameter file.
+
+    Returns:
+        ProteoBenchParameters: The extracted parameters as a `ProteoBenchParameters` object.
+    """
+    params = ProteoBenchParameters()
+
+    try:
+        # If the input is a file-like object (e.g., StringIO), decode it
+        file_contents = fname.getvalue().decode("utf-8")
+        data = json.loads(file_contents)
+    except AttributeError:
+        # Otherwise, treat it as a file path
+        with open(fname, "r") as file_contents:
+            data = json.load(file_contents)
+
+    # Extract parameters from the JSON data
+    params.software_name = "MSAngel"
+    params.software_version = data["msAngelVersion"]
+
+    ## Extract the search engine(s) parameters before concatenating them:
+    all_search_engines = extract_search_engine(data)
+    params.search_engines = all_search_engines.join(",")
+    all_search_engines = []
+    all_enzyme = []
+    all_allowed_miscleavages = []
+    all_fixed_mods = []
+    all_variable_mods = []
+
+    # TODO needs to have actual values
+    all_search_params = {}
+
+    for key, value in all_search_params.items():
+        all_search_engines.append(value["format"])
+        all_enzyme.append(value["enzyme"]["cleave_at"])
+        all_allowed_miscleavages.append(value["enzyme"]["missed_cleavages"])
+        all_fixed_mods.append(value["static_mods"])
+        all_variable_mods.append(value["variable_mods"])
+
+    # TODO need to have an actual value
+    params.search_engine = ""
+    params.search_engine_version = data["version"]
+    params.enzyme = data["database"]["enzyme"]["cleave_at"]
+    params.allowed_miscleavages = data["database"]["enzyme"]["missed_cleavages"]
+    params.fixed_mods = data["database"]["static_mods"]
+    params.variable_mods = data["database"]["variable_mods"]
+
+    try:
+        params.precursor_mass_tolerance = data["precursor_tol"]["ppm"]
+    except KeyError:
+        params.precursor_mass_tolerance = data["precursor_tol"]["Da"]
+
+    params.fragment_mass_tolerance = data["fragment_tol"]["ppm"]
+    params.min_peptide_length = data["database"]["enzyme"]["min_len"]
+    params.max_peptide_length = data["database"]["enzyme"]["max_len"]
+    params.max_mods = data["database"]["max_variable_mods"]
+    params.min_precursor_charge = data["precursor_charge"][0]
+    params.max_precursor_charge = data["precursor_charge"][1]
+    params.enable_match_between_runs = True
+
+    return params
+
+
+if __name__ == "__main__":
+    """
+    Extract parameters from MSAngel JSON files and save them as CSV.
+    """
+    from pathlib import Path
+
+    file = Path("../../../test/params/msangel_results.json")
+
+    # Extract parameters from the file
+    params = extract_params(file)
+
+    # Convert the extracted parameters to a dictionary and then to a pandas Series
+    data_dict = params.__dict__
+    series = pd.Series(data_dict)
+
+    # Write the Series to a CSV file
+    series.to_csv(file.with_suffix(".csv"))
diff --git a/proteobench/io/params/sage.py b/proteobench/io/params/sage.py
@@ -1,10 +1,4 @@
-"""Proline is a quantification tool. Search engine is often Mascot.
-The parameters are provided per raw file in separate sheets of an Excel file.
-
-Relevant information in sheets:
-- "Search settings and infos",
-- "Import and filters"
-- "Quant config"
+"""Sage parameter extraction.
 """
 
 import json

diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msangel.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/ion/DDA/parse_settings_msangel.toml
@@ -0,0 +1,31 @@
+[mapper]
+"proteins" = "Proteins" # this will be the combination of samesets_accessions and subset_accessions after import. Specific to ProlineStudio and MSAngel.
+"sequence" = "Sequence"
+"modifications" = "Modifications"
+"master_quant_peptide_ion_charge" = "Charge"
+
+
+[condition_mapper]
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01" = "A"
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02" = "A"
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03" = "A"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01" = "B"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02" = "B"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03" = "B"
+
+[run_mapper]
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01"
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02" = "Condition_A_Sample_Alpha_02"
+"abundance_LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03" = "Condition_A_Sample_Alpha_03"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01" = "Condition_B_Sample_Alpha_01"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02" = "Condition_B_Sample_Alpha_02"
+"abundance_LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03" = "Condition_B_Sample_Alpha_03"
+
+[species_mapper]
+"_YEAST" = "YEAST"
+"_ECOLI" = "ECOLI"
+"_HUMAN" = "HUMAN"
+
+[general]
+"contaminant_flag" = "Cont_"
+"decoy_flag" = true
diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml
@@ -4,6 +4,7 @@
 "i2MassChroQ" = "parse_settings_i2massChroQ.toml"
 "MaxQuant" = "parse_settings_maxquant.toml"
 "ProlineStudio" = "parse_settings_proline.toml"
+"MSAngel" = "parse_settings_msangel.toml"
 "Sage" = "parse_settings_sage.toml"
 "Custom" = "parse_settings_custom.toml"
 

diff --git a/proteobench/io/parsing/parse_ion.py b/proteobench/io/parsing/parse_ion.py
@@ -30,7 +30,9 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
     elif input_format == "WOMBAT":
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
         input_data_frame["proforma"] = input_data_frame["modified_peptide"]
-    elif input_format == "ProlineStudio":
+    elif (
+        input_format == "ProlineStudio" or input_format == "MSAngel"
+    ):  # Proline Studio and MSAngel have the same output format but the .toml is different
         input_data_frame = pd.read_excel(
             input_csv,
             sheet_name="Quantified peptide ions",

diff --git a/proteobench/modules/quant/quant_base/quant_base_module.py b/proteobench/modules/quant/quant_base/quant_base_module.py
@@ -27,6 +27,8 @@
 from proteobench.io.params.maxquant import extract_params as extract_params_maxquant
 from proteobench.io.params.msaid import extract_params as extract_params_msaid
 from proteobench.io.params.proline import extract_params as extract_params_proline
+
+# from proteobench.io.params.msangel import extract_params as extract_params_msangel
 from proteobench.io.params.sage import extract_params as extract_params_sage
 from proteobench.io.params.spectronaut import (
     read_spectronaut_settings as extract_params_spectronaut,
@@ -59,6 +61,7 @@ class QuantModule:
     EXTRACT_PARAMS_DICT: Dict[str, Any] = {
         "MaxQuant": extract_params_maxquant,
         "ProlineStudio": extract_params_proline,
+        # "MSAngel": extract_params_msangel,
         "AlphaPept": extract_params_alphapept,
         "Sage": extract_params_sage,
         "FragPipe": extract_params_fragger,

diff --git a/test/data/dda_quant/MSAngel_DDA_quan_ions_subset.xlsx b/test/data/dda_quant/MSAngel_DDA_quan_ions_subset.xlsx