From 57d94eced9230ca090fce53869bd6903b9f0d95c Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 4 Oct 2023 16:16:46 +0000 Subject: [PATCH 01/20] draft of additional pyopenms reader --- psm_utils/io/idxml.py | 80 +++++++++++++++++++++++++++++++------------ pyproject.toml | 1 + 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index fced5ca..932873d 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -13,9 +13,13 @@ from __future__ import annotations +from abc import abstractmethod import re -from pyteomics.openms import idxml +try: + import pyopenms as oms +except ImportError: + from pyteomics.openms import idxml from psm_utils.io._base_classes import ReaderBase from psm_utils.psm import PSM @@ -26,20 +30,19 @@ MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]") MOD_PATTERN_CTERM = re.compile(r"\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]$") - -class IdXMLReader(ReaderBase): +class AIdXMLReader(ReaderBase): + @abstractmethod def __iter__(self): - """Iterate over file and return PSMs one-by-one.""" - with idxml.read(str(self.filename)) as reader: - for entry in reader: - for peptide_hit in entry["PeptideHit"]: - yield self._parse_psm(entry, peptide_hit) - + raise NotImplementedError() + + @abstractmethod + def _parse_psm(self): + raise NotImplementedError() + @staticmethod def _parse_peptidoform(sequence: str, charge: int): """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. - Notes ----- Implemented according to the documentation on @@ -56,23 +59,21 @@ def _parse_peptidoform(sequence: str, charge: int): sequence += f"/{charge}" return sequence - @staticmethod - def _parse_is_decoy(target_decoy: str): - if target_decoy == "target": - return False - elif target_decoy == "decoy": - return True - elif target_decoy == "target+decoy": - return False - else: - return None + +class IdXMLReader(AIdXMLReader): + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + with idxml.read(str(self.filename)) as reader: + for entry in reader: + for peptide_hit in entry["PeptideHit"]: + yield self._parse_psm(entry, peptide_hit) def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM: """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`.""" return PSM( peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]), spectrum_id=entry["spectrum_reference"], - is_decoy=self._parse_is_decoy(peptide_hit["target_decoy"]), + is_decoy= peptide_hit["target_decoy"] == "decoy", score=peptide_hit["score"], precursor_mz=entry["MZ"], retention_time=entry["RT"], @@ -84,3 +85,40 @@ def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM: "idxml:significance_threshold": str(entry["significance_threshold"]), }, ) + + +class OmsIdXMLReader(AIdXMLReader): + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + # TODO: This is a bit hacky, is there a cleaner way to do this? + try: + protein_ids, peptide_ids = [], [] + oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) + except NameError: + raise ModuleNotFoundError("Install pyopenms to use OmsIdXMLReader.") + for peptide_id in peptide_ids: + for peptide_hit in peptide_id.getHits(): + yield self._parse_psm(peptide_id, peptide_hit) + + def _parse_psm(self, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM: + """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM` using pyopenms""" + return PSM( + peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()), + spectrum_id = peptide_id.getMetaValue("spectrum_reference"), + is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy", + score = peptide_hit.getScore(), + precursor_mz = peptide_id.getMZ(), + retention_time = peptide_id.getRT(), + # TODO: ion mobility is currently not parsed to idXML so this needs to be read out from mzml + protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()], + rank = peptide_hit.getRank() + 1, # 0-based to 1-based + source="idXML", + # TODO: Store all peptide hit related metadata? + metadata = { + "idxml:score_type": str(peptide_id.getScoreType()), + "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), + "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()) + } + # TODO: provenance_data + # TODO: rescoring_features (Is there a fixed nomenclature for using these?) + ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 96cbe4d..bf547f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ docs = [ "sphinx-autobuild", ] online = ["streamlit", "plotly"] +pyopenms = ["pyopenms"] [project.urls] GitHub = "https://github.com/compomics/psm_utils" From 6c6441bc727c1982e9633b92087db1ebc56e1b90 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 10 Oct 2023 13:34:01 +0000 Subject: [PATCH 02/20] Complete version of IdXMLReader solely with pyopenms --- psm_utils/io/idxml.py | 211 +++++++++++++++++++++++++++++------------- 1 file changed, 147 insertions(+), 64 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 932873d..9dff8c5 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -14,33 +14,113 @@ from __future__ import annotations from abc import abstractmethod +from typing import Iterable, Tuple, List import re +from pathlib import Path -try: - import pyopenms as oms -except ImportError: - from pyteomics.openms import idxml +import pyopenms as oms -from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.psm import PSM from psm_utils.psm_list import PSMList +from psm_utils.exceptions import PSMUtilsException # Patterns to match open and closed round/square brackets MOD_PATTERN = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)") MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]") MOD_PATTERN_CTERM = re.compile(r"\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]$") -class AIdXMLReader(ReaderBase): - @abstractmethod - def __iter__(self): - raise NotImplementedError() - - @abstractmethod - def _parse_psm(self): - raise NotImplementedError() - - @staticmethod - def _parse_peptidoform(sequence: str, charge: int): +# Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator +# https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp +RESOCRING_FEATURE_LIST = [ + "isotope_error" + "MS:1002049", # MSGFPlus unchanged RawScore + "MS:1002050", # MSGFPlus unchanged DeNovoScore + "MSGF:ScoreRatio", + "MSGF:Energy", + "MSGF:lnEValue", + "MSGF:lnExplainedIonCurrentRatio", + "MSGF:lnNTermIonCurrentRatio", + "MSGF:lnCTermIonCurrentRatio", + "MSGF:lnMS2IonCurrent", + "MSGF:MeanErrorTop7", + "MSGF:sqMeanErrorTop7", + "MSGF:StdevErrorTop7", + "XTANDEM:hyperscore", + "XTANDEM:deltascore", + "MS:1001330", # expect_score + "hyperscore", # MSfragger + "nextscore", # MSfragger + "COMET:deltaCn", # recalculated deltaCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1) + "COMET:deltaLCn", # deltaLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1) + "COMET:lnExpect", # log(E-value) + "MS:1002252", # unchanged XCorr + "MS:1002255", # unchanged Sp = number of candidate peptides + "COMET:lnNumSP", # log(number of candidate peptides) + "COMET:lnRankSP", # log(rank based on Sp score) + "COMET:IonFrac", # matched_ions / total_ions + "MS:1001171", # unchanged mScore + "MASCOT:delta_score", # delta score based on mScore + "CONCAT:lnEvalue", + "CONCAT:deltaLnEvalue", + "SAGE:ln(-poisson)" + "SAGE:ln(delta_best)", + "SAGE:ln(delta_next)", + "SAGE:ln(matched_intensity_pct)", + "SAGE:longest_b", + "SAGE:longest_y", + "SAGE:longest_y_pct", + "SAGE:matched_peaks", + "SAGE:scored_candidates" +] + +class IdXMLReader(ReaderBase): + def __init__(self, filename: str | Path, *args, **kwargs) -> None: + """ + Reader for idXML files. + + Parameters + ---------- + filename: str, pathlib.Path + Path to idXML file. + Examples + -------- + >>> from psm_utils.io import IdXMLReader + >>> reader = IdXMLReader("example.idXML") + >>> psm_list = [psm for psm in reader] + """ + super().__init__(filename, *args, **kwargs) + self.filename = Path(filename) + protein_ids, peptide_ids = self._parse_idxml() + self.protein_ids = protein_ids + self.peptide_ids = peptide_ids + self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) + + def __iter__(self) -> Iterable[PSM]: + """ + Iterate over file and return PSMs one-by-one. + """ + for peptide_id in self.peptide_ids: + for peptide_hit in peptide_id.getHits(): + yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) + + def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): + """ + Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty. + """ + protein_ids, peptide_ids = [], [] + oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) + + if len(protein_ids) == 0: + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.") + elif len(peptide_ids) == 0: + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentifcation` to parse.") + elif len(peptide_ids[0].getHits()) == 0: + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.") + else: + return protein_ids, peptide_ids + + def _parse_peptidoform(self, sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. Notes @@ -57,68 +137,71 @@ def _parse_peptidoform(sequence: str, charge: int): sequence = MOD_PATTERN_CTERM.sub(r"-[\1]", sequence) sequence = sequence.strip(".") sequence += f"/{charge}" - return sequence - - -class IdXMLReader(AIdXMLReader): - def __iter__(self): - """Iterate over file and return PSMs one-by-one.""" - with idxml.read(str(self.filename)) as reader: - for entry in reader: - for peptide_hit in entry["PeptideHit"]: - yield self._parse_psm(entry, peptide_hit) - - def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM: - """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`.""" - return PSM( - peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]), - spectrum_id=entry["spectrum_reference"], - is_decoy= peptide_hit["target_decoy"] == "decoy", - score=peptide_hit["score"], - precursor_mz=entry["MZ"], - retention_time=entry["RT"], - protein_list=[protein["accession"] for protein in peptide_hit["protein"]], - source="idXML", - metadata={ - "idxml:score_type": str(entry["score_type"]), - "idxml:higher_score_better": str(entry["higher_score_better"]), - "idxml:significance_threshold": str(entry["significance_threshold"]), - }, - ) + return sequence -class OmsIdXMLReader(AIdXMLReader): - def __iter__(self): - """Iterate over file and return PSMs one-by-one.""" - # TODO: This is a bit hacky, is there a cleaner way to do this? - try: - protein_ids, peptide_ids = [], [] - oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) - except NameError: - raise ModuleNotFoundError("Install pyopenms to use OmsIdXMLReader.") - for peptide_id in peptide_ids: - for peptide_hit in peptide_id.getHits(): - yield self._parse_psm(peptide_id, peptide_hit) - def _parse_psm(self, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM: - """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM` using pyopenms""" + def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM: + """ + Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. + Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentifcation` + to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. + """ return PSM( peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()), spectrum_id = peptide_id.getMetaValue("spectrum_reference"), + run = self._get_run(protein_ids, peptide_id), is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy", score = peptide_hit.getScore(), precursor_mz = peptide_id.getMZ(), retention_time = peptide_id.getRT(), - # TODO: ion mobility is currently not parsed to idXML so this needs to be read out from mzml + # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()], rank = peptide_hit.getRank() + 1, # 0-based to 1-based source="idXML", - # TODO: Store all peptide hit related metadata? + # This is needed to calculate a qvalue before rescoring the PSMList metadata = { "idxml:score_type": str(peptide_id.getScoreType()), "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()) - } - # TODO: provenance_data - # TODO: rescoring_features (Is there a fixed nomenclature for using these?) - ) \ No newline at end of file + }, + rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features if self._is_float(peptide_hit.getMetaValue(key))} + ) + + def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: + """ + Get run name from idXML using pyopenms + If idXML contains merge index, use it to annotate the run name without extension + """ + if peptide_id.metaValueExists("id_merge_index"): + return Path(protein_ids[0].getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")].decode()).stem + else: + return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem + + def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: + """ + Get list of rescoring features from idXML + """ + # Fill the key list with all the keys from the PeptideHit in the pyopenms way + keys = [] + peptide_hit.getKeys(keys) + # Convert from byte to str and remove the keys, which values are not in the fixed list of features + keys = [key.decode() for key in keys if key.decode() in RESOCRING_FEATURE_LIST] + + return keys + + def _is_float(self, element: any) -> bool: + """ + Check if element is float. + """ + if element is None: + return False + try: + float(element) + return True + except ValueError: + return False + + +class IdXMLReaderEmptyListException(PSMUtilsException): + """Exception in psm_utils.io.IdXMLReader""" \ No newline at end of file From e422496f95bba9aed3c86c9a7549ee8bd909925e Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 10 Oct 2023 13:36:11 +0000 Subject: [PATCH 03/20] pyopenms as fixed dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf547f8..ba85cab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dynamic = ["version"] requires-python = ">=3.7" dependencies = [ "pyteomics >= 4", + "pyopenms", "lxml", "psims", "pandas", @@ -44,7 +45,6 @@ docs = [ "sphinx-autobuild", ] online = ["streamlit", "plotly"] -pyopenms = ["pyopenms"] [project.urls] GitHub = "https://github.com/compomics/psm_utils" From 503e64dd05d52ac8a99617eec9d769451b7c7ff6 Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:35:05 +0200 Subject: [PATCH 04/20] Update psm_utils/io/idxml.py Co-authored-by: Ralf Gabriels --- psm_utils/io/idxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 9dff8c5..dcfafc4 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -32,7 +32,7 @@ # Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator # https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp -RESOCRING_FEATURE_LIST = [ +RESCORING_FEATURE_LIST = [ "isotope_error" "MS:1002049", # MSGFPlus unchanged RawScore "MS:1002050", # MSGFPlus unchanged DeNovoScore From 9287015e887bf3680b295cf2aae1ef4881ae2ef1 Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:47:31 +0200 Subject: [PATCH 05/20] nitpicks --- psm_utils/io/idxml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index dcfafc4..4e4f31c 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -14,7 +14,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import Iterable, Tuple, List +from typing import Union, Iterable, Tuple, List import re from pathlib import Path @@ -75,7 +75,7 @@ ] class IdXMLReader(ReaderBase): - def __init__(self, filename: str | Path, *args, **kwargs) -> None: + def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: """ Reader for idXML files. @@ -83,6 +83,7 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None: ---------- filename: str, pathlib.Path Path to idXML file. + Examples -------- >>> from psm_utils.io import IdXMLReader @@ -123,6 +124,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica def _parse_peptidoform(self, sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. + Notes ----- Implemented according to the documentation on @@ -186,7 +188,7 @@ def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: keys = [] peptide_hit.getKeys(keys) # Convert from byte to str and remove the keys, which values are not in the fixed list of features - keys = [key.decode() for key in keys if key.decode() in RESOCRING_FEATURE_LIST] + keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST] return keys @@ -204,4 +206,4 @@ def _is_float(self, element: any) -> bool: class IdXMLReaderEmptyListException(PSMUtilsException): - """Exception in psm_utils.io.IdXMLReader""" \ No newline at end of file + """Exception in psm_utils.io.IdXMLReader""" From 0a2db3e4a32ef3c9ef9cf57fa96ce591feae63a2 Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:49:12 +0200 Subject: [PATCH 06/20] typo --- psm_utils/io/idxml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 4e4f31c..73205e6 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -115,7 +115,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica if len(protein_ids) == 0: raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.") elif len(peptide_ids) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentifcation` to parse.") + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentification` to parse.") elif len(peptide_ids[0].getHits()) == 0: raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.") else: @@ -146,7 +146,7 @@ def _parse_peptidoform(self, sequence: str, charge: int) -> str: def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM: """ Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. - Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentifcation` + Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. """ return PSM( From 042c91eb3aeb6a2b58c01870ba81bb49baaa978a Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 10 Oct 2023 19:38:44 +0000 Subject: [PATCH 07/20] move idxml parsing to iter, add decoy function --- psm_utils/io/idxml.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 73205e6..66eef38 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -92,18 +92,17 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: """ super().__init__(filename, *args, **kwargs) self.filename = Path(filename) - protein_ids, peptide_ids = self._parse_idxml() - self.protein_ids = protein_ids - self.peptide_ids = peptide_ids - self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) + self.rescoring_features = None def __iter__(self) -> Iterable[PSM]: """ Iterate over file and return PSMs one-by-one. """ - for peptide_id in self.peptide_ids: + protein_ids, peptide_ids = self._parse_idxml() + self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) + for peptide_id in peptide_ids: for peptide_hit in peptide_id.getHits(): - yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) + yield self._parse_psm(protein_ids, peptide_id, peptide_hit) def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): """ @@ -113,11 +112,11 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) if len(protein_ids) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.") + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins. Nothing to parse.") elif len(peptide_ids) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentification` to parse.") + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideIdentifications. Nothing to parse.") elif len(peptide_ids[0].getHits()) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.") + raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideHits. Nothing to parse.") else: return protein_ids, peptide_ids @@ -153,7 +152,7 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()), spectrum_id = peptide_id.getMetaValue("spectrum_reference"), run = self._get_run(protein_ids, peptide_id), - is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy", + is_decoy = self._is_decoy(peptide_hit), score = peptide_hit.getScore(), precursor_mz = peptide_id.getMZ(), retention_time = peptide_id.getRT(), @@ -167,7 +166,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()) }, - rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features if self._is_float(peptide_hit.getMetaValue(key))} + rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features \ + if self._is_float(peptide_hit.getMetaValue(key))} ) def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: @@ -204,6 +204,14 @@ def _is_float(self, element: any) -> bool: except ValueError: return False + def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool: + """ + Check if PSM is target or decoy + """ + if peptide_hit.metaValueExists("target_decoy"): + return peptide_hit.getMetaValue("target_decoy") == "decoy" + else: + return None class IdXMLReaderEmptyListException(PSMUtilsException): """Exception in psm_utils.io.IdXMLReader""" From 83e8792251bcb18668b6a60015741c46deff13a7 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 11 Oct 2023 09:13:10 +0000 Subject: [PATCH 08/20] add intersphinx config --- docs/source/conf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3c05a45..36b1602 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -40,6 +40,14 @@ autodoc_member_order = "bysource" autoclass_content = "init" +# Intersphinx options +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pyteomics": ("https://pyteomics.readthedocs.io/en/latest/", None), + "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None) +} def setup(app): config = { From 280f671dd07875d78156202b39858f75808dc34d Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Fri, 13 Oct 2023 09:40:59 +0000 Subject: [PATCH 09/20] Adjustments to idxml reader --- psm_utils/io/idxml.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 66eef38..1757e37 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -92,17 +92,18 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: """ super().__init__(filename, *args, **kwargs) self.filename = Path(filename) - self.rescoring_features = None + protein_ids, peptide_ids = self._parse_idxml() + self.protein_ids = protein_ids + self.peptide_ids = peptide_ids + self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) def __iter__(self) -> Iterable[PSM]: """ Iterate over file and return PSMs one-by-one. """ - protein_ids, peptide_ids = self._parse_idxml() - self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) - for peptide_id in peptide_ids: + for peptide_id in self.peptide_ids: for peptide_hit in peptide_id.getHits(): - yield self._parse_psm(protein_ids, peptide_id, peptide_hit) + yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): """ @@ -120,6 +121,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica else: return protein_ids, peptide_ids + def _parse_peptidoform(self, sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. @@ -148,8 +150,9 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. """ + peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()) return PSM( - peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()), + peptidoform = peptidoform, spectrum_id = peptide_id.getMetaValue("spectrum_reference"), run = self._get_run(protein_ids, peptide_id), is_decoy = self._is_decoy(peptide_hit), @@ -160,6 +163,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()], rank = peptide_hit.getRank() + 1, # 0-based to 1-based source="idXML", + # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer + provenance_data = {str(peptidoform): peptide_hit.getSequence().toString()}, # This is needed to calculate a qvalue before rescoring the PSMList metadata = { "idxml:score_type": str(peptide_id.getScoreType()), @@ -213,5 +218,6 @@ def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool: else: return None + class IdXMLReaderEmptyListException(PSMUtilsException): - """Exception in psm_utils.io.IdXMLReader""" + """Exception in psm_utils.io.IdXMLReader""" \ No newline at end of file From 00d3577cefdcf23301734a37be40b33a95b57d84 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Fri, 13 Oct 2023 11:13:11 +0000 Subject: [PATCH 10/20] Add tests for reader --- tests/test_data/test.idXML | 153 ++++++++++++++++++++++++++++++++++++ tests/test_io/test_idxml.py | 65 ++++++++++++++- 2 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 tests/test_data/test.idXML diff --git a/tests/test_data/test.idXML b/tests/test_data/test.idXML new file mode 100644 index 0000000..c1983ad --- /dev/null +++ b/tests/test_data/test.idXML @@ -0,0 +1,153 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 808f1a1..2d32bca 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -1,9 +1,10 @@ """Tests for psm_utils.io.idxml.""" from psm_utils.io.idxml import IdXMLReader - +from psm_utils.psm import PSM class TestIdXMLReader: + def test__parse_peptidoform(self): test_cases = [ (("DFPIAMGER", 2), "DFPIAMGER/2"), @@ -20,3 +21,65 @@ def test__parse_peptidoform(self): for test_in, expected_out in test_cases: assert IdXMLReader._parse_peptidoform(*test_in) == expected_out + + + def test__parse_psm(self): + test_psm = PSM( + peptidoform=Peptidoform('LVPIWKK/2'), + spectrum_id='controllerType=0 controllerNumber=1 scan=294', + run='HepG2_rep3_small', + collection=None, + spectrum=None, + is_decoy=True, + score=1.0099999904632568, + qvalue=None, + pep=None, + precursor_mz=442.29595853189994, + retention_time=849.0, + ion_mobility=None, + protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], + rank=1, + source='idXML', + provenance_data={'LVPIWKK/2': 'LVPIWKK'}, + metadata={ + 'idxml:score_type': 'expect', + 'idxml:higher_score_better': 'False', + 'idxml:significance_threshold': '0.0' + }, + rescoring_features={'MS:1002252': 0.693, + 'COMET:deltaCn': 1.0, + 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25} + ) + + reader = IdXMLReader("./tests/test_data/test.idXML") + psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0]) + assert psm.peptidoform == test_psm.peptidoform + assert psm.spectrum_id == test_psm.spectrum_id + assert psm.run == test_psm.run + assert psm.collection == test_psm.collection + assert psm.spectrum == test_psm.spectrum + assert psm.is_decoy == test_psm.is_decoy + assert psm.score == test_psm.score + assert psm.qvalue == test_psm.qvalue + assert psm.pep == test_psm.pep + assert psm.precursor_mz == test_psm.precursor_mz + assert psm.retention_time == test_psm.retention_time + assert psm.ion_mobility == test_psm.ion_mobility + assert psm.protein_list == test_psm.protein_list + assert psm.rank == test_psm.rank + assert psm.source == test_psm.source + assert psm.provenance_data == test_psm.provenance_data + assert psm.metadata == test_psm.metadata + assert psm.rescoring_features == test_psm.rescoring_features + + + def test__get_run(): + reader = IdXMLReader("./tests/test_data/test.idXML") + expected_output = "HepG2_rep3_small" + assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output + + + def _get_rescoring_features(): + reader = IdXMLReader("./tests/test_data/test.idXML") + expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac'] + assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output From c4c646edb75e0e1a6d0767d7eaf109f448fa786a Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Fri, 13 Oct 2023 11:22:41 +0000 Subject: [PATCH 11/20] Add idxml writer --- psm_utils/io/idxml.py | 180 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 179 insertions(+), 1 deletion(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 1757e37..b99f828 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -220,4 +220,182 @@ def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool: class IdXMLReaderEmptyListException(PSMUtilsException): - """Exception in psm_utils.io.IdXMLReader""" \ No newline at end of file + """Exception in psm_utils.io.IdXMLReader""" + + +class IdXMLWriter(WriterBase): + def __init__( + self, + filename: str | Path, + protein_ids = None, + peptide_ids = None, + *args, + **kwargs, + ): + """ + Writer for idXML files. + + Parameters + ---------- + filename: str, Pathlib.Path + Path to PSM file. + run : str, optional + Name of the MS run. Usually the spectrum file filename without + extension. + + Notes + ----- + - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing + a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList + to a file per collection with the :py:meth:`write_file` method is currently supported. + - If :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` + objects are provided, the :py:meth:`write_file` method will update the objects with novel + features from rescoring_features dictionary of the :py:class:`~psm_utils.psm.PSM` objects + in the PSMList and write them to a idXML file. + """ + super().__init__(filename, *args, **kwargs) + self.protein_ids = protein_ids + self.peptide_ids = peptide_ids + self._writer = None + + def __enter__(self) -> IdXMLWriter: + return self + + def __exit__(self, *args, **kwargs) -> None: + pass + + def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: + """ + Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information + from :py:class:`~psm_utils.psm.PSM`. + """ + peptide_hit.setScore(psm.score) + peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based + for feature, value in psm.rescoring_features.items(): + if feature not in RESCORING_FEATURE_LIST: + peptide_hit.setMetaValue(feature, value) + + def _convert_proforma_to_unimod(self, sequence: str) -> str: + """ + Convert a peptidoform sequence in proforma notation to UNIMOD notation. + """ + # Replace square brackets with parentheses + sequence = sequence.replace("[", "(").replace("]", ")") + + # Check for N-terminal and C-terminal modifications + if sequence[:2] == "((": + sequence = sequence.replace("((", ".[", 1).replace(")-", ".", 1) + if sequence[-2:] == "))": + sequence = sequence[::-1].replace("))", ".]", 1).replace("-(", ".", 1)[::-1] + + return sequence + + def write_psm(self, psm: PSM): + """ + Write a single PSM to the PSM file. + + This method is currently not supported (see Notes). + + Raises + ------ + NotImplementedError + IdXMLWriter currently does not support write_psm. + + """ + raise NotImplementedError("IdXMLWriter currently does not support write_psm.") + + def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None: + """ + Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary. + """ + if d is not None: + for key, value in d.items(): + peptide_hit.setMetaValue(key, value) + + def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = None, peptide_ids: oms.PeptideIdentification = None) -> None: + """ + Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` + and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + """ + psm_dict = psm_list.get_psm_dict() + + if protein_ids is not None and peptide_ids is not None: + # Access run name(s) from ProteinIdentification + spectrum_files = [Path(run.decode()).stem for run in protein_ids[0].getMetaValue("spectra_data")] + for peptide_id in peptide_ids: + if len(spectrum_files) > 1: + run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] + else: + run = spectrum_files[0] + # Get PSM objects associated from runs since we are writing a merged idXML + # NOTE: Collections with multiple protein_ids and peptide_ids is not supported + try: + psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] + except KeyError: + print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.") + # Dict of UNIMOD peptide sequence and PSM object + hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} + # Update PeptideHits according to the PSM objects + updated_peptide_hits = [] + for peptide_hit in peptide_id.getHits(): + sequence = peptide_hit.getSequence().toString() + psm = hit_dict[sequence] + self._update_peptide_hit(peptide_hit, psm) + updated_peptide_hits.append(peptide_hit) + + peptide_id.setHits(updated_peptide_hits) + + oms.IdXMLFile().store(str(self.filename), protein_ids, peptide_ids) + + else: + for collection, runs in psm_dict.items(): + protein_ids = oms.ProteinIdentification() + # Set msrun filename with spectra_data meta value + msrun_reference = [run for run in runs.keys()] + protein_ids.setMetaValue("spectra_data", msrun_reference) + protein_list = [] + peptide_ids = [] + for run, psm_dict_run in runs.items(): + for spectrum_id, psms in psm_dict_run.items(): + protein_list.append([accession for psm in psms for accession in psm.protein_list]) + + # Fill PeptideIdentification object with PeptideHits + peptide_id = oms.PeptideIdentification() + peptide_id.setMetaValue("spectrum_reference", spectrum_id) + peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run)) + if psms[0].precursor_mz is not None: + peptide_id.setMZ(psms[0].precursor_mz) + if psms[0].retention_time is not None: + peptide_id.setRT(psms[0].retention_time) + + # Fill PeptideHits object + peptide_hits = [] + for psm in psms: + peptide_hit = oms.PeptideHit() + peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence))) + peptide_hit.setCharge(psm.peptidoform.precursor_charge) + peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target")) + if psm.score is not None: + peptide_hit.setScore(psm.score) + if psm.rank is not None: + peptide_hit.setRank(psm.rank -1) # 1-based to 0-based + self._add_meta_values_from_dict(peptide_hit, psm.metadata) + self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) + self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) + + peptide_hits.append(peptide_hit) + + peptide_id.setHits(peptide_hits) + peptide_ids.append(peptide_id) + + # Get unique protein accessions + protein_list = list(set([accession for proteins in protein_list for accession in proteins])) + protein_hits = [] + for accession in protein_list: + protein_hit = oms.ProteinHit() + protein_hit.setAccession(accession) + protein_hits.append(protein_hit) + protein_ids.setHits(protein_hits) + + # Write an idXML file for each collection + oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [protein_ids], peptide_ids) \ No newline at end of file From 7d07f90a925bba35ad9cff9c2a7bd1d7d9c451b4 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Fri, 13 Oct 2023 11:29:58 +0000 Subject: [PATCH 12/20] assert psm objects rather than asserting each param of the object --- tests/test_io/test_idxml.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 2d32bca..5b3b5c6 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -53,24 +53,7 @@ def test__parse_psm(self): reader = IdXMLReader("./tests/test_data/test.idXML") psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0]) - assert psm.peptidoform == test_psm.peptidoform - assert psm.spectrum_id == test_psm.spectrum_id - assert psm.run == test_psm.run - assert psm.collection == test_psm.collection - assert psm.spectrum == test_psm.spectrum - assert psm.is_decoy == test_psm.is_decoy - assert psm.score == test_psm.score - assert psm.qvalue == test_psm.qvalue - assert psm.pep == test_psm.pep - assert psm.precursor_mz == test_psm.precursor_mz - assert psm.retention_time == test_psm.retention_time - assert psm.ion_mobility == test_psm.ion_mobility - assert psm.protein_list == test_psm.protein_list - assert psm.rank == test_psm.rank - assert psm.source == test_psm.source - assert psm.provenance_data == test_psm.provenance_data - assert psm.metadata == test_psm.metadata - assert psm.rescoring_features == test_psm.rescoring_features + assert psm == test_psm def test__get_run(): From d19ed506ac3ea484a8f0523ba88f335abe2eac5c Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 15 Oct 2023 13:45:21 +0000 Subject: [PATCH 13/20] Add tests for idxml writer --- psm_utils/io/idxml.py | 54 ++++--- tests/test_data/{test.idXML => test_in.idXML} | 0 tests/test_data/test_out.idXML | 153 ++++++++++++++++++ tests/test_data/test_out_sage.idXML | 42 +++++ tests/test_io/test_idxml.py | 43 +++-- 5 files changed, 257 insertions(+), 35 deletions(-) rename tests/test_data/{test.idXML => test_in.idXML} (100%) create mode 100644 tests/test_data/test_out.idXML create mode 100644 tests/test_data/test_out_sage.idXML diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index b99f828..7d3f776 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -5,7 +5,7 @@ Notes ----- -* idXML supports multiple peptide hits (identifications) per spectrum. Each peptide hit +* idXML supports mulwtiple peptide hits (identifications) per spectrum. Each peptide hit is parsed as an individual :py:class:`~psm_utils.psm.PSM` object. """ @@ -97,6 +97,7 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: self.peptide_ids = peptide_ids self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) + def __iter__(self) -> Iterable[PSM]: """ Iterate over file and return PSMs one-by-one. @@ -105,6 +106,7 @@ def __iter__(self) -> Iterable[PSM]: for peptide_hit in peptide_id.getHits(): yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) + def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): """ Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty. @@ -121,8 +123,8 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica else: return protein_ids, peptide_ids - - def _parse_peptidoform(self, sequence: str, charge: int) -> str: + @staticmethod + def _parse_peptidoform(sequence:str, charge:int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. @@ -175,7 +177,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep if self._is_float(peptide_hit.getMetaValue(key))} ) - def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: + @staticmethod + def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: """ Get run name from idXML using pyopenms If idXML contains merge index, use it to annotate the run name without extension @@ -185,7 +188,8 @@ def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pepti else: return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem - def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: + @staticmethod + def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: """ Get list of rescoring features from idXML """ @@ -197,7 +201,8 @@ def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: return keys - def _is_float(self, element: any) -> bool: + @staticmethod + def _is_float(element: any) -> bool: """ Check if element is float. """ @@ -208,8 +213,9 @@ def _is_float(self, element: any) -> bool: return True except ValueError: return False - - def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool: + + @staticmethod + def _is_decoy(peptide_hit: oms.PeptideHit) -> bool: """ Check if PSM is target or decoy """ @@ -231,7 +237,7 @@ def __init__( peptide_ids = None, *args, **kwargs, - ): + ) -> None: """ Writer for idXML files. @@ -247,11 +253,7 @@ def __init__( ----- - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList - to a file per collection with the :py:meth:`write_file` method is currently supported. - - If :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` - objects are provided, the :py:meth:`write_file` method will update the objects with novel - features from rescoring_features dictionary of the :py:class:`~psm_utils.psm.PSM` objects - in the PSMList and write them to a idXML file. + to a file at once with the :py:meth:`write_file` method is currently supported. TODO: Adjust """ super().__init__(filename, *args, **kwargs) self.protein_ids = protein_ids @@ -299,7 +301,7 @@ def write_psm(self, psm: PSM): Raises ------ NotImplementedError - IdXMLWriter currently does not support write_psm. + OmsIdXMLWriter currently does not support write_psm. """ raise NotImplementedError("IdXMLWriter currently does not support write_psm.") @@ -312,17 +314,17 @@ def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> No for key, value in d.items(): peptide_hit.setMetaValue(key, value) - def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = None, peptide_ids: oms.PeptideIdentification = None) -> None: + def write_file(self, psm_list: PSMList) -> None: """ Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. """ psm_dict = psm_list.get_psm_dict() - if protein_ids is not None and peptide_ids is not None: + if self.protein_ids is not None and self.peptide_ids is not None: # Access run name(s) from ProteinIdentification - spectrum_files = [Path(run.decode()).stem for run in protein_ids[0].getMetaValue("spectra_data")] - for peptide_id in peptide_ids: + spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")] + for peptide_id in self.peptide_ids: if len(spectrum_files) > 1: run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] else: @@ -345,16 +347,16 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = peptide_id.setHits(updated_peptide_hits) - oms.IdXMLFile().store(str(self.filename), protein_ids, peptide_ids) + oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) else: for collection, runs in psm_dict.items(): - protein_ids = oms.ProteinIdentification() + self.protein_ids = oms.ProteinIdentification() + self.peptide_ids = [] # Set msrun filename with spectra_data meta value msrun_reference = [run for run in runs.keys()] - protein_ids.setMetaValue("spectra_data", msrun_reference) + self.protein_ids.setMetaValue("spectra_data", msrun_reference) protein_list = [] - peptide_ids = [] for run, psm_dict_run in runs.items(): for spectrum_id, psms in psm_dict_run.items(): protein_list.append([accession for psm in psms for accession in psm.protein_list]) @@ -386,7 +388,7 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = peptide_hits.append(peptide_hit) peptide_id.setHits(peptide_hits) - peptide_ids.append(peptide_id) + self.peptide_ids.append(peptide_id) # Get unique protein accessions protein_list = list(set([accession for proteins in protein_list for accession in proteins])) @@ -395,7 +397,7 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = protein_hit = oms.ProteinHit() protein_hit.setAccession(accession) protein_hits.append(protein_hit) - protein_ids.setHits(protein_hits) + self.protein_ids.setHits(protein_hits) # Write an idXML file for each collection - oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [protein_ids], peptide_ids) \ No newline at end of file + oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids) \ No newline at end of file diff --git a/tests/test_data/test.idXML b/tests/test_data/test_in.idXML similarity index 100% rename from tests/test_data/test.idXML rename to tests/test_data/test_in.idXML diff --git a/tests/test_data/test_out.idXML b/tests/test_data/test_out.idXML new file mode 100644 index 0000000..148a2df --- /dev/null +++ b/tests/test_data/test_out.idXML @@ -0,0 +1,153 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_data/test_out_sage.idXML b/tests/test_data/test_out_sage.idXML new file mode 100644 index 0000000..2d307cb --- /dev/null +++ b/tests/test_data/test_out_sage.idXML @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 5b3b5c6..f89c3fb 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -1,7 +1,11 @@ """Tests for psm_utils.io.idxml.""" -from psm_utils.io.idxml import IdXMLReader +import hashlib + +from psm_utils.io.idxml import IdXMLReader, IdXMLWriter +from psm_utils.io.sage import SageReader from psm_utils.psm import PSM +from psm_utils.peptidoform import Peptidoform class TestIdXMLReader: @@ -16,7 +20,7 @@ def test__parse_peptidoform(self): ((".DFPIAM[147]GER.", 2), "DFPIAM[147]GER/2"), ((".(Dimethyl)DFPIAMGER.", 2), "[Dimethyl]-DFPIAMGER/2"), ((".DFPIAMGER.(Label:18O(2))", 2), "DFPIAMGER-[Label:18O(2)]/2"), - ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2"), + ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2") ] for test_in, expected_out in test_cases: @@ -51,18 +55,39 @@ def test__parse_psm(self): 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25} ) - reader = IdXMLReader("./tests/test_data/test.idXML") - psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0]) + reader = IdXMLReader("./tests/test_data/test_in.idXML") + psm = reader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]) assert psm == test_psm - def test__get_run(): - reader = IdXMLReader("./tests/test_data/test.idXML") + def test__get_run(self): expected_output = "HepG2_rep3_small" + reader = IdXMLReader("./tests/test_data/test_in.idXML") assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output - - def _get_rescoring_features(): - reader = IdXMLReader("./tests/test_data/test.idXML") + + def test__get_rescoring_features(self): expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac'] + reader = IdXMLReader("./tests/test_data/test_in.idXML") assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output + + +class TestIdXMLWriter: + + def test_write_file_with_pyopenms_objects(self): + expected_sha = "8d8cb6d8194c5c296f0f5ee8be83d2072be125547b2d51b88100859b001f47fa" + reader = IdXMLReader("./tests/test_data/test_in.idXML") + psm_list = reader.read_file() + writer = IdXMLWriter("./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids) + writer.write_file(psm_list) + sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", 'rb').read()).hexdigest() + assert sha == expected_sha + + def test_write_file_without_pyopenms_objects(self): + expected_sha = "a24e157579a4e9da71900931c6b5578a264ea7830135ede480b41e6550944e5e" + reader = SageReader("./tests/test_data/results.sage.tsv") + psm_list = reader.read_file() + writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") + writer.write_file(psm_list) + sha = hashlib.sha256(open("./tests/test_data/test_out_sage.idXML", 'rb').read()).hexdigest() + assert sha == expected_sha From a2bef5dccf10962a259a3fb9449ea5ca736bc44f Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 15 Oct 2023 14:13:24 +0000 Subject: [PATCH 14/20] logical structur of functions --- psm_utils/io/idxml.py | 243 ++++++++++++++++++++++++------------------ 1 file changed, 138 insertions(+), 105 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 7d3f776..641645a 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -5,7 +5,7 @@ Notes ----- -* idXML supports mulwtiple peptide hits (identifications) per spectrum. Each peptide hit +* idXML supports multiple peptide hits (identifications) per spectrum. Each peptide hit is parsed as an individual :py:class:`~psm_utils.psm.PSM` object. """ @@ -13,7 +13,7 @@ from __future__ import annotations -from abc import abstractmethod +import logging from typing import Union, Iterable, Tuple, List import re from pathlib import Path @@ -25,6 +25,8 @@ from psm_utils.psm_list import PSMList from psm_utils.exceptions import PSMUtilsException +logger = logging.getLogger(__name__) + # Patterns to match open and closed round/square brackets MOD_PATTERN = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)") MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]") @@ -128,6 +130,7 @@ def _parse_peptidoform(sequence:str, charge:int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. + Notes ----- Implemented according to the documentation on @@ -177,6 +180,7 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep if self._is_float(peptide_hit.getMetaValue(key))} ) + @staticmethod def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: """ @@ -188,6 +192,7 @@ def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIden else: return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem + @staticmethod def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: """ @@ -201,6 +206,7 @@ def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: return keys + @staticmethod def _is_float(element: any) -> bool: """ @@ -213,7 +219,8 @@ def _is_float(element: any) -> bool: return True except ValueError: return False - + + @staticmethod def _is_decoy(peptide_hit: oms.PeptideHit) -> bool: """ @@ -253,19 +260,87 @@ def __init__( ----- - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList - to a file at once with the :py:meth:`write_file` method is currently supported. TODO: Adjust + to a file at once with the :py:meth:`write_file` method is currently supported. """ super().__init__(filename, *args, **kwargs) self.protein_ids = protein_ids self.peptide_ids = peptide_ids self._writer = None + def __enter__(self) -> IdXMLWriter: return self + def __exit__(self, *args, **kwargs) -> None: pass + + def write_psm(self, psm: PSM): + """ + Write a single PSM to the PSM file. + + This method is currently not supported (see Notes). + + Raises + ------ + NotImplementedError + OmsIdXMLWriter currently does not support write_psm. + + """ + raise NotImplementedError("IdXMLWriter currently does not support write_psm.") + + + def write_file(self, psm_list: PSMList) -> None: + """ + Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` + and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + """ + psm_dict = psm_list.get_psm_dict() + + if self.protein_ids is not None and self.peptide_ids is not None: + self._update_existing_ids(psm_dict) + # Check if one of self.protein_ids or self.peptide_ids is None + elif self.protein_ids is not None or self.peptide_ids is not None: + logger.warning("One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList.") + self._create_new_ids(psm_dict) + else: + self._create_new_ids(psm_dict) + + + def _update_existing_ids(self, psm_dict: dict) -> None: + """ + Update existing :py:class:`~pyopenms.ProteinIdentification` + and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + """ + # Access run name(s) from ProteinIdentification + spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")] + for peptide_id in self.peptide_ids: + if len(spectrum_files) > 1: + run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] + else: + run = spectrum_files[0] + # Get PSM objects associated from runs since we are writing a merged idXML + # NOTE: Collections with multiple protein_ids and peptide_ids is not supported + try: + psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] + except KeyError: + print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.") + # Dict of UNIMOD peptide sequence and PSM object + hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} + # Update PeptideHits according to the PSM objects + updated_peptide_hits = [] + for peptide_hit in peptide_id.getHits(): + sequence = peptide_hit.getSequence().toString() + psm = hit_dict[sequence] + self._update_peptide_hit(peptide_hit, psm) + updated_peptide_hits.append(peptide_hit) + + peptide_id.setHits(updated_peptide_hits) + + oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) + + def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: """ Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information @@ -277,6 +352,65 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: if feature not in RESCORING_FEATURE_LIST: peptide_hit.setMetaValue(feature, value) + + def _create_new_ids(self, psm_dict: dict) -> None: + """ + Create new :py:class:`~pyopenms.ProteinIdentification` + and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + """ + for collection, runs in psm_dict.items(): + self.protein_ids = oms.ProteinIdentification() + self.peptide_ids = [] + # Set msrun filename with spectra_data meta value + msrun_reference = [run for run in runs.keys()] + self.protein_ids.setMetaValue("spectra_data", msrun_reference) + protein_list = [] + for run, psm_dict_run in runs.items(): + for spectrum_id, psms in psm_dict_run.items(): + protein_list.append([accession for psm in psms for accession in psm.protein_list]) + + # Fill PeptideIdentification object with PeptideHits + peptide_id = oms.PeptideIdentification() + peptide_id.setMetaValue("spectrum_reference", spectrum_id) + peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run)) + if psms[0].precursor_mz is not None: + peptide_id.setMZ(psms[0].precursor_mz) + if psms[0].retention_time is not None: + peptide_id.setRT(psms[0].retention_time) + + # Fill PeptideHits object + peptide_hits = [] + for psm in psms: + peptide_hit = oms.PeptideHit() + peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence))) + peptide_hit.setCharge(psm.peptidoform.precursor_charge) + peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target")) + if psm.score is not None: + peptide_hit.setScore(psm.score) + if psm.rank is not None: + peptide_hit.setRank(psm.rank -1) # 1-based to 0-based + self._add_meta_values_from_dict(peptide_hit, psm.metadata) + self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) + self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) + + peptide_hits.append(peptide_hit) + + peptide_id.setHits(peptide_hits) + self.peptide_ids.append(peptide_id) + + # Get unique protein accessions + protein_list = list(set([accession for proteins in protein_list for accession in proteins])) + protein_hits = [] + for accession in protein_list: + protein_hit = oms.ProteinHit() + protein_hit.setAccession(accession) + protein_hits.append(protein_hit) + self.protein_ids.setHits(protein_hits) + + # Write an idXML file for each collection + oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids) + + def _convert_proforma_to_unimod(self, sequence: str) -> str: """ Convert a peptidoform sequence in proforma notation to UNIMOD notation. @@ -292,20 +426,7 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str: return sequence - def write_psm(self, psm: PSM): - """ - Write a single PSM to the PSM file. - - This method is currently not supported (see Notes). - Raises - ------ - NotImplementedError - OmsIdXMLWriter currently does not support write_psm. - - """ - raise NotImplementedError("IdXMLWriter currently does not support write_psm.") - def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None: """ Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary. @@ -313,91 +434,3 @@ def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> No if d is not None: for key, value in d.items(): peptide_hit.setMetaValue(key, value) - - def write_file(self, psm_list: PSMList) -> None: - """ - Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` - and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. - """ - psm_dict = psm_list.get_psm_dict() - - if self.protein_ids is not None and self.peptide_ids is not None: - # Access run name(s) from ProteinIdentification - spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")] - for peptide_id in self.peptide_ids: - if len(spectrum_files) > 1: - run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] - else: - run = spectrum_files[0] - # Get PSM objects associated from runs since we are writing a merged idXML - # NOTE: Collections with multiple protein_ids and peptide_ids is not supported - try: - psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] - except KeyError: - print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.") - # Dict of UNIMOD peptide sequence and PSM object - hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} - # Update PeptideHits according to the PSM objects - updated_peptide_hits = [] - for peptide_hit in peptide_id.getHits(): - sequence = peptide_hit.getSequence().toString() - psm = hit_dict[sequence] - self._update_peptide_hit(peptide_hit, psm) - updated_peptide_hits.append(peptide_hit) - - peptide_id.setHits(updated_peptide_hits) - - oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) - - else: - for collection, runs in psm_dict.items(): - self.protein_ids = oms.ProteinIdentification() - self.peptide_ids = [] - # Set msrun filename with spectra_data meta value - msrun_reference = [run for run in runs.keys()] - self.protein_ids.setMetaValue("spectra_data", msrun_reference) - protein_list = [] - for run, psm_dict_run in runs.items(): - for spectrum_id, psms in psm_dict_run.items(): - protein_list.append([accession for psm in psms for accession in psm.protein_list]) - - # Fill PeptideIdentification object with PeptideHits - peptide_id = oms.PeptideIdentification() - peptide_id.setMetaValue("spectrum_reference", spectrum_id) - peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run)) - if psms[0].precursor_mz is not None: - peptide_id.setMZ(psms[0].precursor_mz) - if psms[0].retention_time is not None: - peptide_id.setRT(psms[0].retention_time) - - # Fill PeptideHits object - peptide_hits = [] - for psm in psms: - peptide_hit = oms.PeptideHit() - peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence))) - peptide_hit.setCharge(psm.peptidoform.precursor_charge) - peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target")) - if psm.score is not None: - peptide_hit.setScore(psm.score) - if psm.rank is not None: - peptide_hit.setRank(psm.rank -1) # 1-based to 0-based - self._add_meta_values_from_dict(peptide_hit, psm.metadata) - self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) - self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) - - peptide_hits.append(peptide_hit) - - peptide_id.setHits(peptide_hits) - self.peptide_ids.append(peptide_id) - - # Get unique protein accessions - protein_list = list(set([accession for proteins in protein_list for accession in proteins])) - protein_hits = [] - for accession in protein_list: - protein_hit = oms.ProteinHit() - protein_hit.setAccession(accession) - protein_hits.append(protein_hit) - self.protein_ids.setHits(protein_hits) - - # Write an idXML file for each collection - oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids) \ No newline at end of file From 83d23701d97d0b4c44be06b1e5f589585136616d Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 15 Oct 2023 14:22:44 +0000 Subject: [PATCH 15/20] black --- docs/source/conf.py | 3 +- online/pages/2_PSM_file_conversion.py | 7 +- psm_utils/io/_pd_msf_tables.py | 1 + psm_utils/io/idxml.py | 146 ++++++++++++++++---------- psm_utils/io/msamanda.py | 2 +- tests/test_io/test_idxml.py | 69 +++++++----- tests/test_io/test_maxquant.py | 5 +- tests/test_io/test_msamanda.py | 4 +- tests/test_io/test_mzid.py | 10 +- tests/test_io/test_percolator.py | 5 +- tests/test_io/test_xtandem.py | 1 - tests/test_peptidoform.py | 2 - tests/test_psm_list.py | 32 +++++- 13 files changed, 174 insertions(+), 113 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 36b1602..29762dd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,9 +46,10 @@ "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), "pyteomics": ("https://pyteomics.readthedocs.io/en/latest/", None), - "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None) + "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None), } + def setup(app): config = { # "auto_toc_tree_section": "Contents", diff --git a/online/pages/2_PSM_file_conversion.py b/online/pages/2_PSM_file_conversion.py index 2873782..2dddcef 100644 --- a/online/pages/2_PSM_file_conversion.py +++ b/online/pages/2_PSM_file_conversion.py @@ -68,9 +68,10 @@ def _convert(self): st.success("PSM file successfully converted!") # Construct output filename with new extension - output_filename = Path( - st.session_state["convert_input_file"].name - ).stem + FILETYPES[st.session_state["convert_output_filetype"]]["extension"] + output_filename = ( + Path(st.session_state["convert_input_file"].name).stem + + FILETYPES[st.session_state["convert_output_filetype"]]["extension"] + ) # Open converted file in memory for download button with open("output_filename", "rb") as file: diff --git a/psm_utils/io/_pd_msf_tables.py b/psm_utils/io/_pd_msf_tables.py index 7601e38..a2323bd 100644 --- a/psm_utils/io/_pd_msf_tables.py +++ b/psm_utils/io/_pd_msf_tables.py @@ -17,6 +17,7 @@ UniqueConstraint, text, ) + try: from sqlalchemy.orm import declarative_base except ImportError: diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 641645a..818971b 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -35,8 +35,7 @@ # Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator # https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp RESCORING_FEATURE_LIST = [ - "isotope_error" - "MS:1002049", # MSGFPlus unchanged RawScore + "isotope_error" "MS:1002049", # MSGFPlus unchanged RawScore "MS:1002050", # MSGFPlus unchanged DeNovoScore "MSGF:ScoreRatio", "MSGF:Energy", @@ -65,17 +64,17 @@ "MASCOT:delta_score", # delta score based on mScore "CONCAT:lnEvalue", "CONCAT:deltaLnEvalue", - "SAGE:ln(-poisson)" - "SAGE:ln(delta_best)", + "SAGE:ln(-poisson)" "SAGE:ln(delta_best)", "SAGE:ln(delta_next)", "SAGE:ln(matched_intensity_pct)", "SAGE:longest_b", "SAGE:longest_y", "SAGE:longest_y_pct", "SAGE:matched_peaks", - "SAGE:scored_candidates" + "SAGE:scored_candidates", ] + class IdXMLReader(ReaderBase): def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: """ @@ -99,7 +98,6 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: self.peptide_ids = peptide_ids self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) - def __iter__(self) -> Iterable[PSM]: """ Iterate over file and return PSMs one-by-one. @@ -108,7 +106,6 @@ def __iter__(self) -> Iterable[PSM]: for peptide_hit in peptide_id.getHits(): yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) - def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): """ Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty. @@ -117,16 +114,22 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) if len(protein_ids) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins. Nothing to parse.") + raise IdXMLReaderEmptyListException( + f"File {self.filename} contains no proteins. Nothing to parse." + ) elif len(peptide_ids) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideIdentifications. Nothing to parse.") + raise IdXMLReaderEmptyListException( + f"File {self.filename} contains no PeptideIdentifications. Nothing to parse." + ) elif len(peptide_ids[0].getHits()) == 0: - raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideHits. Nothing to parse.") + raise IdXMLReaderEmptyListException( + f"File {self.filename} contains no PeptideHits. Nothing to parse." + ) else: return protein_ids, peptide_ids @staticmethod - def _parse_peptidoform(sequence:str, charge:int) -> str: + def _parse_peptidoform(sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. @@ -148,51 +151,66 @@ def _parse_peptidoform(sequence:str, charge:int) -> str: return sequence - - def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM: + def _parse_psm( + self, + protein_ids: oms.ProteinIdentification, + peptide_id: oms.PeptideIdentification, + peptide_hit: oms.PeptideHit, + ) -> PSM: """ Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. """ - peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()) + peptidoform = self._parse_peptidoform( + peptide_hit.getSequence().toString(), peptide_hit.getCharge() + ) return PSM( - peptidoform = peptidoform, - spectrum_id = peptide_id.getMetaValue("spectrum_reference"), - run = self._get_run(protein_ids, peptide_id), - is_decoy = self._is_decoy(peptide_hit), - score = peptide_hit.getScore(), - precursor_mz = peptide_id.getMZ(), - retention_time = peptide_id.getRT(), + peptidoform=peptidoform, + spectrum_id=peptide_id.getMetaValue("spectrum_reference"), + run=self._get_run(protein_ids, peptide_id), + is_decoy=self._is_decoy(peptide_hit), + score=peptide_hit.getScore(), + precursor_mz=peptide_id.getMZ(), + retention_time=peptide_id.getRT(), # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file - protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()], - rank = peptide_hit.getRank() + 1, # 0-based to 1-based + protein_list=[ + accession.decode() for accession in peptide_hit.extractProteinAccessionsSet() + ], + rank=peptide_hit.getRank() + 1, # 0-based to 1-based source="idXML", # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer - provenance_data = {str(peptidoform): peptide_hit.getSequence().toString()}, + provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, # This is needed to calculate a qvalue before rescoring the PSMList - metadata = { + metadata={ "idxml:score_type": str(peptide_id.getScoreType()), "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), - "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()) + "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()), + }, + rescoring_features={ + key: float(peptide_hit.getMetaValue(key)) + for key in self.rescoring_features + if self._is_float(peptide_hit.getMetaValue(key)) }, - rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features \ - if self._is_float(peptide_hit.getMetaValue(key))} ) - @staticmethod - def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str: + def _get_run( + protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification + ) -> str: """ Get run name from idXML using pyopenms If idXML contains merge index, use it to annotate the run name without extension """ if peptide_id.metaValueExists("id_merge_index"): - return Path(protein_ids[0].getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")].decode()).stem + return Path( + protein_ids[0] + .getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")] + .decode() + ).stem else: return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem - @staticmethod def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: """ @@ -202,17 +220,16 @@ def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: keys = [] peptide_hit.getKeys(keys) # Convert from byte to str and remove the keys, which values are not in the fixed list of features - keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST] + keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST] return keys - @staticmethod def _is_float(element: any) -> bool: """ Check if element is float. """ - if element is None: + if element is None: return False try: float(element) @@ -220,7 +237,6 @@ def _is_float(element: any) -> bool: except ValueError: return False - @staticmethod def _is_decoy(peptide_hit: oms.PeptideHit) -> bool: """ @@ -240,8 +256,8 @@ class IdXMLWriter(WriterBase): def __init__( self, filename: str | Path, - protein_ids = None, - peptide_ids = None, + protein_ids=None, + peptide_ids=None, *args, **kwargs, ) -> None: @@ -267,15 +283,12 @@ def __init__( self.peptide_ids = peptide_ids self._writer = None - def __enter__(self) -> IdXMLWriter: return self - def __exit__(self, *args, **kwargs) -> None: pass - def write_psm(self, psm: PSM): """ Write a single PSM to the PSM file. @@ -290,7 +303,6 @@ def write_psm(self, psm: PSM): """ raise NotImplementedError("IdXMLWriter currently does not support write_psm.") - def write_file(self, psm_list: PSMList) -> None: """ Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` @@ -302,19 +314,22 @@ def write_file(self, psm_list: PSMList) -> None: self._update_existing_ids(psm_dict) # Check if one of self.protein_ids or self.peptide_ids is None elif self.protein_ids is not None or self.peptide_ids is not None: - logger.warning("One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList.") + logger.warning( + "One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList." + ) self._create_new_ids(psm_dict) else: self._create_new_ids(psm_dict) - def _update_existing_ids(self, psm_dict: dict) -> None: """ Update existing :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. """ # Access run name(s) from ProteinIdentification - spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")] + spectrum_files = [ + Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data") + ] for peptide_id in self.peptide_ids: if len(spectrum_files) > 1: run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] @@ -325,7 +340,9 @@ def _update_existing_ids(self, psm_dict: dict) -> None: try: psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] except KeyError: - print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.") + print( + "Multiple collections are not supported when parsing single pyopenms protein and peptide objects." + ) # Dict of UNIMOD peptide sequence and PSM object hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} # Update PeptideHits according to the PSM objects @@ -340,19 +357,17 @@ def _update_existing_ids(self, psm_dict: dict) -> None: oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) - def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: """ Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information from :py:class:`~psm_utils.psm.PSM`. """ peptide_hit.setScore(psm.score) - peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based + peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based for feature, value in psm.rescoring_features.items(): if feature not in RESCORING_FEATURE_LIST: peptide_hit.setMetaValue(feature, value) - def _create_new_ids(self, psm_dict: dict) -> None: """ Create new :py:class:`~pyopenms.ProteinIdentification` @@ -366,8 +381,10 @@ def _create_new_ids(self, psm_dict: dict) -> None: self.protein_ids.setMetaValue("spectra_data", msrun_reference) protein_list = [] for run, psm_dict_run in runs.items(): - for spectrum_id, psms in psm_dict_run.items(): - protein_list.append([accession for psm in psms for accession in psm.protein_list]) + for spectrum_id, psms in psm_dict_run.items(): + protein_list.append( + [accession for psm in psms for accession in psm.protein_list] + ) # Fill PeptideIdentification object with PeptideHits peptide_id = oms.PeptideIdentification() @@ -382,13 +399,22 @@ def _create_new_ids(self, psm_dict: dict) -> None: peptide_hits = [] for psm in psms: peptide_hit = oms.PeptideHit() - peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence))) + peptide_hit.setSequence( + oms.AASequence.fromString( + self._convert_proforma_to_unimod(psm.peptidoform.sequence) + ) + ) peptide_hit.setCharge(psm.peptidoform.precursor_charge) - peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target")) + peptide_hit.setMetaValue( + "target_decoy", + "" + if psm.is_decoy is None + else ("decoy" if psm.is_decoy else "target"), + ) if psm.score is not None: peptide_hit.setScore(psm.score) if psm.rank is not None: - peptide_hit.setRank(psm.rank -1) # 1-based to 0-based + peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based self._add_meta_values_from_dict(peptide_hit, psm.metadata) self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) @@ -399,7 +425,9 @@ def _create_new_ids(self, psm_dict: dict) -> None: self.peptide_ids.append(peptide_id) # Get unique protein accessions - protein_list = list(set([accession for proteins in protein_list for accession in proteins])) + protein_list = list( + set([accession for proteins in protein_list for accession in proteins]) + ) protein_hits = [] for accession in protein_list: protein_hit = oms.ProteinHit() @@ -408,8 +436,11 @@ def _create_new_ids(self, psm_dict: dict) -> None: self.protein_ids.setHits(protein_hits) # Write an idXML file for each collection - oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids) - + oms.IdXMLFile().store( + "/".join(filter(None, [collection, str(self.filename)])), + [self.protein_ids], + self.peptide_ids, + ) def _convert_proforma_to_unimod(self, sequence: str) -> str: """ @@ -426,7 +457,6 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str: return sequence - def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None: """ Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary. diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index 3040a74..bbbeab0 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -129,7 +129,7 @@ def _parse_peptidoform(seq, modifications, charge): peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' elif match.group("term") == "C-Term": peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' - if match.group("loc") is not None: + if match.group("loc") is not None: peptide[int(match.group("loc"))] = ( peptide[int(match.group("loc"))] + f'[{match.group("mod_name")}]' ) diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index f89c3fb..3ef4d7a 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -7,8 +7,8 @@ from psm_utils.psm import PSM from psm_utils.peptidoform import Peptidoform -class TestIdXMLReader: +class TestIdXMLReader: def test__parse_peptidoform(self): test_cases = [ (("DFPIAMGER", 2), "DFPIAMGER/2"), @@ -20,18 +20,17 @@ def test__parse_peptidoform(self): ((".DFPIAM[147]GER.", 2), "DFPIAM[147]GER/2"), ((".(Dimethyl)DFPIAMGER.", 2), "[Dimethyl]-DFPIAMGER/2"), ((".DFPIAMGER.(Label:18O(2))", 2), "DFPIAMGER-[Label:18O(2)]/2"), - ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2") + ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2"), ] for test_in, expected_out in test_cases: assert IdXMLReader._parse_peptidoform(*test_in) == expected_out - def test__parse_psm(self): test_psm = PSM( - peptidoform=Peptidoform('LVPIWKK/2'), - spectrum_id='controllerType=0 controllerNumber=1 scan=294', - run='HepG2_rep3_small', + peptidoform=Peptidoform("LVPIWKK/2"), + spectrum_id="controllerType=0 controllerNumber=1 scan=294", + run="HepG2_rep3_small", collection=None, spectrum=None, is_decoy=True, @@ -41,46 +40,66 @@ def test__parse_psm(self): precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, - protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], + protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"], rank=1, - source='idXML', - provenance_data={'LVPIWKK/2': 'LVPIWKK'}, + source="idXML", + provenance_data={"LVPIWKK/2": "LVPIWKK"}, metadata={ - 'idxml:score_type': 'expect', - 'idxml:higher_score_better': 'False', - 'idxml:significance_threshold': '0.0' + "idxml:score_type": "expect", + "idxml:higher_score_better": "False", + "idxml:significance_threshold": "0.0", + }, + rescoring_features={ + "MS:1002252": 0.693, + "COMET:deltaCn": 1.0, + "MS:1002255": 35.9, + "COMET:deltaLCn": 0.0, + "COMET:lnExpect": 0.009950330853168092, + "COMET:lnNumSP": 3.555348061489414, + "COMET:lnRankSP": 0.0, + "COMET:IonFrac": 0.25, }, - rescoring_features={'MS:1002252': 0.693, - 'COMET:deltaCn': 1.0, - 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25} ) reader = IdXMLReader("./tests/test_data/test_in.idXML") - psm = reader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]) + psm = reader._parse_psm( + reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0] + ) assert psm == test_psm - def test__get_run(self): expected_output = "HepG2_rep3_small" reader = IdXMLReader("./tests/test_data/test_in.idXML") assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output - def test__get_rescoring_features(self): - expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac'] + expected_output = [ + "MS:1002252", + "COMET:deltaCn", + "MS:1002255", + "COMET:deltaLCn", + "COMET:lnExpect", + "COMET:lnNumSP", + "COMET:lnRankSP", + "COMET:IonFrac", + ] reader = IdXMLReader("./tests/test_data/test_in.idXML") - assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output + assert ( + IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) + == expected_output + ) class TestIdXMLWriter: - def test_write_file_with_pyopenms_objects(self): expected_sha = "8d8cb6d8194c5c296f0f5ee8be83d2072be125547b2d51b88100859b001f47fa" reader = IdXMLReader("./tests/test_data/test_in.idXML") psm_list = reader.read_file() - writer = IdXMLWriter("./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids) + writer = IdXMLWriter( + "./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids + ) writer.write_file(psm_list) - sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", 'rb').read()).hexdigest() + sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", "rb").read()).hexdigest() assert sha == expected_sha def test_write_file_without_pyopenms_objects(self): @@ -89,5 +108,7 @@ def test_write_file_without_pyopenms_objects(self): psm_list = reader.read_file() writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") writer.write_file(psm_list) - sha = hashlib.sha256(open("./tests/test_data/test_out_sage.idXML", 'rb').read()).hexdigest() + sha = hashlib.sha256( + open("./tests/test_data/test_out_sage.idXML", "rb").read() + ).hexdigest() assert sha == expected_sha diff --git a/tests/test_io/test_maxquant.py b/tests/test_io/test_maxquant.py index d0546c4..5384127 100644 --- a/tests/test_io/test_maxquant.py +++ b/tests/test_io/test_maxquant.py @@ -19,7 +19,6 @@ class TestMSMSReader: def test_evaluate_columns(self): - columns = TEST_COL.copy() # Test with the right column names maxquant.MSMSReader._evaluate_columns(columns) @@ -73,8 +72,6 @@ def test_parse_peptidoform(self): msms_reader = maxquant.MSMSReader("./tests/test_data/test_msms.txt") - for test_in, expected_out in zip( - test_cases["input"], test_cases["expected_output"] - ): + for test_in, expected_out in zip(test_cases["input"], test_cases["expected_output"]): output = msms_reader._parse_peptidoform(*test_in) assert output.proforma == expected_out diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index 82e94f1..97466ff 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -64,8 +64,6 @@ def test_parse_peptidoform(self): ], } - for test_in, expected_out in zip( - test_cases["input"], test_cases["expected_output"] - ): + for test_in, expected_out in zip(test_cases["input"], test_cases["expected_output"]): output = msamanda.MSAmandaReader._parse_peptidoform(*test_in).proforma assert output == expected_out diff --git a/tests/test_io/test_mzid.py b/tests/test_io/test_mzid.py index 6832107..3c60c99 100644 --- a/tests/test_io/test_mzid.py +++ b/tests/test_io/test_mzid.py @@ -9,7 +9,6 @@ class TestMzIdentMlReader: def test_infer_source(self): - msgf = mzid.MzidReader(MSGF_TEST_FILE) assert msgf._source == "MS-GF+" @@ -17,7 +16,6 @@ def test_infer_source(self): assert peaks._source == "PEAKS Studio" def test_get_namespace(self): - root = ET.parse(MSGF_TEST_FILE).getroot() root_tag = root.tag assert ( @@ -95,7 +93,7 @@ def test_parse_peptidoform(self): } ], 2, - ) + ), } for expected_out, test_in in test_cases.items(): @@ -103,7 +101,6 @@ def test_parse_peptidoform(self): assert test_out == expected_out def test_parse_peptide_evidence_ref(self): - test_cases = [ [ { @@ -175,10 +172,7 @@ def test_parse_peptide_evidence_ref(self): (False, ["#CONTAM#Q86YZ3", "Q86YZ3|HORN_HUMAN"]), ] for i, test_case in enumerate(test_cases): - assert ( - mzid.MzidReader._parse_peptide_evidence_ref(test_case) - == expected_output[i] - ) + assert mzid.MzidReader._parse_peptide_evidence_ref(test_case) == expected_output[i] # TODO def test_get_peptide_spectrum_match(self): diff --git a/tests/test_io/test_percolator.py b/tests/test_io/test_percolator.py index 09c3751..692f5f9 100644 --- a/tests/test_io/test_percolator.py +++ b/tests/test_io/test_percolator.py @@ -31,7 +31,4 @@ def test_parse_peptidoform(self): (("-.ACDEFGHR.-", None), "ACDEFGHR"), ] for test_in, expected_out in test_cases: - assert ( - expected_out - == PercolatorTabReader._parse_peptidoform(*test_in).proforma - ) + assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma diff --git a/tests/test_io/test_xtandem.py b/tests/test_io/test_xtandem.py index 32d9204..7592c2d 100644 --- a/tests/test_io/test_xtandem.py +++ b/tests/test_io/test_xtandem.py @@ -4,7 +4,6 @@ class TestXTandemReader: - reader = XTandemReader("path") def test__parse_peptidoform(self): diff --git a/tests/test_peptidoform.py b/tests/test_peptidoform.py index a9cb691..1e048e7 100644 --- a/tests/test_peptidoform.py +++ b/tests/test_peptidoform.py @@ -4,7 +4,6 @@ class TestPeptidoform: - def test__len__(self): test_cases = [ ("ACDEFGHIK", 9), @@ -27,7 +26,6 @@ def test__iter__(self): for mod in mods: assert isinstance(mod, proforma.TagBase) - def test_rename_modifications(self): label_mapping = {"ac": "Acetyl", "cm": "Carbamidomethyl"} diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py index 412b4d3..2be2c8f 100644 --- a/tests/test_psm_list.py +++ b/tests/test_psm_list.py @@ -71,10 +71,34 @@ def test___set_item__(self): def test_set_ranks(self): psm_list = PSMList( psm_list=[ - PSM(peptidoform=Peptidoform('ACDE/2'), spectrum_id=1, run=None, collection=None, score=3.3), - PSM(peptidoform=Peptidoform('ACDF/2'), spectrum_id=2, run=None, collection=None, score=12.0), - PSM(peptidoform=Peptidoform('ACDG/2'), spectrum_id=3, run=None, collection=None, score=12.2), - PSM(peptidoform=Peptidoform('ACDH/2'), spectrum_id=4, run=None, collection=None, score=2.5), + PSM( + peptidoform=Peptidoform("ACDE/2"), + spectrum_id=1, + run=None, + collection=None, + score=3.3, + ), + PSM( + peptidoform=Peptidoform("ACDF/2"), + spectrum_id=2, + run=None, + collection=None, + score=12.0, + ), + PSM( + peptidoform=Peptidoform("ACDG/2"), + spectrum_id=3, + run=None, + collection=None, + score=12.2, + ), + PSM( + peptidoform=Peptidoform("ACDH/2"), + spectrum_id=4, + run=None, + collection=None, + score=2.5, + ), ] ) From b26443a2c4fa0eea3b61f20c97588f67f0126886 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 20 Oct 2023 16:47:50 -0700 Subject: [PATCH 16/20] PR review; minor fixes; add to-do's --- docs/source/conf.py | 1 + psm_utils/io/__init__.py | 6 +- psm_utils/io/idxml.py | 151 +++++++++++++++++++++++---------------- 3 files changed, 92 insertions(+), 66 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 29762dd..ccaca8b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,6 +21,7 @@ "sphinx.ext.autodoc", "sphinx.ext.autosectionlabel", "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", "sphinx.ext.napoleon", "sphinx_rtd_theme", "sphinx_mdinclude", diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 2fc69cd..ef86c10 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -9,6 +9,7 @@ from rich.progress import track import psm_utils.io.idxml as idxml +import psm_utils.io.ionbot as ionbot import psm_utils.io.maxquant as maxquant import psm_utils.io.msamanda as msamanda import psm_utils.io.mzid as mzid @@ -18,7 +19,6 @@ import psm_utils.io.sage as sage import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem -import psm_utils.io.ionbot as ionbot from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -27,7 +27,7 @@ FILETYPES = { "idxml": { "reader": idxml.IdXMLReader, - "writer": None, + "writer": idxml.IdXMLWriter, "extension": ".idXML", "filename_pattern": r"^.*\.idxml$", }, @@ -120,7 +120,7 @@ def _supports_write_psm(writer: WriterBase): supports_write_psm = True else: supports_write_psm = True - Path(temp_file.name).unlink() + Path(temp_file.name).unlink() return supports_write_psm diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 818971b..fed16ea 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -14,16 +14,16 @@ from __future__ import annotations import logging -from typing import Union, Iterable, Tuple, List import re from pathlib import Path +from typing import Iterable, List, Tuple, Union import pyopenms as oms +from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.exceptions import PSMUtilsException logger = logging.getLogger(__name__) @@ -92,11 +92,8 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: >>> psm_list = [psm for psm in reader] """ super().__init__(filename, *args, **kwargs) - self.filename = Path(filename) - protein_ids, peptide_ids = self._parse_idxml() - self.protein_ids = protein_ids - self.peptide_ids = peptide_ids - self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0]) + self.protein_ids, self.peptide_ids = self._parse_idxml() + self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0]) def __iter__(self) -> Iterable[PSM]: """ @@ -108,7 +105,7 @@ def __iter__(self) -> Iterable[PSM]: def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification): """ - Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty. + Parse idXML using pyopenms and perform sanity checks to make sure the file is not empty. """ protein_ids, peptide_ids = [], [] oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) @@ -133,13 +130,11 @@ def _parse_peptidoform(sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. - Notes ----- Implemented according to the documentation on `github.com/OpenMS/OpenMS `_ - . The differentiation between square- and round bracket notation is removed - after parsing. + . The differentiation between square- and round bracket notation is removed after parsing. """ sequence = MOD_PATTERN.sub(r"[\1]", sequence) if sequence[:2] == ".[": @@ -159,8 +154,10 @@ def _parse_psm( ) -> PSM: """ Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. - Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` - to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. + + Uses additional information from :py:class:`~pyopenms.ProteinIdentification` and + :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the + :py:class:`~psm_utils.psm.PSM` object. """ peptidoform = self._parse_peptidoform( peptide_hit.getSequence().toString(), peptide_hit.getCharge() @@ -173,14 +170,16 @@ def _parse_psm( score=peptide_hit.getScore(), precursor_mz=peptide_id.getMZ(), retention_time=peptide_id.getRT(), - # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file + # NOTE: ion mobility will be supported by OpenMS in the future protein_list=[ accession.decode() for accession in peptide_hit.extractProteinAccessionsSet() ], rank=peptide_hit.getRank() + 1, # 0-based to 1-based source="idXML", - # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer + # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back + # to original sequence in writer provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, + # TODO: Can we recover any all UserParams and save them as metadata? # This is needed to calculate a qvalue before rescoring the PSMList metadata={ "idxml:score_type": str(peptide_id.getScoreType()), @@ -199,36 +198,39 @@ def _get_run( protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification ) -> str: """ - Get run name from idXML using pyopenms - If idXML contains merge index, use it to annotate the run name without extension + Get run name from idXML using pyopenms. + + If the idXML file contains a merge index, use it to annotate the run name without file + extension. """ if peptide_id.metaValueExists("id_merge_index"): - return Path( + run = Path( protein_ids[0] .getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")] .decode() ).stem else: - return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem + run = Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem + + # Convert back to None value (see writer) + if run == "None": + run = None + + return run @staticmethod def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: - """ - Get list of rescoring features from idXML - """ - # Fill the key list with all the keys from the PeptideHit in the pyopenms way + """Get list of rescoring features from idXML.""" + # Fill the key list with all the keys from the PeptideHit + # Empty list is required for the Cython wrapper to work correctly keys = [] peptide_hit.getKeys(keys) - # Convert from byte to str and remove the keys, which values are not in the fixed list of features keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST] - return keys @staticmethod def _is_float(element: any) -> bool: - """ - Check if element is float. - """ + """Check if element can be coerced to a float.""" if element is None: return False try: @@ -239,23 +241,17 @@ def _is_float(element: any) -> bool: @staticmethod def _is_decoy(peptide_hit: oms.PeptideHit) -> bool: - """ - Check if PSM is target or decoy - """ + """Check if PSM is target or decoy.""" if peptide_hit.metaValueExists("target_decoy"): return peptide_hit.getMetaValue("target_decoy") == "decoy" else: return None -class IdXMLReaderEmptyListException(PSMUtilsException): - """Exception in psm_utils.io.IdXMLReader""" - - class IdXMLWriter(WriterBase): def __init__( self, - filename: str | Path, + filename: Union[str, Path], protein_ids=None, peptide_ids=None, *args, @@ -266,17 +262,19 @@ def __init__( Parameters ---------- - filename: str, Pathlib.Path + filename Path to PSM file. - run : str, optional - Name of the MS run. Usually the spectrum file filename without - extension. + protein_ids + # TODO Add description + peptide_ids + # TODO Add description Notes ----- - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList to a file at once with the :py:meth:`write_file` method is currently supported. + """ super().__init__(filename, *args, **kwargs) self.protein_ids = protein_ids @@ -298,15 +296,18 @@ def write_psm(self, psm: PSM): Raises ------ NotImplementedError - OmsIdXMLWriter currently does not support write_psm. + IdXMLWriter currently does not support write_psm. """ raise NotImplementedError("IdXMLWriter currently does not support write_psm.") def write_file(self, psm_list: PSMList) -> None: """ - Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification` - and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + Write the PSMList to the PSM file. + + If `self.protein_ids` and `self.peptide_ids` are not None, the PSM list scores, ranks, + and rescoring features will first be merged with the existing IDs from those objects. + """ psm_dict = psm_list.get_psm_dict() @@ -315,7 +316,8 @@ def write_file(self, psm_list: PSMList) -> None: # Check if one of self.protein_ids or self.peptide_ids is None elif self.protein_ids is not None or self.peptide_ids is not None: logger.warning( - "One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList." + "One of the protein_ids or peptide_ids is None. Falling back to creating new " + "idXML files solely based on the PSMList." ) self._create_new_ids(psm_dict) else: @@ -323,8 +325,11 @@ def write_file(self, psm_list: PSMList) -> None: def _update_existing_ids(self, psm_dict: dict) -> None: """ - Update existing :py:class:`~pyopenms.ProteinIdentification` - and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + Update an existing idXML file with info from the PSM list or write a new one. + + Update existing :py:class:`~pyopenms.ProteinIdentification` and + :py:class:`~pyopenms.PeptideIdentification` objects with new features from the PSMList + or create new ones. """ # Access run name(s) from ProteinIdentification spectrum_files = [ @@ -335,14 +340,17 @@ def _update_existing_ids(self, psm_dict: dict) -> None: run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] else: run = spectrum_files[0] + # Get PSM objects associated from runs since we are writing a merged idXML # NOTE: Collections with multiple protein_ids and peptide_ids is not supported try: psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] - except KeyError: - print( - "Multiple collections are not supported when parsing single pyopenms protein and peptide objects." - ) + except KeyError as e: + raise IdXMLException( + "Multiple collections are not supported when parsing single pyopenms protein " + "and peptide objects." + ) from e + # Dict of UNIMOD peptide sequence and PSM object hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} # Update PeptideHits according to the PSM objects @@ -359,26 +367,29 @@ def _update_existing_ids(self, psm_dict: dict) -> None: def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: """ - Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information - from :py:class:`~psm_utils.psm.PSM`. + Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features + information from :py:class:`~psm_utils.psm.PSM`. """ peptide_hit.setScore(psm.score) peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based + # TODO: Should q-value and pep also be updated? for feature, value in psm.rescoring_features.items(): if feature not in RESCORING_FEATURE_LIST: peptide_hit.setMetaValue(feature, value) def _create_new_ids(self, psm_dict: dict) -> None: """ - Create new :py:class:`~pyopenms.ProteinIdentification` - and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList. + Create new ProteinIdentification and PeptideIdentification objects with new features from + the PSMList. """ for collection, runs in psm_dict.items(): self.protein_ids = oms.ProteinIdentification() self.peptide_ids = [] + # Set msrun filename with spectra_data meta value - msrun_reference = [run for run in runs.keys()] + msrun_reference = [str(run) for run in runs.keys()] self.protein_ids.setMetaValue("spectra_data", msrun_reference) + protein_list = [] for run, psm_dict_run in runs.items(): for spectrum_id, psms in psm_dict_run.items(): @@ -389,7 +400,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: # Fill PeptideIdentification object with PeptideHits peptide_id = oms.PeptideIdentification() peptide_id.setMetaValue("spectrum_reference", spectrum_id) - peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run)) + peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run))) if psms[0].precursor_mz is not None: peptide_id.setMZ(psms[0].precursor_mz) if psms[0].retention_time is not None: @@ -401,7 +412,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: peptide_hit = oms.PeptideHit() peptide_hit.setSequence( oms.AASequence.fromString( - self._convert_proforma_to_unimod(psm.peptidoform.sequence) + self._convert_proforma_to_unimod(str(psm.peptidoform)) ) ) peptide_hit.setCharge(psm.peptidoform.precursor_charge) @@ -419,6 +430,12 @@ def _create_new_ids(self, psm_dict: dict) -> None: self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) + if psm.protein_list is not None: + for protein in psm.protein_list: + peptide_evidence = oms.PeptideEvidence() + peptide_evidence.setProteinAccession(protein) + peptide_hit.addPeptideEvidence(peptide_evidence) + peptide_hits.append(peptide_hit) peptide_id.setHits(peptide_hits) @@ -443,9 +460,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: ) def _convert_proforma_to_unimod(self, sequence: str) -> str: - """ - Convert a peptidoform sequence in proforma notation to UNIMOD notation. - """ + """Convert a peptidoform sequence in proforma notation to UNIMOD notation.""" # Replace square brackets with parentheses sequence = sequence.replace("[", "(").replace("]", ")") @@ -458,9 +473,19 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str: return sequence def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None: - """ - Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary. - """ + """Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.""" if d is not None: for key, value in d.items(): peptide_hit.setMetaValue(key, value) + + +class IdXMLException(PSMUtilsException): + """Exception in psm_utils.io.IdXML""" + + pass + + +class IdXMLReaderEmptyListException(PSMUtilsException): + """Exception in psm_utils.io.IdXMLReader""" + + pass From ffd5138ce00b969a0a60bd2b72c915489ee27a60 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 22 Oct 2023 16:46:18 +0000 Subject: [PATCH 17/20] Incorporate feedback --- psm_utils/io/idxml.py | 100 ++++++++++++++++++++-------- tests/test_data/test_out_sage.idXML | 5 +- tests/test_io/test_idxml.py | 41 ++---------- 3 files changed, 80 insertions(+), 66 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index fed16ea..00ae57d 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -24,6 +24,7 @@ from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.psm import PSM from psm_utils.psm_list import PSMList +from psm_utils.peptidoform import Peptidoform logger = logging.getLogger(__name__) @@ -93,6 +94,7 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: """ super().__init__(filename, *args, **kwargs) self.protein_ids, self.peptide_ids = self._parse_idxml() + self.user_params_metadata = self._get_userparams_metadata(self.peptide_ids[0].getHits()[0]) self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0]) def __iter__(self) -> Iterable[PSM]: @@ -162,6 +164,12 @@ def _parse_psm( peptidoform = self._parse_peptidoform( peptide_hit.getSequence().toString(), peptide_hit.getCharge() ) + # This is needed to calculate a qvalue before rescoring the PSMList + peptide_id_metadata = { + "idxml:score_type": str(peptide_id.getScoreType()), + "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), + "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())} + peptide_hit_metadata = {key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata} return PSM( peptidoform=peptidoform, spectrum_id=peptide_id.getMetaValue("spectrum_reference"), @@ -179,17 +187,11 @@ def _parse_psm( # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back # to original sequence in writer provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, - # TODO: Can we recover any all UserParams and save them as metadata? - # This is needed to calculate a qvalue before rescoring the PSMList - metadata={ - "idxml:score_type": str(peptide_id.getScoreType()), - "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), - "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()), - }, + # Store metadata of PeptideIdentification and PeptideHit objects + metadata=peptide_id_metadata | peptide_hit_metadata, rescoring_features={ key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features - if self._is_float(peptide_hit.getMetaValue(key)) }, ) @@ -218,14 +220,20 @@ def _get_run( return run - @staticmethod - def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]: - """Get list of rescoring features from idXML.""" + def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]: + """Get list of string type UserParams attached to each PeptideHit.""" # Fill the key list with all the keys from the PeptideHit # Empty list is required for the Cython wrapper to work correctly keys = [] peptide_hit.getKeys(keys) - keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST] + keys = [key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode()))] + return keys + + def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: + """Get list of rescoring features in UserParams attached to each PeptideHit.""" + keys = [] + peptide_hit.getKeys(keys) + keys = [key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) and key.decode() in RESCORING_FEATURE_LIST] return keys @staticmethod @@ -265,15 +273,37 @@ def __init__( filename Path to PSM file. protein_ids - # TODO Add description + Optional :py:class:`~pyopenms.ProteinIdentification` object to be written to the idXML file. peptide_ids - # TODO Add description + Optional :py:class:`~pyopenms.PeptideIdentification` object to be written to the idXML file. Notes ----- - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList to a file at once with the :py:meth:`write_file` method is currently supported. + - If `protein_ids` and `peptide_ids` are provided, each :py:class:`~pyopenms.PeptideIdentification` + object in the list `peptide_ids` will be updated with new `rescoring_features` from the PSMList. + Otherwise, new pyopenms objects will be created, filled with information of PSMList and written to the idXML file. + + Examples + -------- + - Example with `pyopenms` objects: + + >>> from psm_utils.io.idxml import IdXMLReader, IdXMLWriter + >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML"") + >>> psm_list = reader.read_file() + >>> for psm in psm_list: + ... psm.rescoring_features = psm.rescoring_features | {"feature": 1} + >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML", reader.protein_ids, reader.peptide_ids) + >>> writer.write_file(psm_list) + + - Example without `pyopenms` objects: + + >>> from psm_utils.psm_list import PSMList + >>> psm_list = PSMList(psm_list=[PSM(peptidoform="ACDK", spectrum_id=1, score=140.2, retention_time=600.2)]) + >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML") + >>> writer.write_file(psm_list) """ super().__init__(filename, *args, **kwargs) @@ -370,9 +400,15 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information from :py:class:`~psm_utils.psm.PSM`. """ - peptide_hit.setScore(psm.score) - peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based - # TODO: Should q-value and pep also be updated? + if psm.score is not None: + peptide_hit.setScore(psm.score) + if psm.rank is not None: + peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based + if psm.qvalue is not None: + peptide_hit.setMetaValue("q-value", psm.qvalue) + if psm.pep is not None: + peptide_hit.setMetaValue("PEP", psm.pep) + for feature, value in psm.rescoring_features.items(): if feature not in RESCORING_FEATURE_LIST: peptide_hit.setMetaValue(feature, value) @@ -401,6 +437,9 @@ def _create_new_ids(self, psm_dict: dict) -> None: peptide_id = oms.PeptideIdentification() peptide_id.setMetaValue("spectrum_reference", spectrum_id) peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run))) + # TODO: Question: Is there a way to infer the score type from the PSM? + if psms[0].score is not None: + peptide_id.setScoreType("search_engine_score") if psms[0].precursor_mz is not None: peptide_id.setMZ(psms[0].precursor_mz) if psms[0].retention_time is not None: @@ -412,7 +451,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: peptide_hit = oms.PeptideHit() peptide_hit.setSequence( oms.AASequence.fromString( - self._convert_proforma_to_unimod(str(psm.peptidoform)) + self._convert_proforma_to_unimod(psm.peptidoform) ) ) peptide_hit.setCharge(psm.peptidoform.precursor_charge) @@ -422,8 +461,10 @@ def _create_new_ids(self, psm_dict: dict) -> None: if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target"), ) - if psm.score is not None: - peptide_hit.setScore(psm.score) + if psm.qvalue is not None: + peptide_hit.setMetaValue("q-value", psm.qvalue) + if psm.pep is not None: + peptide_hit.setMetaValue("PEP", psm.pep) if psm.rank is not None: peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based self._add_meta_values_from_dict(peptide_hit, psm.metadata) @@ -459,16 +500,21 @@ def _create_new_ids(self, psm_dict: dict) -> None: self.peptide_ids, ) - def _convert_proforma_to_unimod(self, sequence: str) -> str: + def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str: """Convert a peptidoform sequence in proforma notation to UNIMOD notation.""" - # Replace square brackets with parentheses - sequence = sequence.replace("[", "(").replace("]", ")") + sequence = str(peptidoform).split('/')[0] + + # Replace square brackets around modifications with parentheses + sequence = re.sub(r"\[([^\]]+)\]", r"(\1)", sequence) # Check for N-terminal and C-terminal modifications - if sequence[:2] == "((": - sequence = sequence.replace("((", ".[", 1).replace(")-", ".", 1) - if sequence[-2:] == "))": - sequence = sequence[::-1].replace("))", ".]", 1).replace("-(", ".", 1)[::-1] + if sequence.startswith("["): + sequence = re.sub(r"^\[([^\]]+)\]-", r"(\1)", sequence) + if sequence.endswith("]"): + sequence = re.sub(r"-\[([^\]]+)\]$", r"-(\1)", sequence) + + # Remove dashes for N-terminal and C-terminal modifications + sequence = sequence.replace(")-", ")").replace("-(", "(") return sequence diff --git a/tests/test_data/test_out_sage.idXML b/tests/test_data/test_out_sage.idXML index 2d307cb..450ef56 100644 --- a/tests/test_data/test_out_sage.idXML +++ b/tests/test_data/test_out_sage.idXML @@ -9,10 +9,11 @@ - - + + + diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 3ef4d7a..ce45556 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -27,40 +27,7 @@ def test__parse_peptidoform(self): assert IdXMLReader._parse_peptidoform(*test_in) == expected_out def test__parse_psm(self): - test_psm = PSM( - peptidoform=Peptidoform("LVPIWKK/2"), - spectrum_id="controllerType=0 controllerNumber=1 scan=294", - run="HepG2_rep3_small", - collection=None, - spectrum=None, - is_decoy=True, - score=1.0099999904632568, - qvalue=None, - pep=None, - precursor_mz=442.29595853189994, - retention_time=849.0, - ion_mobility=None, - protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"], - rank=1, - source="idXML", - provenance_data={"LVPIWKK/2": "LVPIWKK"}, - metadata={ - "idxml:score_type": "expect", - "idxml:higher_score_better": "False", - "idxml:significance_threshold": "0.0", - }, - rescoring_features={ - "MS:1002252": 0.693, - "COMET:deltaCn": 1.0, - "MS:1002255": 35.9, - "COMET:deltaLCn": 0.0, - "COMET:lnExpect": 0.009950330853168092, - "COMET:lnNumSP": 3.555348061489414, - "COMET:lnRankSP": 0.0, - "COMET:IonFrac": 0.25, - }, - ) - + test_psm = PSM(peptidoform=Peptidoform('LVPIWKK/2'), spectrum_id='controllerType=0 controllerNumber=1 scan=294', run='HepG2_rep3_small', collection=None, spectrum=None, is_decoy=True, score=1.0099999904632568, qvalue=None, pep=None, precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], rank=1, source='idXML', provenance_data={'LVPIWKK/2': 'LVPIWKK'}, metadata={'idxml:score_type': 'expect', 'idxml:higher_score_better': 'False', 'idxml:significance_threshold': '0.0', 'target_decoy': 'decoy', 'protein_references': 'unique'}, rescoring_features={'MS:1002252': 0.693, 'COMET:deltaCn': 1.0, 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25}) reader = IdXMLReader("./tests/test_data/test_in.idXML") psm = reader._parse_psm( reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0] @@ -70,7 +37,7 @@ def test__parse_psm(self): def test__get_run(self): expected_output = "HepG2_rep3_small" reader = IdXMLReader("./tests/test_data/test_in.idXML") - assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output + assert reader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output def test__get_rescoring_features(self): expected_output = [ @@ -85,7 +52,7 @@ def test__get_rescoring_features(self): ] reader = IdXMLReader("./tests/test_data/test_in.idXML") assert ( - IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) + reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output ) @@ -103,7 +70,7 @@ def test_write_file_with_pyopenms_objects(self): assert sha == expected_sha def test_write_file_without_pyopenms_objects(self): - expected_sha = "a24e157579a4e9da71900931c6b5578a264ea7830135ede480b41e6550944e5e" + expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b" reader = SageReader("./tests/test_data/results.sage.tsv") psm_list = reader.read_file() writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") From 5bb19efb89ef80bcf0a92d6878e6941f9f4527f5 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Sun, 22 Oct 2023 16:46:46 +0000 Subject: [PATCH 18/20] black --- psm_utils/io/idxml.py | 35 +++++++++++++++++++++------------ tests/test_io/test_idxml.py | 39 ++++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 00ae57d..dbf36e1 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -166,10 +166,13 @@ def _parse_psm( ) # This is needed to calculate a qvalue before rescoring the PSMList peptide_id_metadata = { - "idxml:score_type": str(peptide_id.getScoreType()), - "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), - "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())} - peptide_hit_metadata = {key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata} + "idxml:score_type": str(peptide_id.getScoreType()), + "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()), + "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()), + } + peptide_hit_metadata = { + key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata + } return PSM( peptidoform=peptidoform, spectrum_id=peptide_id.getMetaValue("spectrum_reference"), @@ -190,8 +193,7 @@ def _parse_psm( # Store metadata of PeptideIdentification and PeptideHit objects metadata=peptide_id_metadata | peptide_hit_metadata, rescoring_features={ - key: float(peptide_hit.getMetaValue(key)) - for key in self.rescoring_features + key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features }, ) @@ -226,14 +228,23 @@ def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]: # Empty list is required for the Cython wrapper to work correctly keys = [] peptide_hit.getKeys(keys) - keys = [key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode()))] + keys = [ + key.decode() + for key in keys + if not self._is_float(peptide_hit.getMetaValue(key.decode())) + ] return keys - + def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: """Get list of rescoring features in UserParams attached to each PeptideHit.""" keys = [] peptide_hit.getKeys(keys) - keys = [key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) and key.decode() in RESCORING_FEATURE_LIST] + keys = [ + key.decode() + for key in keys + if self._is_float(peptide_hit.getMetaValue(key.decode())) + and key.decode() in RESCORING_FEATURE_LIST + ] return keys @staticmethod @@ -285,7 +296,7 @@ def __init__( - If `protein_ids` and `peptide_ids` are provided, each :py:class:`~pyopenms.PeptideIdentification` object in the list `peptide_ids` will be updated with new `rescoring_features` from the PSMList. Otherwise, new pyopenms objects will be created, filled with information of PSMList and written to the idXML file. - + Examples -------- - Example with `pyopenms` objects: @@ -408,7 +419,7 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: peptide_hit.setMetaValue("q-value", psm.qvalue) if psm.pep is not None: peptide_hit.setMetaValue("PEP", psm.pep) - + for feature, value in psm.rescoring_features.items(): if feature not in RESCORING_FEATURE_LIST: peptide_hit.setMetaValue(feature, value) @@ -502,7 +513,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str: """Convert a peptidoform sequence in proforma notation to UNIMOD notation.""" - sequence = str(peptidoform).split('/')[0] + sequence = str(peptidoform).split("/")[0] # Replace square brackets around modifications with parentheses sequence = re.sub(r"\[([^\]]+)\]", r"(\1)", sequence) diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index ce45556..075b12f 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -27,7 +27,41 @@ def test__parse_peptidoform(self): assert IdXMLReader._parse_peptidoform(*test_in) == expected_out def test__parse_psm(self): - test_psm = PSM(peptidoform=Peptidoform('LVPIWKK/2'), spectrum_id='controllerType=0 controllerNumber=1 scan=294', run='HepG2_rep3_small', collection=None, spectrum=None, is_decoy=True, score=1.0099999904632568, qvalue=None, pep=None, precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], rank=1, source='idXML', provenance_data={'LVPIWKK/2': 'LVPIWKK'}, metadata={'idxml:score_type': 'expect', 'idxml:higher_score_better': 'False', 'idxml:significance_threshold': '0.0', 'target_decoy': 'decoy', 'protein_references': 'unique'}, rescoring_features={'MS:1002252': 0.693, 'COMET:deltaCn': 1.0, 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25}) + test_psm = PSM( + peptidoform=Peptidoform("LVPIWKK/2"), + spectrum_id="controllerType=0 controllerNumber=1 scan=294", + run="HepG2_rep3_small", + collection=None, + spectrum=None, + is_decoy=True, + score=1.0099999904632568, + qvalue=None, + pep=None, + precursor_mz=442.29595853189994, + retention_time=849.0, + ion_mobility=None, + protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"], + rank=1, + source="idXML", + provenance_data={"LVPIWKK/2": "LVPIWKK"}, + metadata={ + "idxml:score_type": "expect", + "idxml:higher_score_better": "False", + "idxml:significance_threshold": "0.0", + "target_decoy": "decoy", + "protein_references": "unique", + }, + rescoring_features={ + "MS:1002252": 0.693, + "COMET:deltaCn": 1.0, + "MS:1002255": 35.9, + "COMET:deltaLCn": 0.0, + "COMET:lnExpect": 0.009950330853168092, + "COMET:lnNumSP": 3.555348061489414, + "COMET:lnRankSP": 0.0, + "COMET:IonFrac": 0.25, + }, + ) reader = IdXMLReader("./tests/test_data/test_in.idXML") psm = reader._parse_psm( reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0] @@ -52,8 +86,7 @@ def test__get_rescoring_features(self): ] reader = IdXMLReader("./tests/test_data/test_in.idXML") assert ( - reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) - == expected_output + reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output ) From dc3b5d0220451691929ec17230090d01bba2528b Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 23 Oct 2023 21:11:02 +0000 Subject: [PATCH 19/20] incorporate feedback --- psm_utils/io/idxml.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index dbf36e1..109ff2c 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -191,7 +191,7 @@ def _parse_psm( # to original sequence in writer provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, # Store metadata of PeptideIdentification and PeptideHit objects - metadata=peptide_id_metadata | peptide_hit_metadata, + metadata= {**peptide_id_metadata, **peptide_hit_metadata}, rescoring_features={ key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features }, @@ -302,10 +302,10 @@ def __init__( - Example with `pyopenms` objects: >>> from psm_utils.io.idxml import IdXMLReader, IdXMLWriter - >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML"") + >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML") >>> psm_list = reader.read_file() >>> for psm in psm_list: - ... psm.rescoring_features = psm.rescoring_features | {"feature": 1} + ... psm.rescoring_features = {**psm.rescoring_features, **{"feature": 1}} >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML", reader.protein_ids, reader.peptide_ids) >>> writer.write_file(psm_list) @@ -448,7 +448,6 @@ def _create_new_ids(self, psm_dict: dict) -> None: peptide_id = oms.PeptideIdentification() peptide_id.setMetaValue("spectrum_reference", spectrum_id) peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run))) - # TODO: Question: Is there a way to infer the score type from the PSM? if psms[0].score is not None: peptide_id.setScoreType("search_engine_score") if psms[0].precursor_mz is not None: From bd3baf35fa5daa70f793faab0f626ef877110525 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 23 Oct 2023 22:19:09 +0000 Subject: [PATCH 20/20] fix byte error for p38 --- psm_utils/io/idxml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 109ff2c..84ecb26 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -434,7 +434,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: self.peptide_ids = [] # Set msrun filename with spectra_data meta value - msrun_reference = [str(run) for run in runs.keys()] + msrun_reference = [str(run).encode() for run in runs.keys()] self.protein_ids.setMetaValue("spectra_data", msrun_reference) protein_list = [] @@ -447,7 +447,7 @@ def _create_new_ids(self, psm_dict: dict) -> None: # Fill PeptideIdentification object with PeptideHits peptide_id = oms.PeptideIdentification() peptide_id.setMetaValue("spectrum_reference", spectrum_id) - peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run))) + peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run).encode())) if psms[0].score is not None: peptide_id.setScoreType("search_engine_score") if psms[0].precursor_mz is not None: