From 57d94eced9230ca090fce53869bd6903b9f0d95c Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Wed, 4 Oct 2023 16:16:46 +0000
Subject: [PATCH 01/20] draft of additional pyopenms reader

---
 psm_utils/io/idxml.py | 80 +++++++++++++++++++++++++++++++------------
 pyproject.toml        |  1 +
 2 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index fced5ca..932873d 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -13,9 +13,13 @@
 
 from __future__ import annotations
 
+from abc import abstractmethod
 import re
 
-from pyteomics.openms import idxml
+try:
+    import pyopenms as oms
+except ImportError:
+    from pyteomics.openms import idxml
 
 from psm_utils.io._base_classes import ReaderBase
 from psm_utils.psm import PSM
@@ -26,20 +30,19 @@
 MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]")
 MOD_PATTERN_CTERM = re.compile(r"\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]$")
 
-
-class IdXMLReader(ReaderBase):
+class AIdXMLReader(ReaderBase):
+    @abstractmethod
     def __iter__(self):
-        """Iterate over file and return PSMs one-by-one."""
-        with idxml.read(str(self.filename)) as reader:
-            for entry in reader:
-                for peptide_hit in entry["PeptideHit"]:
-                    yield self._parse_psm(entry, peptide_hit)
-
+        raise NotImplementedError()
+    
+    @abstractmethod
+    def _parse_psm(self):
+        raise NotImplementedError()
+    
     @staticmethod
     def _parse_peptidoform(sequence: str, charge: int):
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
-
         Notes
         -----
         Implemented according to the documentation on
@@ -56,23 +59,21 @@ def _parse_peptidoform(sequence: str, charge: int):
         sequence += f"/{charge}"
         return sequence
 
-    @staticmethod
-    def _parse_is_decoy(target_decoy: str):
-        if target_decoy == "target":
-            return False
-        elif target_decoy == "decoy":
-            return True
-        elif target_decoy == "target+decoy":
-            return False
-        else:
-            return None
+
+class IdXMLReader(AIdXMLReader):
+    def __iter__(self):
+        """Iterate over file and return PSMs one-by-one."""
+        with idxml.read(str(self.filename)) as reader:
+            for entry in reader:
+                for peptide_hit in entry["PeptideHit"]:
+                    yield self._parse_psm(entry, peptide_hit)
 
     def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM:
         """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`."""
         return PSM(
             peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]),
             spectrum_id=entry["spectrum_reference"],
-            is_decoy=self._parse_is_decoy(peptide_hit["target_decoy"]),
+            is_decoy= peptide_hit["target_decoy"] == "decoy",
             score=peptide_hit["score"],
             precursor_mz=entry["MZ"],
             retention_time=entry["RT"],
@@ -84,3 +85,40 @@ def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM:
                 "idxml:significance_threshold": str(entry["significance_threshold"]),
             },
         )
+
+
+class OmsIdXMLReader(AIdXMLReader):
+    def __iter__(self):
+        """Iterate over file and return PSMs one-by-one."""
+        # TODO: This is a bit hacky, is there a cleaner way to do this?
+        try:
+            protein_ids, peptide_ids = [], []
+            oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
+        except NameError:
+            raise ModuleNotFoundError("Install pyopenms to use OmsIdXMLReader.")
+        for peptide_id in peptide_ids:
+            for peptide_hit in peptide_id.getHits():
+                yield self._parse_psm(peptide_id, peptide_hit)
+
+    def _parse_psm(self, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM:
+        """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM` using pyopenms"""
+        return PSM(
+            peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()),
+            spectrum_id = peptide_id.getMetaValue("spectrum_reference"),
+            is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy",
+            score = peptide_hit.getScore(),
+            precursor_mz = peptide_id.getMZ(),
+            retention_time = peptide_id.getRT(),
+            # TODO: ion mobility is currently not parsed to idXML so this needs to be read out from mzml
+            protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()],
+            rank = peptide_hit.getRank() + 1, # 0-based to 1-based
+            source="idXML",
+            # TODO: Store all peptide hit related metadata?
+            metadata = {
+                "idxml:score_type": str(peptide_id.getScoreType()),
+                "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
+                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())
+            }
+            # TODO: provenance_data
+            # TODO: rescoring_features (Is there a fixed nomenclature for using these?)
+        )
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 96cbe4d..bf547f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ docs = [
     "sphinx-autobuild",
 ]
 online = ["streamlit", "plotly"]
+pyopenms = ["pyopenms"]
 
 [project.urls]
 GitHub = "https://github.com/compomics/psm_utils"

From 6c6441bc727c1982e9633b92087db1ebc56e1b90 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 10 Oct 2023 13:34:01 +0000
Subject: [PATCH 02/20] Complete version of IdXMLReader solely with pyopenms

---
 psm_utils/io/idxml.py | 211 +++++++++++++++++++++++++++++-------------
 1 file changed, 147 insertions(+), 64 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 932873d..9dff8c5 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -14,33 +14,113 @@
 from __future__ import annotations
 
 from abc import abstractmethod
+from typing import Iterable, Tuple, List
 import re
+from pathlib import Path
 
-try:
-    import pyopenms as oms
-except ImportError:
-    from pyteomics.openms import idxml
+import pyopenms as oms
 
-from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._base_classes import ReaderBase, WriterBase
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
+from psm_utils.exceptions import PSMUtilsException
 
 # Patterns to match open and closed round/square brackets
 MOD_PATTERN = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)")
 MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]")
 MOD_PATTERN_CTERM = re.compile(r"\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]$")
 
-class AIdXMLReader(ReaderBase):
-    @abstractmethod
-    def __iter__(self):
-        raise NotImplementedError()
-    
-    @abstractmethod
-    def _parse_psm(self):
-        raise NotImplementedError()
-    
-    @staticmethod
-    def _parse_peptidoform(sequence: str, charge: int):
+# Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator
+# https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp
+RESOCRING_FEATURE_LIST = [
+    "isotope_error"
+    "MS:1002049",  # MSGFPlus unchanged RawScore
+    "MS:1002050",  # MSGFPlus unchanged DeNovoScore
+    "MSGF:ScoreRatio",
+    "MSGF:Energy",
+    "MSGF:lnEValue",
+    "MSGF:lnExplainedIonCurrentRatio",
+    "MSGF:lnNTermIonCurrentRatio",
+    "MSGF:lnCTermIonCurrentRatio",
+    "MSGF:lnMS2IonCurrent",
+    "MSGF:MeanErrorTop7",
+    "MSGF:sqMeanErrorTop7",
+    "MSGF:StdevErrorTop7",
+    "XTANDEM:hyperscore",
+    "XTANDEM:deltascore",
+    "MS:1001330",  # expect_score
+    "hyperscore",  # MSfragger
+    "nextscore",  # MSfragger
+    "COMET:deltaCn",  # recalculated deltaCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1)
+    "COMET:deltaLCn",  # deltaLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1)
+    "COMET:lnExpect",  # log(E-value)
+    "MS:1002252",  # unchanged XCorr
+    "MS:1002255",  # unchanged Sp = number of candidate peptides
+    "COMET:lnNumSP",  # log(number of candidate peptides)
+    "COMET:lnRankSP",  # log(rank based on Sp score)
+    "COMET:IonFrac",  # matched_ions / total_ions
+    "MS:1001171",  # unchanged mScore
+    "MASCOT:delta_score",  # delta score based on mScore
+    "CONCAT:lnEvalue",
+    "CONCAT:deltaLnEvalue",
+    "SAGE:ln(-poisson)"
+    "SAGE:ln(delta_best)",
+    "SAGE:ln(delta_next)",
+    "SAGE:ln(matched_intensity_pct)",
+    "SAGE:longest_b",
+    "SAGE:longest_y",
+    "SAGE:longest_y_pct",
+    "SAGE:matched_peaks",
+    "SAGE:scored_candidates"
+]
+
+class IdXMLReader(ReaderBase):
+    def __init__(self, filename: str | Path, *args, **kwargs) -> None:
+        """
+        Reader for idXML files.
+
+        Parameters
+        ----------
+        filename: str, pathlib.Path
+            Path to idXML file.
+        Examples
+        --------
+        >>> from psm_utils.io import IdXMLReader
+        >>> reader = IdXMLReader("example.idXML")
+        >>> psm_list = [psm for psm in reader]
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = Path(filename)
+        protein_ids, peptide_ids = self._parse_idxml()
+        self.protein_ids = protein_ids
+        self.peptide_ids = peptide_ids
+        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
+
+    def __iter__(self) -> Iterable[PSM]:
+        """
+        Iterate over file and return PSMs one-by-one.
+        """
+        for peptide_id in self.peptide_ids:
+            for peptide_hit in peptide_id.getHits():
+                yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit)
+
+    def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
+        """
+        Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty.
+        """
+        protein_ids, peptide_ids = [], []
+        oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
+
+        if len(protein_ids) == 0:
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.")
+        elif len(peptide_ids) == 0:
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentifcation` to parse.")
+        elif len(peptide_ids[0].getHits()) == 0:
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.")
+        else:
+            return protein_ids, peptide_ids
+
+    def _parse_peptidoform(self, sequence: str, charge: int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
         Notes
@@ -57,68 +137,71 @@ def _parse_peptidoform(sequence: str, charge: int):
             sequence = MOD_PATTERN_CTERM.sub(r"-[\1]", sequence)
         sequence = sequence.strip(".")
         sequence += f"/{charge}"
-        return sequence
-
-
-class IdXMLReader(AIdXMLReader):
-    def __iter__(self):
-        """Iterate over file and return PSMs one-by-one."""
-        with idxml.read(str(self.filename)) as reader:
-            for entry in reader:
-                for peptide_hit in entry["PeptideHit"]:
-                    yield self._parse_psm(entry, peptide_hit)
-
-    def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM:
-        """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`."""
-        return PSM(
-            peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]),
-            spectrum_id=entry["spectrum_reference"],
-            is_decoy= peptide_hit["target_decoy"] == "decoy",
-            score=peptide_hit["score"],
-            precursor_mz=entry["MZ"],
-            retention_time=entry["RT"],
-            protein_list=[protein["accession"] for protein in peptide_hit["protein"]],
-            source="idXML",
-            metadata={
-                "idxml:score_type": str(entry["score_type"]),
-                "idxml:higher_score_better": str(entry["higher_score_better"]),
-                "idxml:significance_threshold": str(entry["significance_threshold"]),
-            },
-        )
 
+        return sequence
 
-class OmsIdXMLReader(AIdXMLReader):
-    def __iter__(self):
-        """Iterate over file and return PSMs one-by-one."""
-        # TODO: This is a bit hacky, is there a cleaner way to do this?
-        try:
-            protein_ids, peptide_ids = [], []
-            oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
-        except NameError:
-            raise ModuleNotFoundError("Install pyopenms to use OmsIdXMLReader.")
-        for peptide_id in peptide_ids:
-            for peptide_hit in peptide_id.getHits():
-                yield self._parse_psm(peptide_id, peptide_hit)
 
-    def _parse_psm(self, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM:
-        """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM` using pyopenms"""
+    def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM:
+        """
+        Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`.
+        Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentifcation`
+        to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object.
+        """
         return PSM(
             peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()),
             spectrum_id = peptide_id.getMetaValue("spectrum_reference"),
+            run = self._get_run(protein_ids, peptide_id),
             is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy",
             score = peptide_hit.getScore(),
             precursor_mz = peptide_id.getMZ(),
             retention_time = peptide_id.getRT(),
-            # TODO: ion mobility is currently not parsed to idXML so this needs to be read out from mzml
+            # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file
             protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()],
             rank = peptide_hit.getRank() + 1, # 0-based to 1-based
             source="idXML",
-            # TODO: Store all peptide hit related metadata?
+            # This is needed to calculate a qvalue before rescoring the PSMList
             metadata = {
                 "idxml:score_type": str(peptide_id.getScoreType()),
                 "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
                 "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())
-            }
-            # TODO: provenance_data
-            # TODO: rescoring_features (Is there a fixed nomenclature for using these?)
-        )
\ No newline at end of file
+            },
+            rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features if self._is_float(peptide_hit.getMetaValue(key))}
+        )
+
+    def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
+        """
+        Get run name from idXML using pyopenms
+        If idXML contains merge index, use it to annotate the run name without extension
+        """
+        if peptide_id.metaValueExists("id_merge_index"):
+            return Path(protein_ids[0].getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")].decode()).stem
+        else:
+            return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
+
+    def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
+        """
+        Get list of rescoring features from idXML
+        """
+        # Fill the key list with all the keys from the PeptideHit in the pyopenms way
+        keys = []
+        peptide_hit.getKeys(keys)
+        # Convert from byte to str and remove the keys, which values are not in the fixed list of features
+        keys = [key.decode() for key in keys if  key.decode() in RESOCRING_FEATURE_LIST]
+
+        return keys
+
+    def _is_float(self, element: any) -> bool:
+        """
+        Check if element is float.
+        """
+        if element is None: 
+            return False
+        try:
+            float(element)
+            return True
+        except ValueError:
+            return False
+        
+
+class IdXMLReaderEmptyListException(PSMUtilsException):
+    """Exception in psm_utils.io.IdXMLReader"""
\ No newline at end of file

From e422496f95bba9aed3c86c9a7549ee8bd909925e Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 10 Oct 2023 13:36:11 +0000
Subject: [PATCH 03/20] pyopenms as fixed dependency

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bf547f8..ba85cab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dynamic = ["version"]
 requires-python = ">=3.7"
 dependencies = [
     "pyteomics >= 4",
+    "pyopenms",
     "lxml",
     "psims",
     "pandas",
@@ -44,7 +45,6 @@ docs = [
     "sphinx-autobuild",
 ]
 online = ["streamlit", "plotly"]
-pyopenms = ["pyopenms"]
 
 [project.urls]
 GitHub = "https://github.com/compomics/psm_utils"

From 503e64dd05d52ac8a99617eec9d769451b7c7ff6 Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Tue, 10 Oct 2023 17:35:05 +0200
Subject: [PATCH 04/20] Update psm_utils/io/idxml.py

Co-authored-by: Ralf Gabriels <ralfg@hotmail.be>
---
 psm_utils/io/idxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 9dff8c5..dcfafc4 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -32,7 +32,7 @@
 
 # Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator
 # https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp
-RESOCRING_FEATURE_LIST = [
+RESCORING_FEATURE_LIST = [
     "isotope_error"
     "MS:1002049",  # MSGFPlus unchanged RawScore
     "MS:1002050",  # MSGFPlus unchanged DeNovoScore

From 9287015e887bf3680b295cf2aae1ef4881ae2ef1 Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Tue, 10 Oct 2023 17:47:31 +0200
Subject: [PATCH 05/20] nitpicks

---
 psm_utils/io/idxml.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index dcfafc4..4e4f31c 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import Iterable, Tuple, List
+from typing import Union, Iterable, Tuple, List
 import re
 from pathlib import Path
 
@@ -75,7 +75,7 @@
 ]
 
 class IdXMLReader(ReaderBase):
-    def __init__(self, filename: str | Path, *args, **kwargs) -> None:
+    def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         """
         Reader for idXML files.
 
@@ -83,6 +83,7 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None:
         ----------
         filename: str, pathlib.Path
             Path to idXML file.
+
         Examples
         --------
         >>> from psm_utils.io import IdXMLReader
@@ -123,6 +124,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
     def _parse_peptidoform(self, sequence: str, charge: int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
+
         Notes
         -----
         Implemented according to the documentation on
@@ -186,7 +188,7 @@ def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
         keys = []
         peptide_hit.getKeys(keys)
         # Convert from byte to str and remove the keys, which values are not in the fixed list of features
-        keys = [key.decode() for key in keys if  key.decode() in RESOCRING_FEATURE_LIST]
+        keys = [key.decode() for key in keys if  key.decode() in RESCORING_FEATURE_LIST]
 
         return keys
 
@@ -204,4 +206,4 @@ def _is_float(self, element: any) -> bool:
         
 
 class IdXMLReaderEmptyListException(PSMUtilsException):
-    """Exception in psm_utils.io.IdXMLReader"""
\ No newline at end of file
+    """Exception in psm_utils.io.IdXMLReader"""

From 0a2db3e4a32ef3c9ef9cf57fa96ce591feae63a2 Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Tue, 10 Oct 2023 17:49:12 +0200
Subject: [PATCH 06/20] typo

---
 psm_utils/io/idxml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 4e4f31c..73205e6 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -115,7 +115,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
         if len(protein_ids) == 0:
             raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.")
         elif len(peptide_ids) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentifcation` to parse.")
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentification` to parse.")
         elif len(peptide_ids[0].getHits()) == 0:
             raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.")
         else:
@@ -146,7 +146,7 @@ def _parse_peptidoform(self, sequence: str, charge: int) -> str:
     def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM:
         """
         Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`.
-        Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentifcation`
+        Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
         to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object.
         """
         return PSM(

From 042c91eb3aeb6a2b58c01870ba81bb49baaa978a Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 10 Oct 2023 19:38:44 +0000
Subject: [PATCH 07/20] move idxml parsing to iter, add decoy function

---
 psm_utils/io/idxml.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 73205e6..66eef38 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -92,18 +92,17 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         """
         super().__init__(filename, *args, **kwargs)
         self.filename = Path(filename)
-        protein_ids, peptide_ids = self._parse_idxml()
-        self.protein_ids = protein_ids
-        self.peptide_ids = peptide_ids
-        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
+        self.rescoring_features = None
 
     def __iter__(self) -> Iterable[PSM]:
         """
         Iterate over file and return PSMs one-by-one.
         """
-        for peptide_id in self.peptide_ids:
+        protein_ids, peptide_ids = self._parse_idxml()
+        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
+        for peptide_id in peptide_ids:
             for peptide_hit in peptide_id.getHits():
-                yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit)
+                yield self._parse_psm(protein_ids, peptide_id, peptide_hit)
 
     def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
         """
@@ -113,11 +112,11 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
         oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
 
         if len(protein_ids) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins.")
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins. Nothing to parse.")
         elif len(peptide_ids) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideIdentification` to parse.")
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideIdentifications. Nothing to parse.")
         elif len(peptide_ids[0].getHits()) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no :py:class:`~pyopenms.PeptideHit` (PSM) to parse.")
+            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideHits. Nothing to parse.")
         else:
             return protein_ids, peptide_ids
 
@@ -153,7 +152,7 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
             peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()),
             spectrum_id = peptide_id.getMetaValue("spectrum_reference"),
             run = self._get_run(protein_ids, peptide_id),
-            is_decoy = peptide_hit.getMetaValue("target_decoy") == "decoy",
+            is_decoy = self._is_decoy(peptide_hit),
             score = peptide_hit.getScore(),
             precursor_mz = peptide_id.getMZ(),
             retention_time = peptide_id.getRT(),
@@ -167,7 +166,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
                 "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
                 "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())
             },
-            rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features if self._is_float(peptide_hit.getMetaValue(key))}
+            rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features \
+                                  if self._is_float(peptide_hit.getMetaValue(key))}
         )
 
     def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
@@ -204,6 +204,14 @@ def _is_float(self, element: any) -> bool:
         except ValueError:
             return False
         
+    def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool:
+        """
+        Check if PSM is target or decoy
+        """
+        if peptide_hit.metaValueExists("target_decoy"):
+            return peptide_hit.getMetaValue("target_decoy") == "decoy"
+        else:
+            return None
 
 class IdXMLReaderEmptyListException(PSMUtilsException):
     """Exception in psm_utils.io.IdXMLReader"""

From 83e8792251bcb18668b6a60015741c46deff13a7 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Wed, 11 Oct 2023 09:13:10 +0000
Subject: [PATCH 08/20] add intersphinx config

---
 docs/source/conf.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3c05a45..36b1602 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -40,6 +40,14 @@
 autodoc_member_order = "bysource"
 autoclass_content = "init"
 
+# Intersphinx options
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "pyteomics": ("https://pyteomics.readthedocs.io/en/latest/", None),
+    "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None)
+}
 
 def setup(app):
     config = {

From 280f671dd07875d78156202b39858f75808dc34d Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Fri, 13 Oct 2023 09:40:59 +0000
Subject: [PATCH 09/20] Adjustments to idxml reader

---
 psm_utils/io/idxml.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 66eef38..1757e37 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -92,17 +92,18 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         """
         super().__init__(filename, *args, **kwargs)
         self.filename = Path(filename)
-        self.rescoring_features = None
+        protein_ids, peptide_ids = self._parse_idxml()
+        self.protein_ids = protein_ids
+        self.peptide_ids = peptide_ids
+        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
 
     def __iter__(self) -> Iterable[PSM]:
         """
         Iterate over file and return PSMs one-by-one.
         """
-        protein_ids, peptide_ids = self._parse_idxml()
-        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
-        for peptide_id in peptide_ids:
+        for peptide_id in self.peptide_ids:
             for peptide_hit in peptide_id.getHits():
-                yield self._parse_psm(protein_ids, peptide_id, peptide_hit)
+                yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit)
 
     def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
         """
@@ -120,6 +121,7 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
         else:
             return protein_ids, peptide_ids
 
+
     def _parse_peptidoform(self, sequence: str, charge: int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
@@ -148,8 +150,9 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
         Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
         to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object.
         """
+        peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge())
         return PSM(
-            peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge()),
+            peptidoform = peptidoform,
             spectrum_id = peptide_id.getMetaValue("spectrum_reference"),
             run = self._get_run(protein_ids, peptide_id),
             is_decoy = self._is_decoy(peptide_hit),
@@ -160,6 +163,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
             protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()],
             rank = peptide_hit.getRank() + 1, # 0-based to 1-based
             source="idXML",
+            # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer
+            provenance_data = {str(peptidoform): peptide_hit.getSequence().toString()},
             # This is needed to calculate a qvalue before rescoring the PSMList
             metadata = {
                 "idxml:score_type": str(peptide_id.getScoreType()),
@@ -213,5 +218,6 @@ def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool:
         else:
             return None
 
+
 class IdXMLReaderEmptyListException(PSMUtilsException):
-    """Exception in psm_utils.io.IdXMLReader"""
+    """Exception in psm_utils.io.IdXMLReader"""
\ No newline at end of file

From 00d3577cefdcf23301734a37be40b33a95b57d84 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Fri, 13 Oct 2023 11:13:11 +0000
Subject: [PATCH 10/20] Add tests for reader

---
 tests/test_data/test.idXML  | 153 ++++++++++++++++++++++++++++++++++++
 tests/test_io/test_idxml.py |  65 ++++++++++++++-
 2 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_data/test.idXML

diff --git a/tests/test_data/test.idXML b/tests/test_data/test.idXML
new file mode 100644
index 0000000..c1983ad
--- /dev/null
+++ b/tests/test_data/test.idXML
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="https://www.openms.de/xml-stylesheet/IdXML.xsl" ?>
+<IdXML version="1.5" xsi:noNamespaceSchemaLocation="https://www.openms.de/xml-schema/IdXML_1_5.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+	<SearchParameters id="SP_0" db="UP000005640_9606_decoy.fasta" db_version="" taxonomy="" mass_type="monoisotopic" charges="0:0" enzyme="trypsin" missed_cleavages="1" precursor_peak_tolerance="10" precursor_peak_tolerance_ppm="true" peak_mass_tolerance="0.01" peak_mass_tolerance_ppm="false" >
+		<VariableModification name="Oxidation (M)" />
+				<UserParam type="string" name="CometAdapter:1:in" value="HepG2_rep3_small.mzML"/>
+				<UserParam type="string" name="CometAdapter:1:out" value="HepG2_rep3_small.idXML"/>
+				<UserParam type="string" name="CometAdapter:1:database" value="UP000005640_9606_decoy.fasta"/>
+				<UserParam type="string" name="CometAdapter:1:comet_executable" value="comet.exe"/>
+				<UserParam type="string" name="CometAdapter:1:pin_out" value="HepG2_rep3_small_pin.tsv"/>
+				<UserParam type="string" name="CometAdapter:1:default_params_file" value=""/>
+				<UserParam type="float" name="CometAdapter:1:precursor_mass_tolerance" value="10.0"/>
+				<UserParam type="string" name="CometAdapter:1:precursor_error_units" value="ppm"/>
+				<UserParam type="string" name="CometAdapter:1:isotope_error" value="off"/>
+				<UserParam type="float" name="CometAdapter:1:fragment_mass_tolerance" value="0.01"/>
+				<UserParam type="string" name="CometAdapter:1:fragment_error_units" value="Da"/>
+				<UserParam type="float" name="CometAdapter:1:fragment_bin_offset" value="0.0"/>
+				<UserParam type="string" name="CometAdapter:1:instrument" value="high_res"/>
+				<UserParam type="string" name="CometAdapter:1:use_A_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_B_ions" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:use_C_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_X_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_Y_ions" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:use_Z_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_NL_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:enzyme" value="Trypsin"/>
+				<UserParam type="string" name="CometAdapter:1:second_enzyme" value=""/>
+				<UserParam type="string" name="CometAdapter:1:num_enzyme_termini" value="fully"/>
+				<UserParam type="int" name="CometAdapter:1:missed_cleavages" value="1"/>
+				<UserParam type="int" name="CometAdapter:1:min_peptide_length" value="5"/>
+				<UserParam type="int" name="CometAdapter:1:max_peptide_length" value="63"/>
+				<UserParam type="int" name="CometAdapter:1:num_hits" value="1"/>
+				<UserParam type="string" name="CometAdapter:1:precursor_charge" value="0:0"/>
+				<UserParam type="string" name="CometAdapter:1:override_charge" value="keep known search unknown"/>
+				<UserParam type="int" name="CometAdapter:1:ms_level" value="2"/>
+				<UserParam type="string" name="CometAdapter:1:activation_method" value="ALL"/>
+				<UserParam type="string" name="CometAdapter:1:digest_mass_range" value="600:5000"/>
+				<UserParam type="int" name="CometAdapter:1:max_fragment_charge" value="3"/>
+				<UserParam type="int" name="CometAdapter:1:max_precursor_charge" value="5"/>
+				<UserParam type="string" name="CometAdapter:1:clip_nterm_methionine" value="false"/>
+				<UserParam type="int" name="CometAdapter:1:spectrum_batch_size" value="20000"/>
+				<UserParam type="floatList" name="CometAdapter:1:mass_offsets" value="[0.0]"/>
+				<UserParam type="int" name="CometAdapter:1:minimum_peaks" value="10"/>
+				<UserParam type="float" name="CometAdapter:1:minimum_intensity" value="0.0"/>
+				<UserParam type="string" name="CometAdapter:1:remove_precursor_peak" value="no"/>
+				<UserParam type="float" name="CometAdapter:1:remove_precursor_tolerance" value="1.5"/>
+				<UserParam type="string" name="CometAdapter:1:clear_mz_range" value="0:0"/>
+				<UserParam type="stringList" name="CometAdapter:1:fixed_modifications" value="[]"/>
+				<UserParam type="stringList" name="CometAdapter:1:variable_modifications" value="[Oxidation (M)]"/>
+				<UserParam type="intList" name="CometAdapter:1:binary_modifications" value="[]"/>
+				<UserParam type="int" name="CometAdapter:1:max_variable_mods_in_peptide" value="5"/>
+				<UserParam type="string" name="CometAdapter:1:require_variable_mod" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:reindex" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:log" value=""/>
+				<UserParam type="int" name="CometAdapter:1:debug" value="0"/>
+				<UserParam type="int" name="CometAdapter:1:threads" value="2"/>
+				<UserParam type="string" name="CometAdapter:1:no_progress" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:force" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:test" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:decoy_string" value=""/>
+				<UserParam type="string" name="CometAdapter:1:decoy_string_position" value="prefix"/>
+				<UserParam type="string" name="CometAdapter:1:missing_decoy_action" value="warn"/>
+				<UserParam type="string" name="CometAdapter:1:write_protein_sequence" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:write_protein_description" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:keep_unreferenced_proteins" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:unmatched_action" value="error"/>
+				<UserParam type="int" name="CometAdapter:1:aaa_max" value="3"/>
+				<UserParam type="int" name="CometAdapter:1:mismatches_max" value="0"/>
+				<UserParam type="string" name="CometAdapter:1:IL_equivalent" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:allow_nterm_protein_cleavage" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:name" value="auto"/>
+				<UserParam type="string" name="CometAdapter:1:specificity" value="auto"/>
+				<UserParam type="string" name="EnzymeTermSpecificity" value="full"/>
+				<UserParam type="string" name="SE:Comet" value="# comet_version 2023.01 rev. 0"/>
+				<UserParam type="string" name="Comet:db" value="UP000005640_9606_decoy.fasta"/>
+				<UserParam type="string" name="Comet:db_version" value=""/>
+				<UserParam type="string" name="Comet:taxonomy" value=""/>
+				<UserParam type="string" name="Comet:charges" value="0:0"/>
+				<UserParam type="string" name="Comet:fixed_modifications" value=""/>
+				<UserParam type="string" name="Comet:variable_modifications" value="Oxidation (M)"/>
+				<UserParam type="int" name="Comet:missed_cleavages" value="1"/>
+				<UserParam type="float" name="Comet:fragment_mass_tolerance" value="0.01"/>
+				<UserParam type="string" name="Comet:fragment_mass_tolerance_unit" value="Da"/>
+				<UserParam type="float" name="Comet:precursor_mass_tolerance" value="10.0"/>
+				<UserParam type="string" name="Comet:precursor_mass_tolerance_unit" value="ppm"/>
+				<UserParam type="string" name="Comet:digestion_enzyme" value="Trypsin"/>
+				<UserParam type="int" name="Comet:enzyme_term_specificity" value="2"/>
+				<UserParam type="string" name="feature_extractor" value="TOPP_PSMFeatureExtractor"/>
+				<UserParam type="string" name="extra_features" value="isotope_error,COMET:deltaCn,COMET:deltaLCn,COMET:lnExpect,MS:1002252,MS:1002255,COMET:lnNumSP,COMET:lnRankSP,COMET:IonFrac"/>
+				<UserParam name="EnzymeTermSpecificity" type="string" value="full" />
+	</SearchParameters>
+	<IdentificationRun date="2023-10-02T14:26:36" search_engine="Comet" search_engine_version="" search_parameters_ref="SP_0" >
+		<ProteinIdentification score_type="" higher_score_better="true" significance_threshold="0" >
+			<ProteinHit id="PH_0" accession="DECOY_sp|Q5TH74|STPG1_HUMAN" score="0.0" sequence="" >
+				<UserParam type="string" name="target_decoy" value="decoy"/>
+			</ProteinHit>
+			<ProteinHit id="PH_1" accession="sp|Q8WTT2|NOC3L_HUMAN" score="0.0" sequence="" >
+				<UserParam type="string" name="target_decoy" value="target"/>
+			</ProteinHit>
+			<UserParam type="stringList" name="spectra_data" value="[HepG2_rep3_small.mzML,HepG2_rep1_small.mzML,HepG2_rep2_small.mzML]"/>
+		</ProteinIdentification>
+		<PeptideIdentification score_type="expect" higher_score_better="false" significance_threshold="0.0" MZ="442.295958531899942" RT="849.0" spectrum_reference="controllerType=0 controllerNumber=1 scan=294" >
+			<PeptideHit score="1.01" sequence="LVPIWKK" charge="2" aa_before="[" aa_after="D" start="0" end="6" protein_refs="PH_0" >
+				<UserParam type="string" name="target_decoy" value="decoy"/>
+				<UserParam type="string" name="MS:1002258" value="3"/>
+				<UserParam type="string" name="MS:1002259" value="12"/>
+				<UserParam type="string" name="num_matched_peptides" value="35"/>
+				<UserParam type="int" name="isotope_error" value="0"/>
+				<UserParam type="float" name="MS:1002252" value="0.693"/>
+				<UserParam type="float" name="COMET:xcorr" value="0.693"/>
+				<UserParam type="float" name="MS:1002253" value="1.0"/>
+				<UserParam type="float" name="COMET:deltaCn" value="1.0"/>
+				<UserParam type="float" name="MS:1002255" value="35.899999999999999"/>
+				<UserParam type="float" name="COMET:spscore" value="35.899999999999999"/>
+				<UserParam type="float" name="MS:1002256" value="1.0"/>
+				<UserParam type="float" name="COMET:sprank" value="1.0"/>
+				<UserParam type="float" name="MS:1002257" value="1.01"/>
+				<UserParam type="string" name="protein_references" value="unique"/>
+				<UserParam type="float" name="COMET:deltaLCn" value="0.0"/>
+				<UserParam type="float" name="COMET:lnExpect" value="9.950330853168092e-03"/>
+				<UserParam type="float" name="COMET:lnNumSP" value="3.555348061489414"/>
+				<UserParam type="float" name="COMET:lnRankSP" value="0.0"/>
+				<UserParam type="float" name="COMET:IonFrac" value="0.25"/>
+			</PeptideHit>
+			<UserParam type="int" name="id_merge_index" value="0"/>
+		</PeptideIdentification>
+		<PeptideIdentification score_type="expect" higher_score_better="false" significance_threshold="0.0" MZ="554.316466531900005" RT="857.0" spectrum_reference="controllerType=0 controllerNumber=1 scan=313" >
+			<PeptideHit score="999.0" sequence="QFKQQSTLK" charge="2" aa_before="K" aa_after="K" start="31" end="39" protein_refs="PH_1" >
+				<UserParam type="string" name="target_decoy" value="target"/>
+				<UserParam type="string" name="MS:1002258" value="1"/>
+				<UserParam type="string" name="MS:1002259" value="16"/>
+				<UserParam type="string" name="num_matched_peptides" value="216"/>
+				<UserParam type="int" name="isotope_error" value="0"/>
+				<UserParam type="float" name="MS:1002252" value="0.172"/>
+				<UserParam type="float" name="COMET:xcorr" value="0.172"/>
+				<UserParam type="float" name="MS:1002253" value="1.0"/>
+				<UserParam type="float" name="COMET:deltaCn" value="1.0"/>
+				<UserParam type="float" name="MS:1002255" value="4.4"/>
+				<UserParam type="float" name="COMET:spscore" value="4.4"/>
+				<UserParam type="float" name="MS:1002256" value="1.0"/>
+				<UserParam type="float" name="COMET:sprank" value="1.0"/>
+				<UserParam type="float" name="MS:1002257" value="999.0"/>
+				<UserParam type="string" name="protein_references" value="unique"/>
+				<UserParam type="float" name="COMET:deltaLCn" value="0.0"/>
+				<UserParam type="float" name="COMET:lnExpect" value="6.906754778648554"/>
+				<UserParam type="float" name="COMET:lnNumSP" value="5.375278407684165"/>
+				<UserParam type="float" name="COMET:lnRankSP" value="0.0"/>
+				<UserParam type="float" name="COMET:IonFrac" value="0.0625"/>
+			</PeptideHit>
+			<UserParam type="int" name="id_merge_index" value="0"/>
+		</PeptideIdentification>
+	</IdentificationRun>
+</IdXML>
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index 808f1a1..2d32bca 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -1,9 +1,10 @@
 """Tests for psm_utils.io.idxml."""
 
 from psm_utils.io.idxml import IdXMLReader
-
+from psm_utils.psm import PSM
 
 class TestIdXMLReader:
+
     def test__parse_peptidoform(self):
         test_cases = [
             (("DFPIAMGER", 2), "DFPIAMGER/2"),
@@ -20,3 +21,65 @@ def test__parse_peptidoform(self):
 
         for test_in, expected_out in test_cases:
             assert IdXMLReader._parse_peptidoform(*test_in) == expected_out
+
+
+    def test__parse_psm(self):
+        test_psm = PSM(
+            peptidoform=Peptidoform('LVPIWKK/2'),
+            spectrum_id='controllerType=0 controllerNumber=1 scan=294',
+            run='HepG2_rep3_small',
+            collection=None,
+            spectrum=None,
+            is_decoy=True,
+            score=1.0099999904632568,
+            qvalue=None,
+            pep=None,
+            precursor_mz=442.29595853189994,
+            retention_time=849.0,
+            ion_mobility=None,
+            protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'],
+            rank=1,
+            source='idXML',
+            provenance_data={'LVPIWKK/2': 'LVPIWKK'},
+            metadata={
+                'idxml:score_type': 'expect',
+                'idxml:higher_score_better': 'False',
+                'idxml:significance_threshold': '0.0'
+            },
+            rescoring_features={'MS:1002252': 0.693,
+                                'COMET:deltaCn': 1.0,
+                                'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25}
+        )
+
+        reader = IdXMLReader("./tests/test_data/test.idXML")
+        psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0])
+        assert psm.peptidoform == test_psm.peptidoform
+        assert psm.spectrum_id == test_psm.spectrum_id
+        assert psm.run == test_psm.run
+        assert psm.collection == test_psm.collection
+        assert psm.spectrum == test_psm.spectrum
+        assert psm.is_decoy == test_psm.is_decoy
+        assert psm.score == test_psm.score
+        assert psm.qvalue == test_psm.qvalue
+        assert psm.pep == test_psm.pep
+        assert psm.precursor_mz == test_psm.precursor_mz
+        assert psm.retention_time == test_psm.retention_time
+        assert psm.ion_mobility == test_psm.ion_mobility
+        assert psm.protein_list == test_psm.protein_list
+        assert psm.rank == test_psm.rank
+        assert psm.source == test_psm.source
+        assert psm.provenance_data == test_psm.provenance_data
+        assert psm.metadata == test_psm.metadata
+        assert psm.rescoring_features == test_psm.rescoring_features
+
+
+    def test__get_run():
+        reader = IdXMLReader("./tests/test_data/test.idXML")
+        expected_output = "HepG2_rep3_small"
+        assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
+    
+
+    def _get_rescoring_features():
+        reader = IdXMLReader("./tests/test_data/test.idXML")
+        expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac']
+        assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output

From c4c646edb75e0e1a6d0767d7eaf109f448fa786a Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Fri, 13 Oct 2023 11:22:41 +0000
Subject: [PATCH 11/20] Add idxml writer

---
 psm_utils/io/idxml.py | 180 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 179 insertions(+), 1 deletion(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 1757e37..b99f828 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -220,4 +220,182 @@ def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool:
 
 
 class IdXMLReaderEmptyListException(PSMUtilsException):
-    """Exception in psm_utils.io.IdXMLReader"""
\ No newline at end of file
+    """Exception in psm_utils.io.IdXMLReader"""
+
+
+class IdXMLWriter(WriterBase):
+    def __init__(
+        self,
+        filename: str | Path,
+        protein_ids = None,
+        peptide_ids = None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Writer for idXML files.
+
+        Parameters
+        ----------
+        filename: str, Pathlib.Path
+            Path to PSM file.
+        run : str, optional
+            Name of the MS run. Usually the spectrum file filename without
+            extension.
+
+        Notes
+        -----
+        - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
+          a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
+          to a file per collection with the :py:meth:`write_file` method is currently supported.
+        - If :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
+          objects are provided, the :py:meth:`write_file` method will update the objects with novel
+          features from rescoring_features dictionary of the :py:class:`~psm_utils.psm.PSM` objects 
+          in the PSMList and write them to a idXML file.
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.protein_ids = protein_ids
+        self.peptide_ids = peptide_ids
+        self._writer = None
+
+    def __enter__(self) -> IdXMLWriter:
+        return self
+
+    def __exit__(self, *args, **kwargs) -> None:
+        pass
+
+    def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
+        """
+        Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information
+        from :py:class:`~psm_utils.psm.PSM`.
+        """
+        peptide_hit.setScore(psm.score)
+        peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based
+        for feature, value in psm.rescoring_features.items():
+            if feature not in RESCORING_FEATURE_LIST:
+                peptide_hit.setMetaValue(feature, value)
+
+    def _convert_proforma_to_unimod(self, sequence: str) -> str:
+        """
+        Convert a peptidoform sequence in proforma notation to UNIMOD notation.
+        """
+        # Replace square brackets with parentheses
+        sequence = sequence.replace("[", "(").replace("]", ")")
+
+        # Check for N-terminal and C-terminal modifications
+        if sequence[:2] == "((":
+            sequence = sequence.replace("((", ".[", 1).replace(")-", ".", 1)
+        if sequence[-2:] == "))":
+            sequence = sequence[::-1].replace("))", ".]", 1).replace("-(", ".", 1)[::-1]
+
+        return sequence
+
+    def write_psm(self, psm: PSM):
+        """
+        Write a single PSM to the PSM file.
+
+        This method is currently not supported (see Notes).
+
+        Raises
+        ------
+        NotImplementedError
+            IdXMLWriter currently does not support write_psm.
+
+        """
+        raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
+    
+    def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None:
+        """
+        Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.
+        """
+        if d is not None:
+            for key, value in d.items():
+                peptide_hit.setMetaValue(key, value)
+
+    def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = None, peptide_ids: oms.PeptideIdentification = None) -> None:
+        """
+        Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
+        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        """
+        psm_dict = psm_list.get_psm_dict()
+
+        if protein_ids is not None and peptide_ids is not None:
+            # Access run name(s) from ProteinIdentification
+            spectrum_files = [Path(run.decode()).stem for run in protein_ids[0].getMetaValue("spectra_data")]
+            for peptide_id in peptide_ids:
+                if len(spectrum_files) > 1:
+                    run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
+                else:
+                    run = spectrum_files[0]
+                # Get PSM objects associated from runs since we are writing a merged idXML
+                # NOTE: Collections with multiple protein_ids and peptide_ids is not supported
+                try:
+                    psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")]
+                except KeyError:
+                    print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.")
+                # Dict of UNIMOD peptide sequence and PSM object
+                hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms}
+                # Update PeptideHits according to the PSM objects
+                updated_peptide_hits = []
+                for peptide_hit in peptide_id.getHits():
+                    sequence = peptide_hit.getSequence().toString()
+                    psm = hit_dict[sequence]
+                    self._update_peptide_hit(peptide_hit, psm)
+                    updated_peptide_hits.append(peptide_hit)
+
+                peptide_id.setHits(updated_peptide_hits)
+
+            oms.IdXMLFile().store(str(self.filename), protein_ids, peptide_ids)
+
+        else:
+            for collection, runs in psm_dict.items():
+                protein_ids = oms.ProteinIdentification()
+                # Set msrun filename with spectra_data meta value
+                msrun_reference = [run for run in runs.keys()]
+                protein_ids.setMetaValue("spectra_data", msrun_reference)
+                protein_list = []
+                peptide_ids = []
+                for run, psm_dict_run in runs.items():
+                    for spectrum_id, psms in psm_dict_run.items():                    
+                        protein_list.append([accession for psm in psms for accession in psm.protein_list])
+
+                        # Fill PeptideIdentification object with PeptideHits
+                        peptide_id = oms.PeptideIdentification()
+                        peptide_id.setMetaValue("spectrum_reference", spectrum_id)
+                        peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run))
+                        if psms[0].precursor_mz is not None:
+                            peptide_id.setMZ(psms[0].precursor_mz)
+                        if psms[0].retention_time is not None:
+                            peptide_id.setRT(psms[0].retention_time)
+
+                        # Fill PeptideHits object
+                        peptide_hits = []
+                        for psm in psms:
+                            peptide_hit = oms.PeptideHit()
+                            peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence)))
+                            peptide_hit.setCharge(psm.peptidoform.precursor_charge)
+                            peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target"))
+                            if psm.score is not None:
+                                peptide_hit.setScore(psm.score)
+                            if psm.rank is not None:
+                                peptide_hit.setRank(psm.rank -1) # 1-based to 0-based
+                            self._add_meta_values_from_dict(peptide_hit, psm.metadata)
+                            self._add_meta_values_from_dict(peptide_hit, psm.provenance_data)
+                            self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features)
+
+                            peptide_hits.append(peptide_hit)
+
+                        peptide_id.setHits(peptide_hits)
+                        peptide_ids.append(peptide_id)
+
+                # Get unique protein accessions
+                protein_list = list(set([accession for proteins in protein_list for accession in proteins]))
+                protein_hits = []
+                for accession in protein_list:
+                    protein_hit = oms.ProteinHit()
+                    protein_hit.setAccession(accession)
+                    protein_hits.append(protein_hit)
+                protein_ids.setHits(protein_hits)
+
+                # Write an idXML file for each collection
+                oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [protein_ids], peptide_ids)
\ No newline at end of file

From 7d07f90a925bba35ad9cff9c2a7bd1d7d9c451b4 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Fri, 13 Oct 2023 11:29:58 +0000
Subject: [PATCH 12/20] assert psm objects rather than asserting each param of
 the object

---
 tests/test_io/test_idxml.py | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index 2d32bca..5b3b5c6 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -53,24 +53,7 @@ def test__parse_psm(self):
 
         reader = IdXMLReader("./tests/test_data/test.idXML")
         psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0])
-        assert psm.peptidoform == test_psm.peptidoform
-        assert psm.spectrum_id == test_psm.spectrum_id
-        assert psm.run == test_psm.run
-        assert psm.collection == test_psm.collection
-        assert psm.spectrum == test_psm.spectrum
-        assert psm.is_decoy == test_psm.is_decoy
-        assert psm.score == test_psm.score
-        assert psm.qvalue == test_psm.qvalue
-        assert psm.pep == test_psm.pep
-        assert psm.precursor_mz == test_psm.precursor_mz
-        assert psm.retention_time == test_psm.retention_time
-        assert psm.ion_mobility == test_psm.ion_mobility
-        assert psm.protein_list == test_psm.protein_list
-        assert psm.rank == test_psm.rank
-        assert psm.source == test_psm.source
-        assert psm.provenance_data == test_psm.provenance_data
-        assert psm.metadata == test_psm.metadata
-        assert psm.rescoring_features == test_psm.rescoring_features
+        assert psm == test_psm
 
 
     def test__get_run():

From d19ed506ac3ea484a8f0523ba88f335abe2eac5c Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Sun, 15 Oct 2023 13:45:21 +0000
Subject: [PATCH 13/20] Add tests for idxml writer

---
 psm_utils/io/idxml.py                         |  54 ++++---
 tests/test_data/{test.idXML => test_in.idXML} |   0
 tests/test_data/test_out.idXML                | 153 ++++++++++++++++++
 tests/test_data/test_out_sage.idXML           |  42 +++++
 tests/test_io/test_idxml.py                   |  43 +++--
 5 files changed, 257 insertions(+), 35 deletions(-)
 rename tests/test_data/{test.idXML => test_in.idXML} (100%)
 create mode 100644 tests/test_data/test_out.idXML
 create mode 100644 tests/test_data/test_out_sage.idXML

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index b99f828..7d3f776 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -5,7 +5,7 @@
 Notes
 -----
 
-* idXML supports multiple peptide hits (identifications) per spectrum. Each peptide hit
+* idXML supports mulwtiple peptide hits (identifications) per spectrum. Each peptide hit
   is parsed as an individual :py:class:`~psm_utils.psm.PSM` object.
 
 """
@@ -97,6 +97,7 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         self.peptide_ids = peptide_ids
         self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
 
+
     def __iter__(self) -> Iterable[PSM]:
         """
         Iterate over file and return PSMs one-by-one.
@@ -105,6 +106,7 @@ def __iter__(self) -> Iterable[PSM]:
             for peptide_hit in peptide_id.getHits():
                 yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit)
 
+
     def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
         """
         Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty.
@@ -121,8 +123,8 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
         else:
             return protein_ids, peptide_ids
 
-
-    def _parse_peptidoform(self, sequence: str, charge: int) -> str:
+    @staticmethod
+    def _parse_peptidoform(sequence:str, charge:int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
 
@@ -175,7 +177,8 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
                                   if self._is_float(peptide_hit.getMetaValue(key))}
         )
 
-    def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
+    @staticmethod
+    def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
         """
         Get run name from idXML using pyopenms
         If idXML contains merge index, use it to annotate the run name without extension
@@ -185,7 +188,8 @@ def _get_run(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pepti
         else:
             return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
 
-    def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
+    @staticmethod
+    def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
         """
         Get list of rescoring features from idXML
         """
@@ -197,7 +201,8 @@ def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
 
         return keys
 
-    def _is_float(self, element: any) -> bool:
+    @staticmethod
+    def _is_float(element: any) -> bool:
         """
         Check if element is float.
         """
@@ -208,8 +213,9 @@ def _is_float(self, element: any) -> bool:
             return True
         except ValueError:
             return False
-        
-    def _is_decoy(self, peptide_hit: oms.PeptideHit) -> bool:
+    
+    @staticmethod
+    def _is_decoy(peptide_hit: oms.PeptideHit) -> bool:
         """
         Check if PSM is target or decoy
         """
@@ -231,7 +237,7 @@ def __init__(
         peptide_ids = None,
         *args,
         **kwargs,
-    ):
+    ) -> None:
         """
         Writer for idXML files.
 
@@ -247,11 +253,7 @@ def __init__(
         -----
         - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
           a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
-          to a file per collection with the :py:meth:`write_file` method is currently supported.
-        - If :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
-          objects are provided, the :py:meth:`write_file` method will update the objects with novel
-          features from rescoring_features dictionary of the :py:class:`~psm_utils.psm.PSM` objects 
-          in the PSMList and write them to a idXML file.
+          to a file at once with the :py:meth:`write_file` method is currently supported. TODO: Adjust
         """
         super().__init__(filename, *args, **kwargs)
         self.protein_ids = protein_ids
@@ -299,7 +301,7 @@ def write_psm(self, psm: PSM):
         Raises
         ------
         NotImplementedError
-            IdXMLWriter currently does not support write_psm.
+            OmsIdXMLWriter currently does not support write_psm.
 
         """
         raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
@@ -312,17 +314,17 @@ def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> No
             for key, value in d.items():
                 peptide_hit.setMetaValue(key, value)
 
-    def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification = None, peptide_ids: oms.PeptideIdentification = None) -> None:
+    def write_file(self, psm_list: PSMList) -> None:
         """
         Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
         and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
         """
         psm_dict = psm_list.get_psm_dict()
 
-        if protein_ids is not None and peptide_ids is not None:
+        if self.protein_ids is not None and self.peptide_ids is not None:
             # Access run name(s) from ProteinIdentification
-            spectrum_files = [Path(run.decode()).stem for run in protein_ids[0].getMetaValue("spectra_data")]
-            for peptide_id in peptide_ids:
+            spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")]
+            for peptide_id in self.peptide_ids:
                 if len(spectrum_files) > 1:
                     run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
                 else:
@@ -345,16 +347,16 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification =
 
                 peptide_id.setHits(updated_peptide_hits)
 
-            oms.IdXMLFile().store(str(self.filename), protein_ids, peptide_ids)
+            oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids)
 
         else:
             for collection, runs in psm_dict.items():
-                protein_ids = oms.ProteinIdentification()
+                self.protein_ids = oms.ProteinIdentification()
+                self.peptide_ids = []
                 # Set msrun filename with spectra_data meta value
                 msrun_reference = [run for run in runs.keys()]
-                protein_ids.setMetaValue("spectra_data", msrun_reference)
+                self.protein_ids.setMetaValue("spectra_data", msrun_reference)
                 protein_list = []
-                peptide_ids = []
                 for run, psm_dict_run in runs.items():
                     for spectrum_id, psms in psm_dict_run.items():                    
                         protein_list.append([accession for psm in psms for accession in psm.protein_list])
@@ -386,7 +388,7 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification =
                             peptide_hits.append(peptide_hit)
 
                         peptide_id.setHits(peptide_hits)
-                        peptide_ids.append(peptide_id)
+                        self.peptide_ids.append(peptide_id)
 
                 # Get unique protein accessions
                 protein_list = list(set([accession for proteins in protein_list for accession in proteins]))
@@ -395,7 +397,7 @@ def write_file(self, psm_list: PSMList, protein_ids: oms.ProteinIdentification =
                     protein_hit = oms.ProteinHit()
                     protein_hit.setAccession(accession)
                     protein_hits.append(protein_hit)
-                protein_ids.setHits(protein_hits)
+                self.protein_ids.setHits(protein_hits)
 
                 # Write an idXML file for each collection
-                oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [protein_ids], peptide_ids)
\ No newline at end of file
+                oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids)
\ No newline at end of file
diff --git a/tests/test_data/test.idXML b/tests/test_data/test_in.idXML
similarity index 100%
rename from tests/test_data/test.idXML
rename to tests/test_data/test_in.idXML
diff --git a/tests/test_data/test_out.idXML b/tests/test_data/test_out.idXML
new file mode 100644
index 0000000..148a2df
--- /dev/null
+++ b/tests/test_data/test_out.idXML
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="https://www.openms.de/xml-stylesheet/IdXML.xsl" ?>
+<IdXML version="1.5" xsi:noNamespaceSchemaLocation="https://www.openms.de/xml-schema/IdXML_1_5.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+	<SearchParameters id="SP_0" db="UP000005640_9606_decoy.fasta" db_version="" taxonomy="" mass_type="monoisotopic" charges="0:0" enzyme="trypsin" missed_cleavages="1" precursor_peak_tolerance="10" precursor_peak_tolerance_ppm="true" peak_mass_tolerance="0.01" peak_mass_tolerance_ppm="false" >
+		<VariableModification name="Oxidation (M)" />
+				<UserParam type="string" name="CometAdapter:1:in" value="HepG2_rep3_small.mzML"/>
+				<UserParam type="string" name="CometAdapter:1:out" value="HepG2_rep3_small.idXML"/>
+				<UserParam type="string" name="CometAdapter:1:database" value="UP000005640_9606_decoy.fasta"/>
+				<UserParam type="string" name="CometAdapter:1:comet_executable" value="comet.exe"/>
+				<UserParam type="string" name="CometAdapter:1:pin_out" value="HepG2_rep3_small_pin.tsv"/>
+				<UserParam type="string" name="CometAdapter:1:default_params_file" value=""/>
+				<UserParam type="float" name="CometAdapter:1:precursor_mass_tolerance" value="10.0"/>
+				<UserParam type="string" name="CometAdapter:1:precursor_error_units" value="ppm"/>
+				<UserParam type="string" name="CometAdapter:1:isotope_error" value="off"/>
+				<UserParam type="float" name="CometAdapter:1:fragment_mass_tolerance" value="0.01"/>
+				<UserParam type="string" name="CometAdapter:1:fragment_error_units" value="Da"/>
+				<UserParam type="float" name="CometAdapter:1:fragment_bin_offset" value="0.0"/>
+				<UserParam type="string" name="CometAdapter:1:instrument" value="high_res"/>
+				<UserParam type="string" name="CometAdapter:1:use_A_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_B_ions" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:use_C_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_X_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_Y_ions" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:use_Z_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:use_NL_ions" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:enzyme" value="Trypsin"/>
+				<UserParam type="string" name="CometAdapter:1:second_enzyme" value=""/>
+				<UserParam type="string" name="CometAdapter:1:num_enzyme_termini" value="fully"/>
+				<UserParam type="int" name="CometAdapter:1:missed_cleavages" value="1"/>
+				<UserParam type="int" name="CometAdapter:1:min_peptide_length" value="5"/>
+				<UserParam type="int" name="CometAdapter:1:max_peptide_length" value="63"/>
+				<UserParam type="int" name="CometAdapter:1:num_hits" value="1"/>
+				<UserParam type="string" name="CometAdapter:1:precursor_charge" value="0:0"/>
+				<UserParam type="string" name="CometAdapter:1:override_charge" value="keep known search unknown"/>
+				<UserParam type="int" name="CometAdapter:1:ms_level" value="2"/>
+				<UserParam type="string" name="CometAdapter:1:activation_method" value="ALL"/>
+				<UserParam type="string" name="CometAdapter:1:digest_mass_range" value="600:5000"/>
+				<UserParam type="int" name="CometAdapter:1:max_fragment_charge" value="3"/>
+				<UserParam type="int" name="CometAdapter:1:max_precursor_charge" value="5"/>
+				<UserParam type="string" name="CometAdapter:1:clip_nterm_methionine" value="false"/>
+				<UserParam type="int" name="CometAdapter:1:spectrum_batch_size" value="20000"/>
+				<UserParam type="floatList" name="CometAdapter:1:mass_offsets" value="[0.0]"/>
+				<UserParam type="int" name="CometAdapter:1:minimum_peaks" value="10"/>
+				<UserParam type="float" name="CometAdapter:1:minimum_intensity" value="0.0"/>
+				<UserParam type="string" name="CometAdapter:1:remove_precursor_peak" value="no"/>
+				<UserParam type="float" name="CometAdapter:1:remove_precursor_tolerance" value="1.5"/>
+				<UserParam type="string" name="CometAdapter:1:clear_mz_range" value="0:0"/>
+				<UserParam type="stringList" name="CometAdapter:1:fixed_modifications" value="[]"/>
+				<UserParam type="stringList" name="CometAdapter:1:variable_modifications" value="[Oxidation (M)]"/>
+				<UserParam type="intList" name="CometAdapter:1:binary_modifications" value="[]"/>
+				<UserParam type="int" name="CometAdapter:1:max_variable_mods_in_peptide" value="5"/>
+				<UserParam type="string" name="CometAdapter:1:require_variable_mod" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:reindex" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:log" value=""/>
+				<UserParam type="int" name="CometAdapter:1:debug" value="0"/>
+				<UserParam type="int" name="CometAdapter:1:threads" value="2"/>
+				<UserParam type="string" name="CometAdapter:1:no_progress" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:force" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:test" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:decoy_string" value=""/>
+				<UserParam type="string" name="CometAdapter:1:decoy_string_position" value="prefix"/>
+				<UserParam type="string" name="CometAdapter:1:missing_decoy_action" value="warn"/>
+				<UserParam type="string" name="CometAdapter:1:write_protein_sequence" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:write_protein_description" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:keep_unreferenced_proteins" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:unmatched_action" value="error"/>
+				<UserParam type="int" name="CometAdapter:1:aaa_max" value="3"/>
+				<UserParam type="int" name="CometAdapter:1:mismatches_max" value="0"/>
+				<UserParam type="string" name="CometAdapter:1:IL_equivalent" value="false"/>
+				<UserParam type="string" name="CometAdapter:1:allow_nterm_protein_cleavage" value="true"/>
+				<UserParam type="string" name="CometAdapter:1:name" value="auto"/>
+				<UserParam type="string" name="CometAdapter:1:specificity" value="auto"/>
+				<UserParam type="string" name="EnzymeTermSpecificity" value="full"/>
+				<UserParam type="string" name="SE:Comet" value="# comet_version 2023.01 rev. 0"/>
+				<UserParam type="string" name="Comet:db" value="UP000005640_9606_decoy.fasta"/>
+				<UserParam type="string" name="Comet:db_version" value=""/>
+				<UserParam type="string" name="Comet:taxonomy" value=""/>
+				<UserParam type="string" name="Comet:charges" value="0:0"/>
+				<UserParam type="string" name="Comet:fixed_modifications" value=""/>
+				<UserParam type="string" name="Comet:variable_modifications" value="Oxidation (M)"/>
+				<UserParam type="int" name="Comet:missed_cleavages" value="1"/>
+				<UserParam type="float" name="Comet:fragment_mass_tolerance" value="0.01"/>
+				<UserParam type="string" name="Comet:fragment_mass_tolerance_unit" value="Da"/>
+				<UserParam type="float" name="Comet:precursor_mass_tolerance" value="10.0"/>
+				<UserParam type="string" name="Comet:precursor_mass_tolerance_unit" value="ppm"/>
+				<UserParam type="string" name="Comet:digestion_enzyme" value="Trypsin"/>
+				<UserParam type="int" name="Comet:enzyme_term_specificity" value="2"/>
+				<UserParam type="string" name="feature_extractor" value="TOPP_PSMFeatureExtractor"/>
+				<UserParam type="string" name="extra_features" value="isotope_error,COMET:deltaCn,COMET:deltaLCn,COMET:lnExpect,MS:1002252,MS:1002255,COMET:lnNumSP,COMET:lnRankSP,COMET:IonFrac"/>
+				<UserParam name="EnzymeTermSpecificity" type="string" value="full" />
+	</SearchParameters>
+	<IdentificationRun date="2023-10-02T14:26:36" search_engine="Comet" search_engine_version="" search_parameters_ref="SP_0" >
+		<ProteinIdentification score_type="" higher_score_better="true" significance_threshold="0" >
+			<ProteinHit id="PH_0" accession="DECOY_sp|Q5TH74|STPG1_HUMAN" score="0.0" sequence="" >
+				<UserParam type="string" name="target_decoy" value="decoy"/>
+			</ProteinHit>
+			<ProteinHit id="PH_1" accession="sp|Q8WTT2|NOC3L_HUMAN" score="0.0" sequence="" >
+				<UserParam type="string" name="target_decoy" value="target"/>
+			</ProteinHit>
+			<UserParam type="stringList" name="spectra_data" value="[HepG2_rep3_small.mzML,HepG2_rep1_small.mzML,HepG2_rep2_small.mzML]"/>
+		</ProteinIdentification>
+		<PeptideIdentification score_type="expect" higher_score_better="false" significance_threshold="0.0" MZ="442.295958531899942" RT="849.0" spectrum_reference="controllerType=0 controllerNumber=1 scan=294" >
+			<PeptideHit score="1.009999990463257" sequence="LVPIWKK" charge="2" aa_before="[" aa_after="D" start="0" end="6" protein_refs="PH_0" >
+				<UserParam type="string" name="target_decoy" value="decoy"/>
+				<UserParam type="string" name="MS:1002258" value="3"/>
+				<UserParam type="string" name="MS:1002259" value="12"/>
+				<UserParam type="string" name="num_matched_peptides" value="35"/>
+				<UserParam type="int" name="isotope_error" value="0"/>
+				<UserParam type="float" name="MS:1002252" value="0.693"/>
+				<UserParam type="float" name="COMET:xcorr" value="0.693"/>
+				<UserParam type="float" name="MS:1002253" value="1.0"/>
+				<UserParam type="float" name="COMET:deltaCn" value="1.0"/>
+				<UserParam type="float" name="MS:1002255" value="35.899999999999999"/>
+				<UserParam type="float" name="COMET:spscore" value="35.899999999999999"/>
+				<UserParam type="float" name="MS:1002256" value="1.0"/>
+				<UserParam type="float" name="COMET:sprank" value="1.0"/>
+				<UserParam type="float" name="MS:1002257" value="1.01"/>
+				<UserParam type="string" name="protein_references" value="unique"/>
+				<UserParam type="float" name="COMET:deltaLCn" value="0.0"/>
+				<UserParam type="float" name="COMET:lnExpect" value="9.950330853168092e-03"/>
+				<UserParam type="float" name="COMET:lnNumSP" value="3.555348061489414"/>
+				<UserParam type="float" name="COMET:lnRankSP" value="0.0"/>
+				<UserParam type="float" name="COMET:IonFrac" value="0.25"/>
+			</PeptideHit>
+			<UserParam type="int" name="id_merge_index" value="0"/>
+		</PeptideIdentification>
+		<PeptideIdentification score_type="expect" higher_score_better="false" significance_threshold="0.0" MZ="554.316466531900005" RT="857.0" spectrum_reference="controllerType=0 controllerNumber=1 scan=313" >
+			<PeptideHit score="999.0" sequence="QFKQQSTLK" charge="2" aa_before="K" aa_after="K" start="31" end="39" protein_refs="PH_1" >
+				<UserParam type="string" name="target_decoy" value="target"/>
+				<UserParam type="string" name="MS:1002258" value="1"/>
+				<UserParam type="string" name="MS:1002259" value="16"/>
+				<UserParam type="string" name="num_matched_peptides" value="216"/>
+				<UserParam type="int" name="isotope_error" value="0"/>
+				<UserParam type="float" name="MS:1002252" value="0.172"/>
+				<UserParam type="float" name="COMET:xcorr" value="0.172"/>
+				<UserParam type="float" name="MS:1002253" value="1.0"/>
+				<UserParam type="float" name="COMET:deltaCn" value="1.0"/>
+				<UserParam type="float" name="MS:1002255" value="4.4"/>
+				<UserParam type="float" name="COMET:spscore" value="4.4"/>
+				<UserParam type="float" name="MS:1002256" value="1.0"/>
+				<UserParam type="float" name="COMET:sprank" value="1.0"/>
+				<UserParam type="float" name="MS:1002257" value="999.0"/>
+				<UserParam type="string" name="protein_references" value="unique"/>
+				<UserParam type="float" name="COMET:deltaLCn" value="0.0"/>
+				<UserParam type="float" name="COMET:lnExpect" value="6.906754778648554"/>
+				<UserParam type="float" name="COMET:lnNumSP" value="5.375278407684165"/>
+				<UserParam type="float" name="COMET:lnRankSP" value="0.0"/>
+				<UserParam type="float" name="COMET:IonFrac" value="0.0625"/>
+			</PeptideHit>
+			<UserParam type="int" name="id_merge_index" value="0"/>
+		</PeptideIdentification>
+	</IdentificationRun>
+</IdXML>
diff --git a/tests/test_data/test_out_sage.idXML b/tests/test_data/test_out_sage.idXML
new file mode 100644
index 0000000..2d307cb
--- /dev/null
+++ b/tests/test_data/test_out_sage.idXML
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="https://www.openms.de/xml-stylesheet/IdXML.xsl" ?>
+<IdXML version="1.5" xsi:noNamespaceSchemaLocation="https://www.openms.de/xml-schema/IdXML_1_5.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+	<SearchParameters id="SP_0" db="" db_version="" taxonomy="" mass_type="monoisotopic" charges="" enzyme="unknown_enzyme" missed_cleavages="0" precursor_peak_tolerance="0" precursor_peak_tolerance_ppm="false" peak_mass_tolerance="0" peak_mass_tolerance_ppm="false" >
+	</SearchParameters>
+	<IdentificationRun date="0000-00-00T00:00:00" search_engine="" search_engine_version="" search_parameters_ref="SP_0" >
+		<ProteinIdentification score_type="" higher_score_better="true" significance_threshold="0" >
+			<ProteinHit id="PH_0" accession="sp|Q99536|VAT1_HUMAN" score="0.0" sequence="" >
+			</ProteinHit>
+			<UserParam type="stringList" name="spectra_data" value="[LQSRPAAPPAPGPGQLTLR]"/>
+		</ProteinIdentification>
+		<PeptideIdentification score_type="" higher_score_better="true" significance_threshold="0.0" MZ="643.03499169873669" RT="108.285399999999996" spectrum_reference="controllerType=0 controllerNumber=1 scan=30069" >
+			<PeptideHit score="1.2944585" sequence="LQSRPAAPPAPGPGQLTLR" charge="3" >
+				<UserParam type="string" name="target_decoy" value="target"/>
+				<UserParam type="float" name="isotope_error" value="0.0"/>
+				<UserParam type="string" name="sage_filename" value="./tests/test_data/results.sage.tsv"/>
+				<UserParam type="float" name="expmass" value="1926.081500000000005"/>
+				<UserParam type="float" name="calcmass" value="1926.079999999999927"/>
+				<UserParam type="float" name="peptide_len" value="19.0"/>
+				<UserParam type="float" name="missed_cleavages" value="0.0"/>
+				<UserParam type="float" name="precursor_ppm" value="0.8239083"/>
+				<UserParam type="float" name="fragment_ppm" value="0.5347518"/>
+				<UserParam type="float" name="hyperscore" value="71.788444602553838"/>
+				<UserParam type="float" name="delta_next" value="71.788444602553838"/>
+				<UserParam type="float" name="delta_best" value="0.0"/>
+				<UserParam type="float" name="delta_rt_model" value="0.0"/>
+				<UserParam type="float" name="aligned_rt" value="0.0"/>
+				<UserParam type="float" name="predicted_rt" value="0.0"/>
+				<UserParam type="float" name="matched_peaks" value="22.0"/>
+				<UserParam type="float" name="longest_b" value="9.0"/>
+				<UserParam type="float" name="longest_y" value="12.0"/>
+				<UserParam type="float" name="longest_y_pct" value="0.6315789"/>
+				<UserParam type="float" name="matched_intensity_pct" value="50.784999999999997"/>
+				<UserParam type="float" name="scored_candidates" value="1.0"/>
+				<UserParam type="float" name="poisson" value="-1.956281191108343"/>
+				<UserParam type="float" name="ms1_intensity" value="3.0614618e08"/>
+				<UserParam type="float" name="ms2_intensity" value="5.6930696e07"/>
+			</PeptideHit>
+			<UserParam type="int" name="id_merge_index" value="0"/>
+		</PeptideIdentification>
+	</IdentificationRun>
+</IdXML>
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index 5b3b5c6..f89c3fb 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -1,7 +1,11 @@
 """Tests for psm_utils.io.idxml."""
 
-from psm_utils.io.idxml import IdXMLReader
+import hashlib
+
+from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
+from psm_utils.io.sage import SageReader
 from psm_utils.psm import PSM
+from psm_utils.peptidoform import Peptidoform
 
 class TestIdXMLReader:
 
@@ -16,7 +20,7 @@ def test__parse_peptidoform(self):
             ((".DFPIAM[147]GER.", 2), "DFPIAM[147]GER/2"),
             ((".(Dimethyl)DFPIAMGER.", 2), "[Dimethyl]-DFPIAMGER/2"),
             ((".DFPIAMGER.(Label:18O(2))", 2), "DFPIAMGER-[Label:18O(2)]/2"),
-            ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2"),
+            ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2")
         ]
 
         for test_in, expected_out in test_cases:
@@ -51,18 +55,39 @@ def test__parse_psm(self):
                                 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25}
         )
 
-        reader = IdXMLReader("./tests/test_data/test.idXML")
-        psm = IdXMLReader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0])
+        reader = IdXMLReader("./tests/test_data/test_in.idXML")
+        psm = reader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0])
         assert psm == test_psm
 
 
-    def test__get_run():
-        reader = IdXMLReader("./tests/test_data/test.idXML")
+    def test__get_run(self):
         expected_output = "HepG2_rep3_small"
+        reader = IdXMLReader("./tests/test_data/test_in.idXML")
         assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
-    
 
-    def _get_rescoring_features():
-        reader = IdXMLReader("./tests/test_data/test.idXML")
+
+    def test__get_rescoring_features(self):
         expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac']
+        reader = IdXMLReader("./tests/test_data/test_in.idXML")
         assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output
+
+
+class TestIdXMLWriter:
+    
+    def test_write_file_with_pyopenms_objects(self):
+        expected_sha = "8d8cb6d8194c5c296f0f5ee8be83d2072be125547b2d51b88100859b001f47fa"
+        reader = IdXMLReader("./tests/test_data/test_in.idXML")
+        psm_list = reader.read_file()
+        writer = IdXMLWriter("./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids)
+        writer.write_file(psm_list)
+        sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", 'rb').read()).hexdigest()
+        assert sha == expected_sha
+
+    def test_write_file_without_pyopenms_objects(self):
+        expected_sha = "a24e157579a4e9da71900931c6b5578a264ea7830135ede480b41e6550944e5e"
+        reader = SageReader("./tests/test_data/results.sage.tsv")
+        psm_list = reader.read_file()
+        writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
+        writer.write_file(psm_list)
+        sha = hashlib.sha256(open("./tests/test_data/test_out_sage.idXML", 'rb').read()).hexdigest()
+        assert sha == expected_sha

From a2bef5dccf10962a259a3fb9449ea5ca736bc44f Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Sun, 15 Oct 2023 14:13:24 +0000
Subject: [PATCH 14/20] logical structur of functions

---
 psm_utils/io/idxml.py | 243 ++++++++++++++++++++++++------------------
 1 file changed, 138 insertions(+), 105 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 7d3f776..641645a 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -5,7 +5,7 @@
 Notes
 -----
 
-* idXML supports mulwtiple peptide hits (identifications) per spectrum. Each peptide hit
+* idXML supports multiple peptide hits (identifications) per spectrum. Each peptide hit
   is parsed as an individual :py:class:`~psm_utils.psm.PSM` object.
 
 """
@@ -13,7 +13,7 @@
 
 from __future__ import annotations
 
-from abc import abstractmethod
+import logging
 from typing import Union, Iterable, Tuple, List
 import re
 from pathlib import Path
@@ -25,6 +25,8 @@
 from psm_utils.psm_list import PSMList
 from psm_utils.exceptions import PSMUtilsException
 
+logger = logging.getLogger(__name__)
+
 # Patterns to match open and closed round/square brackets
 MOD_PATTERN = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)")
 MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]")
@@ -128,6 +130,7 @@ def _parse_peptidoform(sequence:str, charge:int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
 
+
         Notes
         -----
         Implemented according to the documentation on
@@ -177,6 +180,7 @@ def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.Pep
                                   if self._is_float(peptide_hit.getMetaValue(key))}
         )
 
+
     @staticmethod
     def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
         """
@@ -188,6 +192,7 @@ def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIden
         else:
             return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
 
+
     @staticmethod
     def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
         """
@@ -201,6 +206,7 @@ def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
 
         return keys
 
+
     @staticmethod
     def _is_float(element: any) -> bool:
         """
@@ -213,7 +219,8 @@ def _is_float(element: any) -> bool:
             return True
         except ValueError:
             return False
-    
+
+
     @staticmethod
     def _is_decoy(peptide_hit: oms.PeptideHit) -> bool:
         """
@@ -253,19 +260,87 @@ def __init__(
         -----
         - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
           a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
-          to a file at once with the :py:meth:`write_file` method is currently supported. TODO: Adjust
+          to a file at once with the :py:meth:`write_file` method is currently supported.
         """
         super().__init__(filename, *args, **kwargs)
         self.protein_ids = protein_ids
         self.peptide_ids = peptide_ids
         self._writer = None
 
+
     def __enter__(self) -> IdXMLWriter:
         return self
 
+
     def __exit__(self, *args, **kwargs) -> None:
         pass
 
+
+    def write_psm(self, psm: PSM):
+        """
+        Write a single PSM to the PSM file.
+
+        This method is currently not supported (see Notes).
+
+        Raises
+        ------
+        NotImplementedError
+            OmsIdXMLWriter currently does not support write_psm.
+
+        """
+        raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
+
+
+    def write_file(self, psm_list: PSMList) -> None:
+        """
+        Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
+        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        """
+        psm_dict = psm_list.get_psm_dict()
+
+        if self.protein_ids is not None and self.peptide_ids is not None:
+            self._update_existing_ids(psm_dict)
+        # Check if one of self.protein_ids or self.peptide_ids is None
+        elif self.protein_ids is not None or self.peptide_ids is not None:
+            logger.warning("One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList.")
+            self._create_new_ids(psm_dict)
+        else:
+            self._create_new_ids(psm_dict)
+
+
+    def _update_existing_ids(self, psm_dict: dict) -> None:
+        """
+        Update existing :py:class:`~pyopenms.ProteinIdentification`
+        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        """
+        # Access run name(s) from ProteinIdentification
+        spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")]
+        for peptide_id in self.peptide_ids:
+            if len(spectrum_files) > 1:
+                run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
+            else:
+                run = spectrum_files[0]
+            # Get PSM objects associated from runs since we are writing a merged idXML
+            # NOTE: Collections with multiple protein_ids and peptide_ids is not supported
+            try:
+                psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")]
+            except KeyError:
+                print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.")
+            # Dict of UNIMOD peptide sequence and PSM object
+            hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms}
+            # Update PeptideHits according to the PSM objects
+            updated_peptide_hits = []
+            for peptide_hit in peptide_id.getHits():
+                sequence = peptide_hit.getSequence().toString()
+                psm = hit_dict[sequence]
+                self._update_peptide_hit(peptide_hit, psm)
+                updated_peptide_hits.append(peptide_hit)
+
+            peptide_id.setHits(updated_peptide_hits)
+
+        oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids)
+
+
     def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
         """
         Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information
@@ -277,6 +352,65 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
             if feature not in RESCORING_FEATURE_LIST:
                 peptide_hit.setMetaValue(feature, value)
 
+
+    def _create_new_ids(self, psm_dict: dict) -> None:
+        """
+        Create new :py:class:`~pyopenms.ProteinIdentification`
+        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        """
+        for collection, runs in psm_dict.items():
+            self.protein_ids = oms.ProteinIdentification()
+            self.peptide_ids = []
+            # Set msrun filename with spectra_data meta value
+            msrun_reference = [run for run in runs.keys()]
+            self.protein_ids.setMetaValue("spectra_data", msrun_reference)
+            protein_list = []
+            for run, psm_dict_run in runs.items():
+                for spectrum_id, psms in psm_dict_run.items():                    
+                    protein_list.append([accession for psm in psms for accession in psm.protein_list])
+
+                    # Fill PeptideIdentification object with PeptideHits
+                    peptide_id = oms.PeptideIdentification()
+                    peptide_id.setMetaValue("spectrum_reference", spectrum_id)
+                    peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run))
+                    if psms[0].precursor_mz is not None:
+                        peptide_id.setMZ(psms[0].precursor_mz)
+                    if psms[0].retention_time is not None:
+                        peptide_id.setRT(psms[0].retention_time)
+
+                    # Fill PeptideHits object
+                    peptide_hits = []
+                    for psm in psms:
+                        peptide_hit = oms.PeptideHit()
+                        peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence)))
+                        peptide_hit.setCharge(psm.peptidoform.precursor_charge)
+                        peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target"))
+                        if psm.score is not None:
+                            peptide_hit.setScore(psm.score)
+                        if psm.rank is not None:
+                            peptide_hit.setRank(psm.rank -1) # 1-based to 0-based
+                        self._add_meta_values_from_dict(peptide_hit, psm.metadata)
+                        self._add_meta_values_from_dict(peptide_hit, psm.provenance_data)
+                        self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features)
+
+                        peptide_hits.append(peptide_hit)
+
+                    peptide_id.setHits(peptide_hits)
+                    self.peptide_ids.append(peptide_id)
+
+            # Get unique protein accessions
+            protein_list = list(set([accession for proteins in protein_list for accession in proteins]))
+            protein_hits = []
+            for accession in protein_list:
+                protein_hit = oms.ProteinHit()
+                protein_hit.setAccession(accession)
+                protein_hits.append(protein_hit)
+            self.protein_ids.setHits(protein_hits)
+
+            # Write an idXML file for each collection
+            oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids)
+
+
     def _convert_proforma_to_unimod(self, sequence: str) -> str:
         """
         Convert a peptidoform sequence in proforma notation to UNIMOD notation.
@@ -292,20 +426,7 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str:
 
         return sequence
 
-    def write_psm(self, psm: PSM):
-        """
-        Write a single PSM to the PSM file.
-
-        This method is currently not supported (see Notes).
 
-        Raises
-        ------
-        NotImplementedError
-            OmsIdXMLWriter currently does not support write_psm.
-
-        """
-        raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
-    
     def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None:
         """
         Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.
@@ -313,91 +434,3 @@ def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> No
         if d is not None:
             for key, value in d.items():
                 peptide_hit.setMetaValue(key, value)
-
-    def write_file(self, psm_list: PSMList) -> None:
-        """
-        Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
-        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
-        """
-        psm_dict = psm_list.get_psm_dict()
-
-        if self.protein_ids is not None and self.peptide_ids is not None:
-            # Access run name(s) from ProteinIdentification
-            spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")]
-            for peptide_id in self.peptide_ids:
-                if len(spectrum_files) > 1:
-                    run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
-                else:
-                    run = spectrum_files[0]
-                # Get PSM objects associated from runs since we are writing a merged idXML
-                # NOTE: Collections with multiple protein_ids and peptide_ids is not supported
-                try:
-                    psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")]
-                except KeyError:
-                    print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.")
-                # Dict of UNIMOD peptide sequence and PSM object
-                hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms}
-                # Update PeptideHits according to the PSM objects
-                updated_peptide_hits = []
-                for peptide_hit in peptide_id.getHits():
-                    sequence = peptide_hit.getSequence().toString()
-                    psm = hit_dict[sequence]
-                    self._update_peptide_hit(peptide_hit, psm)
-                    updated_peptide_hits.append(peptide_hit)
-
-                peptide_id.setHits(updated_peptide_hits)
-
-            oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids)
-
-        else:
-            for collection, runs in psm_dict.items():
-                self.protein_ids = oms.ProteinIdentification()
-                self.peptide_ids = []
-                # Set msrun filename with spectra_data meta value
-                msrun_reference = [run for run in runs.keys()]
-                self.protein_ids.setMetaValue("spectra_data", msrun_reference)
-                protein_list = []
-                for run, psm_dict_run in runs.items():
-                    for spectrum_id, psms in psm_dict_run.items():                    
-                        protein_list.append([accession for psm in psms for accession in psm.protein_list])
-
-                        # Fill PeptideIdentification object with PeptideHits
-                        peptide_id = oms.PeptideIdentification()
-                        peptide_id.setMetaValue("spectrum_reference", spectrum_id)
-                        peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run))
-                        if psms[0].precursor_mz is not None:
-                            peptide_id.setMZ(psms[0].precursor_mz)
-                        if psms[0].retention_time is not None:
-                            peptide_id.setRT(psms[0].retention_time)
-
-                        # Fill PeptideHits object
-                        peptide_hits = []
-                        for psm in psms:
-                            peptide_hit = oms.PeptideHit()
-                            peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence)))
-                            peptide_hit.setCharge(psm.peptidoform.precursor_charge)
-                            peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target"))
-                            if psm.score is not None:
-                                peptide_hit.setScore(psm.score)
-                            if psm.rank is not None:
-                                peptide_hit.setRank(psm.rank -1) # 1-based to 0-based
-                            self._add_meta_values_from_dict(peptide_hit, psm.metadata)
-                            self._add_meta_values_from_dict(peptide_hit, psm.provenance_data)
-                            self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features)
-
-                            peptide_hits.append(peptide_hit)
-
-                        peptide_id.setHits(peptide_hits)
-                        self.peptide_ids.append(peptide_id)
-
-                # Get unique protein accessions
-                protein_list = list(set([accession for proteins in protein_list for accession in proteins]))
-                protein_hits = []
-                for accession in protein_list:
-                    protein_hit = oms.ProteinHit()
-                    protein_hit.setAccession(accession)
-                    protein_hits.append(protein_hit)
-                self.protein_ids.setHits(protein_hits)
-
-                # Write an idXML file for each collection
-                oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids)
\ No newline at end of file

From 83d23701d97d0b4c44be06b1e5f589585136616d Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Sun, 15 Oct 2023 14:22:44 +0000
Subject: [PATCH 15/20] black

---
 docs/source/conf.py                   |   3 +-
 online/pages/2_PSM_file_conversion.py |   7 +-
 psm_utils/io/_pd_msf_tables.py        |   1 +
 psm_utils/io/idxml.py                 | 146 ++++++++++++++++----------
 psm_utils/io/msamanda.py              |   2 +-
 tests/test_io/test_idxml.py           |  69 +++++++-----
 tests/test_io/test_maxquant.py        |   5 +-
 tests/test_io/test_msamanda.py        |   4 +-
 tests/test_io/test_mzid.py            |  10 +-
 tests/test_io/test_percolator.py      |   5 +-
 tests/test_io/test_xtandem.py         |   1 -
 tests/test_peptidoform.py             |   2 -
 tests/test_psm_list.py                |  32 +++++-
 13 files changed, 174 insertions(+), 113 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 36b1602..29762dd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -46,9 +46,10 @@
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "pyteomics": ("https://pyteomics.readthedocs.io/en/latest/", None),
-    "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None)
+    "pyopenms": ("https://pyopenms.readthedocs.io/en/latest/", None),
 }
 
+
 def setup(app):
     config = {
         # "auto_toc_tree_section": "Contents",
diff --git a/online/pages/2_PSM_file_conversion.py b/online/pages/2_PSM_file_conversion.py
index 2873782..2dddcef 100644
--- a/online/pages/2_PSM_file_conversion.py
+++ b/online/pages/2_PSM_file_conversion.py
@@ -68,9 +68,10 @@ def _convert(self):
                 st.success("PSM file successfully converted!")
 
                 # Construct output filename with new extension
-                output_filename = Path(
-                    st.session_state["convert_input_file"].name
-                ).stem + FILETYPES[st.session_state["convert_output_filetype"]]["extension"]
+                output_filename = (
+                    Path(st.session_state["convert_input_file"].name).stem
+                    + FILETYPES[st.session_state["convert_output_filetype"]]["extension"]
+                )
 
                 # Open converted file in memory for download button
                 with open("output_filename", "rb") as file:
diff --git a/psm_utils/io/_pd_msf_tables.py b/psm_utils/io/_pd_msf_tables.py
index 7601e38..a2323bd 100644
--- a/psm_utils/io/_pd_msf_tables.py
+++ b/psm_utils/io/_pd_msf_tables.py
@@ -17,6 +17,7 @@
     UniqueConstraint,
     text,
 )
+
 try:
     from sqlalchemy.orm import declarative_base
 except ImportError:
diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 641645a..818971b 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -35,8 +35,7 @@
 # Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator
 # https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp
 RESCORING_FEATURE_LIST = [
-    "isotope_error"
-    "MS:1002049",  # MSGFPlus unchanged RawScore
+    "isotope_error" "MS:1002049",  # MSGFPlus unchanged RawScore
     "MS:1002050",  # MSGFPlus unchanged DeNovoScore
     "MSGF:ScoreRatio",
     "MSGF:Energy",
@@ -65,17 +64,17 @@
     "MASCOT:delta_score",  # delta score based on mScore
     "CONCAT:lnEvalue",
     "CONCAT:deltaLnEvalue",
-    "SAGE:ln(-poisson)"
-    "SAGE:ln(delta_best)",
+    "SAGE:ln(-poisson)" "SAGE:ln(delta_best)",
     "SAGE:ln(delta_next)",
     "SAGE:ln(matched_intensity_pct)",
     "SAGE:longest_b",
     "SAGE:longest_y",
     "SAGE:longest_y_pct",
     "SAGE:matched_peaks",
-    "SAGE:scored_candidates"
+    "SAGE:scored_candidates",
 ]
 
+
 class IdXMLReader(ReaderBase):
     def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         """
@@ -99,7 +98,6 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         self.peptide_ids = peptide_ids
         self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
 
-
     def __iter__(self) -> Iterable[PSM]:
         """
         Iterate over file and return PSMs one-by-one.
@@ -108,7 +106,6 @@ def __iter__(self) -> Iterable[PSM]:
             for peptide_hit in peptide_id.getHits():
                 yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit)
 
-
     def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
         """
         Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty.
@@ -117,16 +114,22 @@ def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentifica
         oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
 
         if len(protein_ids) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no proteins. Nothing to parse.")
+            raise IdXMLReaderEmptyListException(
+                f"File {self.filename} contains no proteins. Nothing to parse."
+            )
         elif len(peptide_ids) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideIdentifications. Nothing to parse.")
+            raise IdXMLReaderEmptyListException(
+                f"File {self.filename} contains no PeptideIdentifications. Nothing to parse."
+            )
         elif len(peptide_ids[0].getHits()) == 0:
-            raise IdXMLReaderEmptyListException(f"File {self.filename} contains no PeptideHits. Nothing to parse.")
+            raise IdXMLReaderEmptyListException(
+                f"File {self.filename} contains no PeptideHits. Nothing to parse."
+            )
         else:
             return protein_ids, peptide_ids
 
     @staticmethod
-    def _parse_peptidoform(sequence:str, charge:int) -> str:
+    def _parse_peptidoform(sequence: str, charge: int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
 
@@ -148,51 +151,66 @@ def _parse_peptidoform(sequence:str, charge:int) -> str:
 
         return sequence
 
-
-    def _parse_psm(self, protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification, peptide_hit: oms.PeptideHit) -> PSM:
+    def _parse_psm(
+        self,
+        protein_ids: oms.ProteinIdentification,
+        peptide_id: oms.PeptideIdentification,
+        peptide_hit: oms.PeptideHit,
+    ) -> PSM:
         """
         Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`.
         Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
         to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object.
         """
-        peptidoform = self._parse_peptidoform(peptide_hit.getSequence().toString(), peptide_hit.getCharge())
+        peptidoform = self._parse_peptidoform(
+            peptide_hit.getSequence().toString(), peptide_hit.getCharge()
+        )
         return PSM(
-            peptidoform = peptidoform,
-            spectrum_id = peptide_id.getMetaValue("spectrum_reference"),
-            run = self._get_run(protein_ids, peptide_id),
-            is_decoy = self._is_decoy(peptide_hit),
-            score = peptide_hit.getScore(),
-            precursor_mz = peptide_id.getMZ(),
-            retention_time = peptide_id.getRT(),
+            peptidoform=peptidoform,
+            spectrum_id=peptide_id.getMetaValue("spectrum_reference"),
+            run=self._get_run(protein_ids, peptide_id),
+            is_decoy=self._is_decoy(peptide_hit),
+            score=peptide_hit.getScore(),
+            precursor_mz=peptide_id.getMZ(),
+            retention_time=peptide_id.getRT(),
             # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file
-            protein_list=[accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()],
-            rank = peptide_hit.getRank() + 1, # 0-based to 1-based
+            protein_list=[
+                accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()
+            ],
+            rank=peptide_hit.getRank() + 1,  # 0-based to 1-based
             source="idXML",
             # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer
-            provenance_data = {str(peptidoform): peptide_hit.getSequence().toString()},
+            provenance_data={str(peptidoform): peptide_hit.getSequence().toString()},
             # This is needed to calculate a qvalue before rescoring the PSMList
-            metadata = {
+            metadata={
                 "idxml:score_type": str(peptide_id.getScoreType()),
                 "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
-                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())
+                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()),
+            },
+            rescoring_features={
+                key: float(peptide_hit.getMetaValue(key))
+                for key in self.rescoring_features
+                if self._is_float(peptide_hit.getMetaValue(key))
             },
-            rescoring_features = {key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features \
-                                  if self._is_float(peptide_hit.getMetaValue(key))}
         )
 
-
     @staticmethod
-    def _get_run(protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification) -> str:
+    def _get_run(
+        protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification
+    ) -> str:
         """
         Get run name from idXML using pyopenms
         If idXML contains merge index, use it to annotate the run name without extension
         """
         if peptide_id.metaValueExists("id_merge_index"):
-            return Path(protein_ids[0].getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")].decode()).stem
+            return Path(
+                protein_ids[0]
+                .getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")]
+                .decode()
+            ).stem
         else:
             return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
 
-
     @staticmethod
     def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
         """
@@ -202,17 +220,16 @@ def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
         keys = []
         peptide_hit.getKeys(keys)
         # Convert from byte to str and remove the keys, which values are not in the fixed list of features
-        keys = [key.decode() for key in keys if  key.decode() in RESCORING_FEATURE_LIST]
+        keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST]
 
         return keys
 
-
     @staticmethod
     def _is_float(element: any) -> bool:
         """
         Check if element is float.
         """
-        if element is None: 
+        if element is None:
             return False
         try:
             float(element)
@@ -220,7 +237,6 @@ def _is_float(element: any) -> bool:
         except ValueError:
             return False
 
-
     @staticmethod
     def _is_decoy(peptide_hit: oms.PeptideHit) -> bool:
         """
@@ -240,8 +256,8 @@ class IdXMLWriter(WriterBase):
     def __init__(
         self,
         filename: str | Path,
-        protein_ids = None,
-        peptide_ids = None,
+        protein_ids=None,
+        peptide_ids=None,
         *args,
         **kwargs,
     ) -> None:
@@ -267,15 +283,12 @@ def __init__(
         self.peptide_ids = peptide_ids
         self._writer = None
 
-
     def __enter__(self) -> IdXMLWriter:
         return self
 
-
     def __exit__(self, *args, **kwargs) -> None:
         pass
 
-
     def write_psm(self, psm: PSM):
         """
         Write a single PSM to the PSM file.
@@ -290,7 +303,6 @@ def write_psm(self, psm: PSM):
         """
         raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
 
-
     def write_file(self, psm_list: PSMList) -> None:
         """
         Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
@@ -302,19 +314,22 @@ def write_file(self, psm_list: PSMList) -> None:
             self._update_existing_ids(psm_dict)
         # Check if one of self.protein_ids or self.peptide_ids is None
         elif self.protein_ids is not None or self.peptide_ids is not None:
-            logger.warning("One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList.")
+            logger.warning(
+                "One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList."
+            )
             self._create_new_ids(psm_dict)
         else:
             self._create_new_ids(psm_dict)
 
-
     def _update_existing_ids(self, psm_dict: dict) -> None:
         """
         Update existing :py:class:`~pyopenms.ProteinIdentification`
         and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
         """
         # Access run name(s) from ProteinIdentification
-        spectrum_files = [Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")]
+        spectrum_files = [
+            Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data")
+        ]
         for peptide_id in self.peptide_ids:
             if len(spectrum_files) > 1:
                 run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
@@ -325,7 +340,9 @@ def _update_existing_ids(self, psm_dict: dict) -> None:
             try:
                 psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")]
             except KeyError:
-                print("Multiple collections are not supported when parsing single pyopenms protein and peptide objects.")
+                print(
+                    "Multiple collections are not supported when parsing single pyopenms protein and peptide objects."
+                )
             # Dict of UNIMOD peptide sequence and PSM object
             hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms}
             # Update PeptideHits according to the PSM objects
@@ -340,19 +357,17 @@ def _update_existing_ids(self, psm_dict: dict) -> None:
 
         oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids)
 
-
     def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
         """
         Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information
         from :py:class:`~psm_utils.psm.PSM`.
         """
         peptide_hit.setScore(psm.score)
-        peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based
+        peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
         for feature, value in psm.rescoring_features.items():
             if feature not in RESCORING_FEATURE_LIST:
                 peptide_hit.setMetaValue(feature, value)
 
-
     def _create_new_ids(self, psm_dict: dict) -> None:
         """
         Create new :py:class:`~pyopenms.ProteinIdentification`
@@ -366,8 +381,10 @@ def _create_new_ids(self, psm_dict: dict) -> None:
             self.protein_ids.setMetaValue("spectra_data", msrun_reference)
             protein_list = []
             for run, psm_dict_run in runs.items():
-                for spectrum_id, psms in psm_dict_run.items():                    
-                    protein_list.append([accession for psm in psms for accession in psm.protein_list])
+                for spectrum_id, psms in psm_dict_run.items():
+                    protein_list.append(
+                        [accession for psm in psms for accession in psm.protein_list]
+                    )
 
                     # Fill PeptideIdentification object with PeptideHits
                     peptide_id = oms.PeptideIdentification()
@@ -382,13 +399,22 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     peptide_hits = []
                     for psm in psms:
                         peptide_hit = oms.PeptideHit()
-                        peptide_hit.setSequence(oms.AASequence.fromString(self._convert_proforma_to_unimod(psm.peptidoform.sequence)))
+                        peptide_hit.setSequence(
+                            oms.AASequence.fromString(
+                                self._convert_proforma_to_unimod(psm.peptidoform.sequence)
+                            )
+                        )
                         peptide_hit.setCharge(psm.peptidoform.precursor_charge)
-                        peptide_hit.setMetaValue("target_decoy", "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target"))
+                        peptide_hit.setMetaValue(
+                            "target_decoy",
+                            ""
+                            if psm.is_decoy is None
+                            else ("decoy" if psm.is_decoy else "target"),
+                        )
                         if psm.score is not None:
                             peptide_hit.setScore(psm.score)
                         if psm.rank is not None:
-                            peptide_hit.setRank(psm.rank -1) # 1-based to 0-based
+                            peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
                         self._add_meta_values_from_dict(peptide_hit, psm.metadata)
                         self._add_meta_values_from_dict(peptide_hit, psm.provenance_data)
                         self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features)
@@ -399,7 +425,9 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     self.peptide_ids.append(peptide_id)
 
             # Get unique protein accessions
-            protein_list = list(set([accession for proteins in protein_list for accession in proteins]))
+            protein_list = list(
+                set([accession for proteins in protein_list for accession in proteins])
+            )
             protein_hits = []
             for accession in protein_list:
                 protein_hit = oms.ProteinHit()
@@ -408,8 +436,11 @@ def _create_new_ids(self, psm_dict: dict) -> None:
             self.protein_ids.setHits(protein_hits)
 
             # Write an idXML file for each collection
-            oms.IdXMLFile().store("/".join(filter(None, [collection, str(self.filename)])), [self.protein_ids], self.peptide_ids)
-
+            oms.IdXMLFile().store(
+                "/".join(filter(None, [collection, str(self.filename)])),
+                [self.protein_ids],
+                self.peptide_ids,
+            )
 
     def _convert_proforma_to_unimod(self, sequence: str) -> str:
         """
@@ -426,7 +457,6 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str:
 
         return sequence
 
-
     def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None:
         """
         Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.
diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py
index 3040a74..bbbeab0 100644
--- a/psm_utils/io/msamanda.py
+++ b/psm_utils/io/msamanda.py
@@ -129,7 +129,7 @@ def _parse_peptidoform(seq, modifications, charge):
                 peptide[0] = peptide[0] + f'[{match.group("mod_name")}]'
             elif match.group("term") == "C-Term":
                 peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]'
-            if match.group("loc") is not None: 
+            if match.group("loc") is not None:
                 peptide[int(match.group("loc"))] = (
                     peptide[int(match.group("loc"))] + f'[{match.group("mod_name")}]'
                 )
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index f89c3fb..3ef4d7a 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -7,8 +7,8 @@
 from psm_utils.psm import PSM
 from psm_utils.peptidoform import Peptidoform
 
-class TestIdXMLReader:
 
+class TestIdXMLReader:
     def test__parse_peptidoform(self):
         test_cases = [
             (("DFPIAMGER", 2), "DFPIAMGER/2"),
@@ -20,18 +20,17 @@ def test__parse_peptidoform(self):
             ((".DFPIAM[147]GER.", 2), "DFPIAM[147]GER/2"),
             ((".(Dimethyl)DFPIAMGER.", 2), "[Dimethyl]-DFPIAMGER/2"),
             ((".DFPIAMGER.(Label:18O(2))", 2), "DFPIAMGER-[Label:18O(2)]/2"),
-            ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2")
+            ((".DFPIAMGER(Phospho).", 2), "DFPIAMGER[Phospho]/2"),
         ]
 
         for test_in, expected_out in test_cases:
             assert IdXMLReader._parse_peptidoform(*test_in) == expected_out
 
-
     def test__parse_psm(self):
         test_psm = PSM(
-            peptidoform=Peptidoform('LVPIWKK/2'),
-            spectrum_id='controllerType=0 controllerNumber=1 scan=294',
-            run='HepG2_rep3_small',
+            peptidoform=Peptidoform("LVPIWKK/2"),
+            spectrum_id="controllerType=0 controllerNumber=1 scan=294",
+            run="HepG2_rep3_small",
             collection=None,
             spectrum=None,
             is_decoy=True,
@@ -41,46 +40,66 @@ def test__parse_psm(self):
             precursor_mz=442.29595853189994,
             retention_time=849.0,
             ion_mobility=None,
-            protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'],
+            protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"],
             rank=1,
-            source='idXML',
-            provenance_data={'LVPIWKK/2': 'LVPIWKK'},
+            source="idXML",
+            provenance_data={"LVPIWKK/2": "LVPIWKK"},
             metadata={
-                'idxml:score_type': 'expect',
-                'idxml:higher_score_better': 'False',
-                'idxml:significance_threshold': '0.0'
+                "idxml:score_type": "expect",
+                "idxml:higher_score_better": "False",
+                "idxml:significance_threshold": "0.0",
+            },
+            rescoring_features={
+                "MS:1002252": 0.693,
+                "COMET:deltaCn": 1.0,
+                "MS:1002255": 35.9,
+                "COMET:deltaLCn": 0.0,
+                "COMET:lnExpect": 0.009950330853168092,
+                "COMET:lnNumSP": 3.555348061489414,
+                "COMET:lnRankSP": 0.0,
+                "COMET:IonFrac": 0.25,
             },
-            rescoring_features={'MS:1002252': 0.693,
-                                'COMET:deltaCn': 1.0,
-                                'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25}
         )
 
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
-        psm = reader._parse_psm(reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0])
+        psm = reader._parse_psm(
+            reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]
+        )
         assert psm == test_psm
 
-
     def test__get_run(self):
         expected_output = "HepG2_rep3_small"
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
 
-
     def test__get_rescoring_features(self):
-        expected_output = ['MS:1002252', 'COMET:deltaCn', 'MS:1002255', 'COMET:deltaLCn', 'COMET:lnExpect', 'COMET:lnNumSP', 'COMET:lnRankSP', 'COMET:IonFrac']
+        expected_output = [
+            "MS:1002252",
+            "COMET:deltaCn",
+            "MS:1002255",
+            "COMET:deltaLCn",
+            "COMET:lnExpect",
+            "COMET:lnNumSP",
+            "COMET:lnRankSP",
+            "COMET:IonFrac",
+        ]
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
-        assert IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output
+        assert (
+            IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
+            == expected_output
+        )
 
 
 class TestIdXMLWriter:
-    
     def test_write_file_with_pyopenms_objects(self):
         expected_sha = "8d8cb6d8194c5c296f0f5ee8be83d2072be125547b2d51b88100859b001f47fa"
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         psm_list = reader.read_file()
-        writer = IdXMLWriter("./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids)
+        writer = IdXMLWriter(
+            "./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids
+        )
         writer.write_file(psm_list)
-        sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", 'rb').read()).hexdigest()
+        sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", "rb").read()).hexdigest()
         assert sha == expected_sha
 
     def test_write_file_without_pyopenms_objects(self):
@@ -89,5 +108,7 @@ def test_write_file_without_pyopenms_objects(self):
         psm_list = reader.read_file()
         writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
         writer.write_file(psm_list)
-        sha = hashlib.sha256(open("./tests/test_data/test_out_sage.idXML", 'rb').read()).hexdigest()
+        sha = hashlib.sha256(
+            open("./tests/test_data/test_out_sage.idXML", "rb").read()
+        ).hexdigest()
         assert sha == expected_sha
diff --git a/tests/test_io/test_maxquant.py b/tests/test_io/test_maxquant.py
index d0546c4..5384127 100644
--- a/tests/test_io/test_maxquant.py
+++ b/tests/test_io/test_maxquant.py
@@ -19,7 +19,6 @@
 
 class TestMSMSReader:
     def test_evaluate_columns(self):
-
         columns = TEST_COL.copy()
         # Test with the right column names
         maxquant.MSMSReader._evaluate_columns(columns)
@@ -73,8 +72,6 @@ def test_parse_peptidoform(self):
 
         msms_reader = maxquant.MSMSReader("./tests/test_data/test_msms.txt")
 
-        for test_in, expected_out in zip(
-            test_cases["input"], test_cases["expected_output"]
-        ):
+        for test_in, expected_out in zip(test_cases["input"], test_cases["expected_output"]):
             output = msms_reader._parse_peptidoform(*test_in)
             assert output.proforma == expected_out
diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py
index 82e94f1..97466ff 100644
--- a/tests/test_io/test_msamanda.py
+++ b/tests/test_io/test_msamanda.py
@@ -64,8 +64,6 @@ def test_parse_peptidoform(self):
             ],
         }
 
-        for test_in, expected_out in zip(
-            test_cases["input"], test_cases["expected_output"]
-        ):
+        for test_in, expected_out in zip(test_cases["input"], test_cases["expected_output"]):
             output = msamanda.MSAmandaReader._parse_peptidoform(*test_in).proforma
             assert output == expected_out
diff --git a/tests/test_io/test_mzid.py b/tests/test_io/test_mzid.py
index 6832107..3c60c99 100644
--- a/tests/test_io/test_mzid.py
+++ b/tests/test_io/test_mzid.py
@@ -9,7 +9,6 @@
 
 class TestMzIdentMlReader:
     def test_infer_source(self):
-
         msgf = mzid.MzidReader(MSGF_TEST_FILE)
         assert msgf._source == "MS-GF+"
 
@@ -17,7 +16,6 @@ def test_infer_source(self):
         assert peaks._source == "PEAKS Studio"
 
     def test_get_namespace(self):
-
         root = ET.parse(MSGF_TEST_FILE).getroot()
         root_tag = root.tag
         assert (
@@ -95,7 +93,7 @@ def test_parse_peptidoform(self):
                     }
                 ],
                 2,
-            )
+            ),
         }
 
         for expected_out, test_in in test_cases.items():
@@ -103,7 +101,6 @@ def test_parse_peptidoform(self):
             assert test_out == expected_out
 
     def test_parse_peptide_evidence_ref(self):
-
         test_cases = [
             [
                 {
@@ -175,10 +172,7 @@ def test_parse_peptide_evidence_ref(self):
             (False, ["#CONTAM#Q86YZ3", "Q86YZ3|HORN_HUMAN"]),
         ]
         for i, test_case in enumerate(test_cases):
-            assert (
-                mzid.MzidReader._parse_peptide_evidence_ref(test_case)
-                == expected_output[i]
-            )
+            assert mzid.MzidReader._parse_peptide_evidence_ref(test_case) == expected_output[i]
 
     # TODO
     def test_get_peptide_spectrum_match(self):
diff --git a/tests/test_io/test_percolator.py b/tests/test_io/test_percolator.py
index 09c3751..692f5f9 100644
--- a/tests/test_io/test_percolator.py
+++ b/tests/test_io/test_percolator.py
@@ -31,7 +31,4 @@ def test_parse_peptidoform(self):
             (("-.ACDEFGHR.-", None), "ACDEFGHR"),
         ]
         for test_in, expected_out in test_cases:
-            assert (
-                expected_out
-                == PercolatorTabReader._parse_peptidoform(*test_in).proforma
-            )
+            assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma
diff --git a/tests/test_io/test_xtandem.py b/tests/test_io/test_xtandem.py
index 32d9204..7592c2d 100644
--- a/tests/test_io/test_xtandem.py
+++ b/tests/test_io/test_xtandem.py
@@ -4,7 +4,6 @@
 
 
 class TestXTandemReader:
-
     reader = XTandemReader("path")
 
     def test__parse_peptidoform(self):
diff --git a/tests/test_peptidoform.py b/tests/test_peptidoform.py
index a9cb691..1e048e7 100644
--- a/tests/test_peptidoform.py
+++ b/tests/test_peptidoform.py
@@ -4,7 +4,6 @@
 
 
 class TestPeptidoform:
-
     def test__len__(self):
         test_cases = [
             ("ACDEFGHIK", 9),
@@ -27,7 +26,6 @@ def test__iter__(self):
                 for mod in mods:
                     assert isinstance(mod, proforma.TagBase)
 
-
     def test_rename_modifications(self):
         label_mapping = {"ac": "Acetyl", "cm": "Carbamidomethyl"}
 
diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py
index 412b4d3..2be2c8f 100644
--- a/tests/test_psm_list.py
+++ b/tests/test_psm_list.py
@@ -71,10 +71,34 @@ def test___set_item__(self):
     def test_set_ranks(self):
         psm_list = PSMList(
             psm_list=[
-                PSM(peptidoform=Peptidoform('ACDE/2'), spectrum_id=1, run=None, collection=None, score=3.3),
-                PSM(peptidoform=Peptidoform('ACDF/2'), spectrum_id=2, run=None, collection=None, score=12.0),
-                PSM(peptidoform=Peptidoform('ACDG/2'), spectrum_id=3, run=None, collection=None, score=12.2),
-                PSM(peptidoform=Peptidoform('ACDH/2'), spectrum_id=4, run=None, collection=None, score=2.5),
+                PSM(
+                    peptidoform=Peptidoform("ACDE/2"),
+                    spectrum_id=1,
+                    run=None,
+                    collection=None,
+                    score=3.3,
+                ),
+                PSM(
+                    peptidoform=Peptidoform("ACDF/2"),
+                    spectrum_id=2,
+                    run=None,
+                    collection=None,
+                    score=12.0,
+                ),
+                PSM(
+                    peptidoform=Peptidoform("ACDG/2"),
+                    spectrum_id=3,
+                    run=None,
+                    collection=None,
+                    score=12.2,
+                ),
+                PSM(
+                    peptidoform=Peptidoform("ACDH/2"),
+                    spectrum_id=4,
+                    run=None,
+                    collection=None,
+                    score=2.5,
+                ),
             ]
         )
 

From b26443a2c4fa0eea3b61f20c97588f67f0126886 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Fri, 20 Oct 2023 16:47:50 -0700
Subject: [PATCH 16/20] PR review; minor fixes; add to-do's

---
 docs/source/conf.py      |   1 +
 psm_utils/io/__init__.py |   6 +-
 psm_utils/io/idxml.py    | 151 +++++++++++++++++++++++----------------
 3 files changed, 92 insertions(+), 66 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 29762dd..ccaca8b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,6 +21,7 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.autosectionlabel",
     "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.napoleon",
     "sphinx_rtd_theme",
     "sphinx_mdinclude",
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 2fc69cd..ef86c10 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -9,6 +9,7 @@
 from rich.progress import track
 
 import psm_utils.io.idxml as idxml
+import psm_utils.io.ionbot as ionbot
 import psm_utils.io.maxquant as maxquant
 import psm_utils.io.msamanda as msamanda
 import psm_utils.io.mzid as mzid
@@ -18,7 +19,6 @@
 import psm_utils.io.sage as sage
 import psm_utils.io.tsv as tsv
 import psm_utils.io.xtandem as xtandem
-import psm_utils.io.ionbot as ionbot
 from psm_utils.io._base_classes import WriterBase
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.psm import PSM
@@ -27,7 +27,7 @@
 FILETYPES = {
     "idxml": {
         "reader": idxml.IdXMLReader,
-        "writer": None,
+        "writer": idxml.IdXMLWriter,
         "extension": ".idXML",
         "filename_pattern": r"^.*\.idxml$",
     },
@@ -120,7 +120,7 @@ def _supports_write_psm(writer: WriterBase):
             supports_write_psm = True
         else:
             supports_write_psm = True
-        Path(temp_file.name).unlink()
+            Path(temp_file.name).unlink()
         return supports_write_psm
 
 
diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 818971b..fed16ea 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -14,16 +14,16 @@
 from __future__ import annotations
 
 import logging
-from typing import Union, Iterable, Tuple, List
 import re
 from pathlib import Path
+from typing import Iterable, List, Tuple, Union
 
 import pyopenms as oms
 
+from psm_utils.exceptions import PSMUtilsException
 from psm_utils.io._base_classes import ReaderBase, WriterBase
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
-from psm_utils.exceptions import PSMUtilsException
 
 logger = logging.getLogger(__name__)
 
@@ -92,11 +92,8 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         >>> psm_list = [psm for psm in reader]
         """
         super().__init__(filename, *args, **kwargs)
-        self.filename = Path(filename)
-        protein_ids, peptide_ids = self._parse_idxml()
-        self.protein_ids = protein_ids
-        self.peptide_ids = peptide_ids
-        self.rescoring_features = self._get_rescoring_features(peptide_ids[0].getHits()[0])
+        self.protein_ids, self.peptide_ids = self._parse_idxml()
+        self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0])
 
     def __iter__(self) -> Iterable[PSM]:
         """
@@ -108,7 +105,7 @@ def __iter__(self) -> Iterable[PSM]:
 
     def _parse_idxml(self) -> Tuple(oms.ProteinIdentification, oms.PeptideIdentification):
         """
-        Parse idXML using pyopenms and do some sanity checks to make sure the file is not empty.
+        Parse idXML using pyopenms and perform sanity checks to make sure the file is not empty.
         """
         protein_ids, peptide_ids = [], []
         oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids)
@@ -133,13 +130,11 @@ def _parse_peptidoform(sequence: str, charge: int) -> str:
         """
         Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.
 
-
         Notes
         -----
         Implemented according to the documentation on
         `github.com/OpenMS/OpenMS <https://github.com/OpenMS/OpenMS/blob/8cb90/src/openms/include/OpenMS/CHEMISTRY/AASequence.h>`_
-        . The differentiation between square- and round bracket notation is removed
-        after parsing.
+        . The differentiation between square- and round bracket notation is removed after parsing.
         """
         sequence = MOD_PATTERN.sub(r"[\1]", sequence)
         if sequence[:2] == ".[":
@@ -159,8 +154,10 @@ def _parse_psm(
     ) -> PSM:
         """
         Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`.
-        Use additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification`
-        to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object.
+
+        Uses additional information from :py:class:`~pyopenms.ProteinIdentification` and
+        :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the
+        :py:class:`~psm_utils.psm.PSM` object.
         """
         peptidoform = self._parse_peptidoform(
             peptide_hit.getSequence().toString(), peptide_hit.getCharge()
@@ -173,14 +170,16 @@ def _parse_psm(
             score=peptide_hit.getScore(),
             precursor_mz=peptide_id.getMZ(),
             retention_time=peptide_id.getRT(),
-            # NOTE: In future update of OpenMS we will be able to read out ion_mobility from idXML file
+            # NOTE: ion mobility will be supported by OpenMS in the future
             protein_list=[
                 accession.decode() for accession in peptide_hit.extractProteinAccessionsSet()
             ],
             rank=peptide_hit.getRank() + 1,  # 0-based to 1-based
             source="idXML",
-            # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back to original sequence in writer
+            # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back
+            # to original sequence in writer
             provenance_data={str(peptidoform): peptide_hit.getSequence().toString()},
+            # TODO: Can we recover any all UserParams and save them as metadata?
             # This is needed to calculate a qvalue before rescoring the PSMList
             metadata={
                 "idxml:score_type": str(peptide_id.getScoreType()),
@@ -199,36 +198,39 @@ def _get_run(
         protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification
     ) -> str:
         """
-        Get run name from idXML using pyopenms
-        If idXML contains merge index, use it to annotate the run name without extension
+        Get run name from idXML using pyopenms.
+
+        If the idXML file contains a merge index, use it to annotate the run name without file
+        extension.
         """
         if peptide_id.metaValueExists("id_merge_index"):
-            return Path(
+            run = Path(
                 protein_ids[0]
                 .getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")]
                 .decode()
             ).stem
         else:
-            return Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
+            run = Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem
+
+        # Convert back to None value (see writer)
+        if run == "None":
+            run = None
+
+        return run
 
     @staticmethod
     def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
-        """
-        Get list of rescoring features from idXML
-        """
-        # Fill the key list with all the keys from the PeptideHit in the pyopenms way
+        """Get list of rescoring features from idXML."""
+        # Fill the key list with all the keys from the PeptideHit
+        # Empty list is required for the Cython wrapper to work correctly
         keys = []
         peptide_hit.getKeys(keys)
-        # Convert from byte to str and remove the keys, which values are not in the fixed list of features
         keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST]
-
         return keys
 
     @staticmethod
     def _is_float(element: any) -> bool:
-        """
-        Check if element is float.
-        """
+        """Check if element can be coerced to a float."""
         if element is None:
             return False
         try:
@@ -239,23 +241,17 @@ def _is_float(element: any) -> bool:
 
     @staticmethod
     def _is_decoy(peptide_hit: oms.PeptideHit) -> bool:
-        """
-        Check if PSM is target or decoy
-        """
+        """Check if PSM is target or decoy."""
         if peptide_hit.metaValueExists("target_decoy"):
             return peptide_hit.getMetaValue("target_decoy") == "decoy"
         else:
             return None
 
 
-class IdXMLReaderEmptyListException(PSMUtilsException):
-    """Exception in psm_utils.io.IdXMLReader"""
-
-
 class IdXMLWriter(WriterBase):
     def __init__(
         self,
-        filename: str | Path,
+        filename: Union[str, Path],
         protein_ids=None,
         peptide_ids=None,
         *args,
@@ -266,17 +262,19 @@ def __init__(
 
         Parameters
         ----------
-        filename: str, Pathlib.Path
+        filename
             Path to PSM file.
-        run : str, optional
-            Name of the MS run. Usually the spectrum file filename without
-            extension.
+        protein_ids
+            # TODO Add description
+        peptide_ids
+            # TODO Add description
 
         Notes
         -----
         - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
           a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
           to a file at once with the :py:meth:`write_file` method is currently supported.
+
         """
         super().__init__(filename, *args, **kwargs)
         self.protein_ids = protein_ids
@@ -298,15 +296,18 @@ def write_psm(self, psm: PSM):
         Raises
         ------
         NotImplementedError
-            OmsIdXMLWriter currently does not support write_psm.
+            IdXMLWriter currently does not support write_psm.
 
         """
         raise NotImplementedError("IdXMLWriter currently does not support write_psm.")
 
     def write_file(self, psm_list: PSMList) -> None:
         """
-        Write an entire PSMList to the PSM file or update the :py:class:`~pyopenms.ProteinIdentification`
-        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        Write the PSMList to the PSM file.
+
+        If `self.protein_ids` and `self.peptide_ids` are not None, the PSM list scores, ranks,
+        and rescoring features will first be merged with the existing IDs from those objects.
+
         """
         psm_dict = psm_list.get_psm_dict()
 
@@ -315,7 +316,8 @@ def write_file(self, psm_list: PSMList) -> None:
         # Check if one of self.protein_ids or self.peptide_ids is None
         elif self.protein_ids is not None or self.peptide_ids is not None:
             logger.warning(
-                "One of the protein_ids or peptide_ids is None. Falling back to creating new idXML files solely based on the PSMList."
+                "One of the protein_ids or peptide_ids is None. Falling back to creating new "
+                "idXML files solely based on the PSMList."
             )
             self._create_new_ids(psm_dict)
         else:
@@ -323,8 +325,11 @@ def write_file(self, psm_list: PSMList) -> None:
 
     def _update_existing_ids(self, psm_dict: dict) -> None:
         """
-        Update existing :py:class:`~pyopenms.ProteinIdentification`
-        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        Update an existing idXML file with info from the PSM list or write a new one.
+
+        Update existing :py:class:`~pyopenms.ProteinIdentification` and
+        :py:class:`~pyopenms.PeptideIdentification` objects with new features from the PSMList
+        or create new ones.
         """
         # Access run name(s) from ProteinIdentification
         spectrum_files = [
@@ -335,14 +340,17 @@ def _update_existing_ids(self, psm_dict: dict) -> None:
                 run = spectrum_files[peptide_id.getMetaValue("id_merge_index")]
             else:
                 run = spectrum_files[0]
+
             # Get PSM objects associated from runs since we are writing a merged idXML
             # NOTE: Collections with multiple protein_ids and peptide_ids is not supported
             try:
                 psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")]
-            except KeyError:
-                print(
-                    "Multiple collections are not supported when parsing single pyopenms protein and peptide objects."
-                )
+            except KeyError as e:
+                raise IdXMLException(
+                    "Multiple collections are not supported when parsing single pyopenms protein "
+                    "and peptide objects."
+                ) from e
+
             # Dict of UNIMOD peptide sequence and PSM object
             hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms}
             # Update PeptideHits according to the PSM objects
@@ -359,26 +367,29 @@ def _update_existing_ids(self, psm_dict: dict) -> None:
 
     def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
         """
-        Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features information
-        from :py:class:`~psm_utils.psm.PSM`.
+        Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features
+        information from :py:class:`~psm_utils.psm.PSM`.
         """
         peptide_hit.setScore(psm.score)
         peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
+        # TODO: Should q-value and pep also be updated?
         for feature, value in psm.rescoring_features.items():
             if feature not in RESCORING_FEATURE_LIST:
                 peptide_hit.setMetaValue(feature, value)
 
     def _create_new_ids(self, psm_dict: dict) -> None:
         """
-        Create new :py:class:`~pyopenms.ProteinIdentification`
-        and :py:class:`~pyopenms.PeptideIdentification` objects with novel features from the PSMList.
+        Create new ProteinIdentification and PeptideIdentification objects with new features from
+        the PSMList.
         """
         for collection, runs in psm_dict.items():
             self.protein_ids = oms.ProteinIdentification()
             self.peptide_ids = []
+
             # Set msrun filename with spectra_data meta value
-            msrun_reference = [run for run in runs.keys()]
+            msrun_reference = [str(run) for run in runs.keys()]
             self.protein_ids.setMetaValue("spectra_data", msrun_reference)
+
             protein_list = []
             for run, psm_dict_run in runs.items():
                 for spectrum_id, psms in psm_dict_run.items():
@@ -389,7 +400,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     # Fill PeptideIdentification object with PeptideHits
                     peptide_id = oms.PeptideIdentification()
                     peptide_id.setMetaValue("spectrum_reference", spectrum_id)
-                    peptide_id.setMetaValue("id_merge_index", msrun_reference.index(run))
+                    peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run)))
                     if psms[0].precursor_mz is not None:
                         peptide_id.setMZ(psms[0].precursor_mz)
                     if psms[0].retention_time is not None:
@@ -401,7 +412,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                         peptide_hit = oms.PeptideHit()
                         peptide_hit.setSequence(
                             oms.AASequence.fromString(
-                                self._convert_proforma_to_unimod(psm.peptidoform.sequence)
+                                self._convert_proforma_to_unimod(str(psm.peptidoform))
                             )
                         )
                         peptide_hit.setCharge(psm.peptidoform.precursor_charge)
@@ -419,6 +430,12 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                         self._add_meta_values_from_dict(peptide_hit, psm.provenance_data)
                         self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features)
 
+                        if psm.protein_list is not None:
+                            for protein in psm.protein_list:
+                                peptide_evidence = oms.PeptideEvidence()
+                                peptide_evidence.setProteinAccession(protein)
+                                peptide_hit.addPeptideEvidence(peptide_evidence)
+
                         peptide_hits.append(peptide_hit)
 
                     peptide_id.setHits(peptide_hits)
@@ -443,9 +460,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
             )
 
     def _convert_proforma_to_unimod(self, sequence: str) -> str:
-        """
-        Convert a peptidoform sequence in proforma notation to UNIMOD notation.
-        """
+        """Convert a peptidoform sequence in proforma notation to UNIMOD notation."""
         # Replace square brackets with parentheses
         sequence = sequence.replace("[", "(").replace("]", ")")
 
@@ -458,9 +473,19 @@ def _convert_proforma_to_unimod(self, sequence: str) -> str:
         return sequence
 
     def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None:
-        """
-        Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.
-        """
+        """Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary."""
         if d is not None:
             for key, value in d.items():
                 peptide_hit.setMetaValue(key, value)
+
+
+class IdXMLException(PSMUtilsException):
+    """Exception in psm_utils.io.IdXML"""
+
+    pass
+
+
+class IdXMLReaderEmptyListException(PSMUtilsException):
+    """Exception in psm_utils.io.IdXMLReader"""
+
+    pass

From ffd5138ce00b969a0a60bd2b72c915489ee27a60 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Sun, 22 Oct 2023 16:46:18 +0000
Subject: [PATCH 17/20] Incorporate feedback

---
 psm_utils/io/idxml.py               | 100 ++++++++++++++++++++--------
 tests/test_data/test_out_sage.idXML |   5 +-
 tests/test_io/test_idxml.py         |  41 ++----------
 3 files changed, 80 insertions(+), 66 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index fed16ea..00ae57d 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -24,6 +24,7 @@
 from psm_utils.io._base_classes import ReaderBase, WriterBase
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
+from psm_utils.peptidoform import Peptidoform
 
 logger = logging.getLogger(__name__)
 
@@ -93,6 +94,7 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
         """
         super().__init__(filename, *args, **kwargs)
         self.protein_ids, self.peptide_ids = self._parse_idxml()
+        self.user_params_metadata = self._get_userparams_metadata(self.peptide_ids[0].getHits()[0])
         self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0])
 
     def __iter__(self) -> Iterable[PSM]:
@@ -162,6 +164,12 @@ def _parse_psm(
         peptidoform = self._parse_peptidoform(
             peptide_hit.getSequence().toString(), peptide_hit.getCharge()
         )
+        # This is needed to calculate a qvalue before rescoring the PSMList
+        peptide_id_metadata = {
+                "idxml:score_type": str(peptide_id.getScoreType()),
+                "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
+                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())}
+        peptide_hit_metadata = {key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata}
         return PSM(
             peptidoform=peptidoform,
             spectrum_id=peptide_id.getMetaValue("spectrum_reference"),
@@ -179,17 +187,11 @@ def _parse_psm(
             # Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back
             # to original sequence in writer
             provenance_data={str(peptidoform): peptide_hit.getSequence().toString()},
-            # TODO: Can we recover any all UserParams and save them as metadata?
-            # This is needed to calculate a qvalue before rescoring the PSMList
-            metadata={
-                "idxml:score_type": str(peptide_id.getScoreType()),
-                "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
-                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()),
-            },
+            # Store metadata of PeptideIdentification and PeptideHit objects
+            metadata=peptide_id_metadata | peptide_hit_metadata,
             rescoring_features={
                 key: float(peptide_hit.getMetaValue(key))
                 for key in self.rescoring_features
-                if self._is_float(peptide_hit.getMetaValue(key))
             },
         )
 
@@ -218,14 +220,20 @@ def _get_run(
 
         return run
 
-    @staticmethod
-    def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
-        """Get list of rescoring features from idXML."""
+    def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]:
+        """Get list of string type UserParams attached to each PeptideHit."""
         # Fill the key list with all the keys from the PeptideHit
         # Empty list is required for the Cython wrapper to work correctly
         keys = []
         peptide_hit.getKeys(keys)
-        keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST]
+        keys = [key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode()))]
+        return keys
+    
+    def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
+        """Get list of rescoring features in UserParams attached to each PeptideHit."""
+        keys = []
+        peptide_hit.getKeys(keys)
+        keys = [key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) and key.decode() in RESCORING_FEATURE_LIST]
         return keys
 
     @staticmethod
@@ -265,15 +273,37 @@ def __init__(
         filename
             Path to PSM file.
         protein_ids
-            # TODO Add description
+            Optional :py:class:`~pyopenms.ProteinIdentification` object to be written to the idXML file.
         peptide_ids
-            # TODO Add description
+            Optional :py:class:`~pyopenms.PeptideIdentification` object to be written to the idXML file.
 
         Notes
         -----
         - Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
           a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
           to a file at once with the :py:meth:`write_file` method is currently supported.
+        - If `protein_ids` and `peptide_ids` are provided, each :py:class:`~pyopenms.PeptideIdentification`
+          object in the list `peptide_ids` will be updated with new `rescoring_features` from the PSMList.
+          Otherwise, new pyopenms objects will be created, filled with information of PSMList and written to the idXML file.
+        
+        Examples
+        --------
+        - Example with `pyopenms` objects:
+
+        >>> from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
+        >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML"")
+        >>> psm_list = reader.read_file()
+        >>> for psm in psm_list:
+        ...     psm.rescoring_features = psm.rescoring_features | {"feature": 1}
+        >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML", reader.protein_ids, reader.peptide_ids)
+        >>> writer.write_file(psm_list)
+
+        - Example without `pyopenms` objects:
+
+        >>> from psm_utils.psm_list import PSMList
+        >>> psm_list = PSMList(psm_list=[PSM(peptidoform="ACDK", spectrum_id=1, score=140.2, retention_time=600.2)])
+        >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML")
+        >>> writer.write_file(psm_list)
 
         """
         super().__init__(filename, *args, **kwargs)
@@ -370,9 +400,15 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
         Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features
         information from :py:class:`~psm_utils.psm.PSM`.
         """
-        peptide_hit.setScore(psm.score)
-        peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
-        # TODO: Should q-value and pep also be updated?
+        if psm.score is not None:
+            peptide_hit.setScore(psm.score)
+        if psm.rank is not None:
+            peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
+        if psm.qvalue is not None:
+            peptide_hit.setMetaValue("q-value", psm.qvalue)
+        if psm.pep is not None:
+            peptide_hit.setMetaValue("PEP", psm.pep)
+        
         for feature, value in psm.rescoring_features.items():
             if feature not in RESCORING_FEATURE_LIST:
                 peptide_hit.setMetaValue(feature, value)
@@ -401,6 +437,9 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     peptide_id = oms.PeptideIdentification()
                     peptide_id.setMetaValue("spectrum_reference", spectrum_id)
                     peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run)))
+                    # TODO: Question: Is there a way to infer the score type from the PSM?
+                    if psms[0].score is not None:
+                        peptide_id.setScoreType("search_engine_score")
                     if psms[0].precursor_mz is not None:
                         peptide_id.setMZ(psms[0].precursor_mz)
                     if psms[0].retention_time is not None:
@@ -412,7 +451,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                         peptide_hit = oms.PeptideHit()
                         peptide_hit.setSequence(
                             oms.AASequence.fromString(
-                                self._convert_proforma_to_unimod(str(psm.peptidoform))
+                                self._convert_proforma_to_unimod(psm.peptidoform)
                             )
                         )
                         peptide_hit.setCharge(psm.peptidoform.precursor_charge)
@@ -422,8 +461,10 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                             if psm.is_decoy is None
                             else ("decoy" if psm.is_decoy else "target"),
                         )
-                        if psm.score is not None:
-                            peptide_hit.setScore(psm.score)
+                        if psm.qvalue is not None:
+                            peptide_hit.setMetaValue("q-value", psm.qvalue)
+                        if psm.pep is not None:
+                            peptide_hit.setMetaValue("PEP", psm.pep)
                         if psm.rank is not None:
                             peptide_hit.setRank(psm.rank - 1)  # 1-based to 0-based
                         self._add_meta_values_from_dict(peptide_hit, psm.metadata)
@@ -459,16 +500,21 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                 self.peptide_ids,
             )
 
-    def _convert_proforma_to_unimod(self, sequence: str) -> str:
+    def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str:
         """Convert a peptidoform sequence in proforma notation to UNIMOD notation."""
-        # Replace square brackets with parentheses
-        sequence = sequence.replace("[", "(").replace("]", ")")
+        sequence = str(peptidoform).split('/')[0]
+
+        # Replace square brackets around modifications with parentheses
+        sequence = re.sub(r"\[([^\]]+)\]", r"(\1)", sequence)
 
         # Check for N-terminal and C-terminal modifications
-        if sequence[:2] == "((":
-            sequence = sequence.replace("((", ".[", 1).replace(")-", ".", 1)
-        if sequence[-2:] == "))":
-            sequence = sequence[::-1].replace("))", ".]", 1).replace("-(", ".", 1)[::-1]
+        if sequence.startswith("["):
+            sequence = re.sub(r"^\[([^\]]+)\]-", r"(\1)", sequence)
+        if sequence.endswith("]"):
+            sequence = re.sub(r"-\[([^\]]+)\]$", r"-(\1)", sequence)
+
+        # Remove dashes for N-terminal and C-terminal modifications
+        sequence = sequence.replace(")-", ")").replace("-(", "(")
 
         return sequence
 
diff --git a/tests/test_data/test_out_sage.idXML b/tests/test_data/test_out_sage.idXML
index 2d307cb..450ef56 100644
--- a/tests/test_data/test_out_sage.idXML
+++ b/tests/test_data/test_out_sage.idXML
@@ -9,10 +9,11 @@
 			</ProteinHit>
 			<UserParam type="stringList" name="spectra_data" value="[LQSRPAAPPAPGPGQLTLR]"/>
 		</ProteinIdentification>
-		<PeptideIdentification score_type="" higher_score_better="true" significance_threshold="0.0" MZ="643.03499169873669" RT="108.285399999999996" spectrum_reference="controllerType=0 controllerNumber=1 scan=30069" >
-			<PeptideHit score="1.2944585" sequence="LQSRPAAPPAPGPGQLTLR" charge="3" >
+		<PeptideIdentification score_type="search_engine_score" higher_score_better="true" significance_threshold="0.0" MZ="643.03499169873669" RT="108.285399999999996" spectrum_reference="controllerType=0 controllerNumber=1 scan=30069" >
+			<PeptideHit score="0.0" sequence="LQSRPAAPPAPGPGQLTLR" charge="3" protein_refs="PH_0" >
 				<UserParam type="string" name="target_decoy" value="target"/>
 				<UserParam type="float" name="isotope_error" value="0.0"/>
+				<UserParam type="float" name="q-value" value="1.0"/>
 				<UserParam type="string" name="sage_filename" value="./tests/test_data/results.sage.tsv"/>
 				<UserParam type="float" name="expmass" value="1926.081500000000005"/>
 				<UserParam type="float" name="calcmass" value="1926.079999999999927"/>
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index 3ef4d7a..ce45556 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -27,40 +27,7 @@ def test__parse_peptidoform(self):
             assert IdXMLReader._parse_peptidoform(*test_in) == expected_out
 
     def test__parse_psm(self):
-        test_psm = PSM(
-            peptidoform=Peptidoform("LVPIWKK/2"),
-            spectrum_id="controllerType=0 controllerNumber=1 scan=294",
-            run="HepG2_rep3_small",
-            collection=None,
-            spectrum=None,
-            is_decoy=True,
-            score=1.0099999904632568,
-            qvalue=None,
-            pep=None,
-            precursor_mz=442.29595853189994,
-            retention_time=849.0,
-            ion_mobility=None,
-            protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"],
-            rank=1,
-            source="idXML",
-            provenance_data={"LVPIWKK/2": "LVPIWKK"},
-            metadata={
-                "idxml:score_type": "expect",
-                "idxml:higher_score_better": "False",
-                "idxml:significance_threshold": "0.0",
-            },
-            rescoring_features={
-                "MS:1002252": 0.693,
-                "COMET:deltaCn": 1.0,
-                "MS:1002255": 35.9,
-                "COMET:deltaLCn": 0.0,
-                "COMET:lnExpect": 0.009950330853168092,
-                "COMET:lnNumSP": 3.555348061489414,
-                "COMET:lnRankSP": 0.0,
-                "COMET:IonFrac": 0.25,
-            },
-        )
-
+        test_psm = PSM(peptidoform=Peptidoform('LVPIWKK/2'), spectrum_id='controllerType=0 controllerNumber=1 scan=294', run='HepG2_rep3_small', collection=None, spectrum=None, is_decoy=True, score=1.0099999904632568, qvalue=None, pep=None, precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], rank=1, source='idXML', provenance_data={'LVPIWKK/2': 'LVPIWKK'}, metadata={'idxml:score_type': 'expect', 'idxml:higher_score_better': 'False', 'idxml:significance_threshold': '0.0', 'target_decoy': 'decoy', 'protein_references': 'unique'}, rescoring_features={'MS:1002252': 0.693, 'COMET:deltaCn': 1.0, 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25})
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         psm = reader._parse_psm(
             reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]
@@ -70,7 +37,7 @@ def test__parse_psm(self):
     def test__get_run(self):
         expected_output = "HepG2_rep3_small"
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
-        assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
+        assert reader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
 
     def test__get_rescoring_features(self):
         expected_output = [
@@ -85,7 +52,7 @@ def test__get_rescoring_features(self):
         ]
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         assert (
-            IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
+            reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
             == expected_output
         )
 
@@ -103,7 +70,7 @@ def test_write_file_with_pyopenms_objects(self):
         assert sha == expected_sha
 
     def test_write_file_without_pyopenms_objects(self):
-        expected_sha = "a24e157579a4e9da71900931c6b5578a264ea7830135ede480b41e6550944e5e"
+        expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b"
         reader = SageReader("./tests/test_data/results.sage.tsv")
         psm_list = reader.read_file()
         writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")

From 5bb19efb89ef80bcf0a92d6878e6941f9f4527f5 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Sun, 22 Oct 2023 16:46:46 +0000
Subject: [PATCH 18/20] black

---
 psm_utils/io/idxml.py       | 35 +++++++++++++++++++++------------
 tests/test_io/test_idxml.py | 39 ++++++++++++++++++++++++++++++++++---
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 00ae57d..dbf36e1 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -166,10 +166,13 @@ def _parse_psm(
         )
         # This is needed to calculate a qvalue before rescoring the PSMList
         peptide_id_metadata = {
-                "idxml:score_type": str(peptide_id.getScoreType()),
-                "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
-                "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())}
-        peptide_hit_metadata = {key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata}
+            "idxml:score_type": str(peptide_id.getScoreType()),
+            "idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
+            "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()),
+        }
+        peptide_hit_metadata = {
+            key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata
+        }
         return PSM(
             peptidoform=peptidoform,
             spectrum_id=peptide_id.getMetaValue("spectrum_reference"),
@@ -190,8 +193,7 @@ def _parse_psm(
             # Store metadata of PeptideIdentification and PeptideHit objects
             metadata=peptide_id_metadata | peptide_hit_metadata,
             rescoring_features={
-                key: float(peptide_hit.getMetaValue(key))
-                for key in self.rescoring_features
+                key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features
             },
         )
 
@@ -226,14 +228,23 @@ def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]:
         # Empty list is required for the Cython wrapper to work correctly
         keys = []
         peptide_hit.getKeys(keys)
-        keys = [key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode()))]
+        keys = [
+            key.decode()
+            for key in keys
+            if not self._is_float(peptide_hit.getMetaValue(key.decode()))
+        ]
         return keys
-    
+
     def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
         """Get list of rescoring features in UserParams attached to each PeptideHit."""
         keys = []
         peptide_hit.getKeys(keys)
-        keys = [key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) and key.decode() in RESCORING_FEATURE_LIST]
+        keys = [
+            key.decode()
+            for key in keys
+            if self._is_float(peptide_hit.getMetaValue(key.decode()))
+            and key.decode() in RESCORING_FEATURE_LIST
+        ]
         return keys
 
     @staticmethod
@@ -285,7 +296,7 @@ def __init__(
         - If `protein_ids` and `peptide_ids` are provided, each :py:class:`~pyopenms.PeptideIdentification`
           object in the list `peptide_ids` will be updated with new `rescoring_features` from the PSMList.
           Otherwise, new pyopenms objects will be created, filled with information of PSMList and written to the idXML file.
-        
+
         Examples
         --------
         - Example with `pyopenms` objects:
@@ -408,7 +419,7 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
             peptide_hit.setMetaValue("q-value", psm.qvalue)
         if psm.pep is not None:
             peptide_hit.setMetaValue("PEP", psm.pep)
-        
+
         for feature, value in psm.rescoring_features.items():
             if feature not in RESCORING_FEATURE_LIST:
                 peptide_hit.setMetaValue(feature, value)
@@ -502,7 +513,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
 
     def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str:
         """Convert a peptidoform sequence in proforma notation to UNIMOD notation."""
-        sequence = str(peptidoform).split('/')[0]
+        sequence = str(peptidoform).split("/")[0]
 
         # Replace square brackets around modifications with parentheses
         sequence = re.sub(r"\[([^\]]+)\]", r"(\1)", sequence)
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index ce45556..075b12f 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -27,7 +27,41 @@ def test__parse_peptidoform(self):
             assert IdXMLReader._parse_peptidoform(*test_in) == expected_out
 
     def test__parse_psm(self):
-        test_psm = PSM(peptidoform=Peptidoform('LVPIWKK/2'), spectrum_id='controllerType=0 controllerNumber=1 scan=294', run='HepG2_rep3_small', collection=None, spectrum=None, is_decoy=True, score=1.0099999904632568, qvalue=None, pep=None, precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], rank=1, source='idXML', provenance_data={'LVPIWKK/2': 'LVPIWKK'}, metadata={'idxml:score_type': 'expect', 'idxml:higher_score_better': 'False', 'idxml:significance_threshold': '0.0', 'target_decoy': 'decoy', 'protein_references': 'unique'}, rescoring_features={'MS:1002252': 0.693, 'COMET:deltaCn': 1.0, 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25})
+        test_psm = PSM(
+            peptidoform=Peptidoform("LVPIWKK/2"),
+            spectrum_id="controllerType=0 controllerNumber=1 scan=294",
+            run="HepG2_rep3_small",
+            collection=None,
+            spectrum=None,
+            is_decoy=True,
+            score=1.0099999904632568,
+            qvalue=None,
+            pep=None,
+            precursor_mz=442.29595853189994,
+            retention_time=849.0,
+            ion_mobility=None,
+            protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"],
+            rank=1,
+            source="idXML",
+            provenance_data={"LVPIWKK/2": "LVPIWKK"},
+            metadata={
+                "idxml:score_type": "expect",
+                "idxml:higher_score_better": "False",
+                "idxml:significance_threshold": "0.0",
+                "target_decoy": "decoy",
+                "protein_references": "unique",
+            },
+            rescoring_features={
+                "MS:1002252": 0.693,
+                "COMET:deltaCn": 1.0,
+                "MS:1002255": 35.9,
+                "COMET:deltaLCn": 0.0,
+                "COMET:lnExpect": 0.009950330853168092,
+                "COMET:lnNumSP": 3.555348061489414,
+                "COMET:lnRankSP": 0.0,
+                "COMET:IonFrac": 0.25,
+            },
+        )
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         psm = reader._parse_psm(
             reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]
@@ -52,8 +86,7 @@ def test__get_rescoring_features(self):
         ]
         reader = IdXMLReader("./tests/test_data/test_in.idXML")
         assert (
-            reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
-            == expected_output
+            reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0]) == expected_output
         )
 
 

From dc3b5d0220451691929ec17230090d01bba2528b Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 23 Oct 2023 21:11:02 +0000
Subject: [PATCH 19/20] incorporate feedback

---
 psm_utils/io/idxml.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index dbf36e1..109ff2c 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -191,7 +191,7 @@ def _parse_psm(
             # to original sequence in writer
             provenance_data={str(peptidoform): peptide_hit.getSequence().toString()},
             # Store metadata of PeptideIdentification and PeptideHit objects
-            metadata=peptide_id_metadata | peptide_hit_metadata,
+            metadata= {**peptide_id_metadata, **peptide_hit_metadata},
             rescoring_features={
                 key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features
             },
@@ -302,10 +302,10 @@ def __init__(
         - Example with `pyopenms` objects:
 
         >>> from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
-        >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML"")
+        >>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML")
         >>> psm_list = reader.read_file()
         >>> for psm in psm_list:
-        ...     psm.rescoring_features = psm.rescoring_features | {"feature": 1}
+        ...     psm.rescoring_features = {**psm.rescoring_features, **{"feature": 1}}
         >>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML", reader.protein_ids, reader.peptide_ids)
         >>> writer.write_file(psm_list)
 
@@ -448,7 +448,6 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     peptide_id = oms.PeptideIdentification()
                     peptide_id.setMetaValue("spectrum_reference", spectrum_id)
                     peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run)))
-                    # TODO: Question: Is there a way to infer the score type from the PSM?
                     if psms[0].score is not None:
                         peptide_id.setScoreType("search_engine_score")
                     if psms[0].precursor_mz is not None:

From bd3baf35fa5daa70f793faab0f626ef877110525 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 23 Oct 2023 22:19:09 +0000
Subject: [PATCH 20/20] fix byte error for p38

---
 psm_utils/io/idxml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py
index 109ff2c..84ecb26 100644
--- a/psm_utils/io/idxml.py
+++ b/psm_utils/io/idxml.py
@@ -434,7 +434,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
             self.peptide_ids = []
 
             # Set msrun filename with spectra_data meta value
-            msrun_reference = [str(run) for run in runs.keys()]
+            msrun_reference = [str(run).encode() for run in runs.keys()]
             self.protein_ids.setMetaValue("spectra_data", msrun_reference)
 
             protein_list = []
@@ -447,7 +447,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
                     # Fill PeptideIdentification object with PeptideHits
                     peptide_id = oms.PeptideIdentification()
                     peptide_id.setMetaValue("spectrum_reference", spectrum_id)
-                    peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run)))
+                    peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run).encode()))
                     if psms[0].score is not None:
                         peptide_id.setScoreType("search_engine_score")
                     if psms[0].precursor_mz is not None: